aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext3
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 18:20:36 -0400
committer Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 18:20:36 -0400
commit 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree 0bba044c4ce775e45a88a51686b5d9f90697ea9d /fs/ext3
Linux-2.6.12-rc2 v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'fs/ext3')
-rw-r--r-- fs/ext3/Makefile 12
-rw-r--r-- fs/ext3/acl.c 547
-rw-r--r-- fs/ext3/acl.h 84
-rw-r--r-- fs/ext3/balloc.c 1600
-rw-r--r-- fs/ext3/bitmap.c 26
-rw-r--r-- fs/ext3/dir.c 519
-rw-r--r-- fs/ext3/file.c 131
-rw-r--r-- fs/ext3/fsync.c 88
-rw-r--r-- fs/ext3/hash.c 152
-rw-r--r-- fs/ext3/ialloc.c 794
-rw-r--r-- fs/ext3/inode.c 3132
-rw-r--r-- fs/ext3/ioctl.c 243
-rw-r--r-- fs/ext3/namei.c 2378
-rw-r--r-- fs/ext3/resize.c 996
-rw-r--r-- fs/ext3/super.c 2539
-rw-r--r-- fs/ext3/symlink.c 54
-rw-r--r-- fs/ext3/xattr.c 1320
-rw-r--r-- fs/ext3/xattr.h 135
-rw-r--r-- fs/ext3/xattr_security.c 55
-rw-r--r-- fs/ext3/xattr_trusted.c 65
-rw-r--r-- fs/ext3/xattr_user.c 79
21 files changed, 14949 insertions, 0 deletions
diff --git a/fs/ext3/Makefile b/fs/ext3/Makefile
new file mode 100644
index 000000000000..704cd44a40c2
--- /dev/null
+++ b/fs/ext3/Makefile
@@ -0,0 +1,12 @@
1#
2# Makefile for the linux ext3-filesystem routines.
3#
4
# ext3.o is built whenever CONFIG_EXT3_FS is enabled.
5obj-$(CONFIG_EXT3_FS) += ext3.o
6
# Objects that are always part of ext3.o.
7ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o
9
# Optional objects, each added only when its config symbol is enabled.
10ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
11ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
12ext3-$(CONFIG_EXT3_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
new file mode 100644
index 000000000000..328592c3a956
--- /dev/null
+++ b/fs/ext3/acl.c
@@ -0,0 +1,547 @@
1/*
2 * linux/fs/ext3/acl.c
3 *
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 */
6
7#include <linux/init.h>
8#include <linux/sched.h>
9#include <linux/slab.h>
10#include <linux/fs.h>
11#include <linux/ext3_jbd.h>
12#include <linux/ext3_fs.h>
13#include "xattr.h"
14#include "acl.h"
15
16/*
17 * Convert from filesystem to in-memory representation.
18 */
19static struct posix_acl *
20ext3_acl_from_disk(const void *value, size_t size)
21{
22 const char *end = (char *)value + size;
23 int n, count;
24 struct posix_acl *acl;
25
26 if (!value)
27 return NULL;
28 if (size < sizeof(ext3_acl_header))
29 return ERR_PTR(-EINVAL);
30 if (((ext3_acl_header *)value)->a_version !=
31 cpu_to_le32(EXT3_ACL_VERSION))
32 return ERR_PTR(-EINVAL);
33 value = (char *)value + sizeof(ext3_acl_header);
34 count = ext3_acl_count(size);
35 if (count < 0)
36 return ERR_PTR(-EINVAL);
37 if (count == 0)
38 return NULL;
39 acl = posix_acl_alloc(count, GFP_KERNEL);
40 if (!acl)
41 return ERR_PTR(-ENOMEM);
42 for (n=0; n < count; n++) {
43 ext3_acl_entry *entry =
44 (ext3_acl_entry *)value;
45 if ((char *)value + sizeof(ext3_acl_entry_short) > end)
46 goto fail;
47 acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
48 acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
49 switch(acl->a_entries[n].e_tag) {
50 case ACL_USER_OBJ:
51 case ACL_GROUP_OBJ:
52 case ACL_MASK:
53 case ACL_OTHER:
54 value = (char *)value +
55 sizeof(ext3_acl_entry_short);
56 acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
57 break;
58
59 case ACL_USER:
60 case ACL_GROUP:
61 value = (char *)value + sizeof(ext3_acl_entry);
62 if ((char *)value > end)
63 goto fail;
64 acl->a_entries[n].e_id =
65 le32_to_cpu(entry->e_id);
66 break;
67
68 default:
69 goto fail;
70 }
71 }
72 if (value != end)
73 goto fail;
74 return acl;
75
76fail:
77 posix_acl_release(acl);
78 return ERR_PTR(-EINVAL);
79}
80
81/*
82 * Convert from in-memory to filesystem representation.
83 */
84static void *
85ext3_acl_to_disk(const struct posix_acl *acl, size_t *size)
86{
87 ext3_acl_header *ext_acl;
88 char *e;
89 size_t n;
90
91 *size = ext3_acl_size(acl->a_count);
92 ext_acl = (ext3_acl_header *)kmalloc(sizeof(ext3_acl_header) +
93 acl->a_count * sizeof(ext3_acl_entry), GFP_KERNEL);
94 if (!ext_acl)
95 return ERR_PTR(-ENOMEM);
96 ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION);
97 e = (char *)ext_acl + sizeof(ext3_acl_header);
98 for (n=0; n < acl->a_count; n++) {
99 ext3_acl_entry *entry = (ext3_acl_entry *)e;
100 entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
101 entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
102 switch(acl->a_entries[n].e_tag) {
103 case ACL_USER:
104 case ACL_GROUP:
105 entry->e_id =
106 cpu_to_le32(acl->a_entries[n].e_id);
107 e += sizeof(ext3_acl_entry);
108 break;
109
110 case ACL_USER_OBJ:
111 case ACL_GROUP_OBJ:
112 case ACL_MASK:
113 case ACL_OTHER:
114 e += sizeof(ext3_acl_entry_short);
115 break;
116
117 default:
118 goto fail;
119 }
120 }
121 return (char *)ext_acl;
122
123fail:
124 kfree(ext_acl);
125 return ERR_PTR(-EINVAL);
126}
127
128static inline struct posix_acl *
129ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl)
130{
131 struct posix_acl *acl = EXT3_ACL_NOT_CACHED;
132
133 spin_lock(&inode->i_lock);
134 if (*i_acl != EXT3_ACL_NOT_CACHED)
135 acl = posix_acl_dup(*i_acl);
136 spin_unlock(&inode->i_lock);
137
138 return acl;
139}
140
141static inline void
142ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl,
143 struct posix_acl *acl)
144{
145 spin_lock(&inode->i_lock);
146 if (*i_acl != EXT3_ACL_NOT_CACHED)
147 posix_acl_release(*i_acl);
148 *i_acl = posix_acl_dup(acl);
149 spin_unlock(&inode->i_lock);
150}
151
152/*
153 * Inode operation get_posix_acl().
154 *
155 * inode->i_sem: don't care
156 */
157static struct posix_acl *
158ext3_get_acl(struct inode *inode, int type)
159{
160 struct ext3_inode_info *ei = EXT3_I(inode);
161 int name_index;
162 char *value = NULL;
163 struct posix_acl *acl;
164 int retval;
165
166 if (!test_opt(inode->i_sb, POSIX_ACL))
167 return NULL;
168
169 switch(type) {
170 case ACL_TYPE_ACCESS:
171 acl = ext3_iget_acl(inode, &ei->i_acl);
172 if (acl != EXT3_ACL_NOT_CACHED)
173 return acl;
174 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
175 break;
176
177 case ACL_TYPE_DEFAULT:
178 acl = ext3_iget_acl(inode, &ei->i_default_acl);
179 if (acl != EXT3_ACL_NOT_CACHED)
180 return acl;
181 name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
182 break;
183
184 default:
185 return ERR_PTR(-EINVAL);
186 }
187 retval = ext3_xattr_get(inode, name_index, "", NULL, 0);
188 if (retval > 0) {
189 value = kmalloc(retval, GFP_KERNEL);
190 if (!value)
191 return ERR_PTR(-ENOMEM);
192 retval = ext3_xattr_get(inode, name_index, "", value, retval);
193 }
194 if (retval > 0)
195 acl = ext3_acl_from_disk(value, retval);
196 else if (retval == -ENODATA || retval == -ENOSYS)
197 acl = NULL;
198 else
199 acl = ERR_PTR(retval);
200 kfree(value);
201
202 if (!IS_ERR(acl)) {
203 switch(type) {
204 case ACL_TYPE_ACCESS:
205 ext3_iset_acl(inode, &ei->i_acl, acl);
206 break;
207
208 case ACL_TYPE_DEFAULT:
209 ext3_iset_acl(inode, &ei->i_default_acl, acl);
210 break;
211 }
212 }
213 return acl;
214}
215
216/*
217 * Set the access or default ACL of an inode.
218 *
219 * inode->i_sem: down unless called from ext3_new_inode
220 */
221static int
222ext3_set_acl(handle_t *handle, struct inode *inode, int type,
223 struct posix_acl *acl)
224{
225 struct ext3_inode_info *ei = EXT3_I(inode);
226 int name_index;
227 void *value = NULL;
228 size_t size;
229 int error;
230
231 if (S_ISLNK(inode->i_mode))
232 return -EOPNOTSUPP;
233
234 switch(type) {
235 case ACL_TYPE_ACCESS:
236 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
237 if (acl) {
238 mode_t mode = inode->i_mode;
239 error = posix_acl_equiv_mode(acl, &mode);
240 if (error < 0)
241 return error;
242 else {
243 inode->i_mode = mode;
244 ext3_mark_inode_dirty(handle, inode);
245 if (error == 0)
246 acl = NULL;
247 }
248 }
249 break;
250
251 case ACL_TYPE_DEFAULT:
252 name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
253 if (!S_ISDIR(inode->i_mode))
254 return acl ? -EACCES : 0;
255 break;
256
257 default:
258 return -EINVAL;
259 }
260 if (acl) {
261 value = ext3_acl_to_disk(acl, &size);
262 if (IS_ERR(value))
263 return (int)PTR_ERR(value);
264 }
265
266 error = ext3_xattr_set_handle(handle, inode, name_index, "",
267 value, size, 0);
268
269 kfree(value);
270 if (!error) {
271 switch(type) {
272 case ACL_TYPE_ACCESS:
273 ext3_iset_acl(inode, &ei->i_acl, acl);
274 break;
275
276 case ACL_TYPE_DEFAULT:
277 ext3_iset_acl(inode, &ei->i_default_acl, acl);
278 break;
279 }
280 }
281 return error;
282}
283
284static int
285ext3_check_acl(struct inode *inode, int mask)
286{
287 struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
288
289 if (acl) {
290 int error = posix_acl_permission(inode, acl, mask);
291 posix_acl_release(acl);
292 return error;
293 }
294
295 return -EAGAIN;
296}
297
298int
299ext3_permission(struct inode *inode, int mask, struct nameidata *nd)
300{
301 return generic_permission(inode, mask, ext3_check_acl);
302}
303
304/*
305 * Initialize the ACLs of a new inode. Called from ext3_new_inode.
306 *
307 * dir->i_sem: down
308 * inode->i_sem: up (access to inode is still exclusive)
309 */
310int
311ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
312{
313 struct posix_acl *acl = NULL;
314 int error = 0;
315
316 if (!S_ISLNK(inode->i_mode)) {
317 if (test_opt(dir->i_sb, POSIX_ACL)) {
318 acl = ext3_get_acl(dir, ACL_TYPE_DEFAULT);
319 if (IS_ERR(acl))
320 return PTR_ERR(acl);
321 }
322 if (!acl)
323 inode->i_mode &= ~current->fs->umask;
324 }
325 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
326 struct posix_acl *clone;
327 mode_t mode;
328
329 if (S_ISDIR(inode->i_mode)) {
330 error = ext3_set_acl(handle, inode,
331 ACL_TYPE_DEFAULT, acl);
332 if (error)
333 goto cleanup;
334 }
335 clone = posix_acl_clone(acl, GFP_KERNEL);
336 error = -ENOMEM;
337 if (!clone)
338 goto cleanup;
339
340 mode = inode->i_mode;
341 error = posix_acl_create_masq(clone, &mode);
342 if (error >= 0) {
343 inode->i_mode = mode;
344 if (error > 0) {
345 /* This is an extended ACL */
346 error = ext3_set_acl(handle, inode,
347 ACL_TYPE_ACCESS, clone);
348 }
349 }
350 posix_acl_release(clone);
351 }
352cleanup:
353 posix_acl_release(acl);
354 return error;
355}
356
357/*
358 * Does chmod for an inode that may have an Access Control List. The
359 * inode->i_mode field must be updated to the desired value by the caller
360 * before calling this function.
361 * Returns 0 on success, or a negative error number.
362 *
363 * We change the ACL rather than storing some ACL entries in the file
364 * mode permission bits (which would be more efficient), because that
365 * would break once additional permissions (like ACL_APPEND, ACL_DELETE
366 * for directories) are added. There are no more bits available in the
367 * file mode.
368 *
369 * inode->i_sem: down
370 */
371int
372ext3_acl_chmod(struct inode *inode)
373{
374 struct posix_acl *acl, *clone;
375 int error;
376
377 if (S_ISLNK(inode->i_mode))
378 return -EOPNOTSUPP;
379 if (!test_opt(inode->i_sb, POSIX_ACL))
380 return 0;
381 acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
382 if (IS_ERR(acl) || !acl)
383 return PTR_ERR(acl);
384 clone = posix_acl_clone(acl, GFP_KERNEL);
385 posix_acl_release(acl);
386 if (!clone)
387 return -ENOMEM;
388 error = posix_acl_chmod_masq(clone, inode->i_mode);
389 if (!error) {
390 handle_t *handle;
391 int retries = 0;
392
393 retry:
394 handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS);
395 if (IS_ERR(handle)) {
396 error = PTR_ERR(handle);
397 ext3_std_error(inode->i_sb, error);
398 goto out;
399 }
400 error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, clone);
401 ext3_journal_stop(handle);
402 if (error == -ENOSPC &&
403 ext3_should_retry_alloc(inode->i_sb, &retries))
404 goto retry;
405 }
406out:
407 posix_acl_release(clone);
408 return error;
409}
410
411/*
412 * Extended attribute handlers
413 */
414static size_t
415ext3_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
416 const char *name, size_t name_len)
417{
418 const size_t size = sizeof(XATTR_NAME_ACL_ACCESS);
419
420 if (!test_opt(inode->i_sb, POSIX_ACL))
421 return 0;
422 if (list && size <= list_len)
423 memcpy(list, XATTR_NAME_ACL_ACCESS, size);
424 return size;
425}
426
427static size_t
428ext3_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
429 const char *name, size_t name_len)
430{
431 const size_t size = sizeof(XATTR_NAME_ACL_DEFAULT);
432
433 if (!test_opt(inode->i_sb, POSIX_ACL))
434 return 0;
435 if (list && size <= list_len)
436 memcpy(list, XATTR_NAME_ACL_DEFAULT, size);
437 return size;
438}
439
440static int
441ext3_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
442{
443 struct posix_acl *acl;
444 int error;
445
446 if (!test_opt(inode->i_sb, POSIX_ACL))
447 return -EOPNOTSUPP;
448
449 acl = ext3_get_acl(inode, type);
450 if (IS_ERR(acl))
451 return PTR_ERR(acl);
452 if (acl == NULL)
453 return -ENODATA;
454 error = posix_acl_to_xattr(acl, buffer, size);
455 posix_acl_release(acl);
456
457 return error;
458}
459
460static int
461ext3_xattr_get_acl_access(struct inode *inode, const char *name,
462 void *buffer, size_t size)
463{
464 if (strcmp(name, "") != 0)
465 return -EINVAL;
466 return ext3_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
467}
468
469static int
470ext3_xattr_get_acl_default(struct inode *inode, const char *name,
471 void *buffer, size_t size)
472{
473 if (strcmp(name, "") != 0)
474 return -EINVAL;
475 return ext3_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
476}
477
478static int
479ext3_xattr_set_acl(struct inode *inode, int type, const void *value,
480 size_t size)
481{
482 handle_t *handle;
483 struct posix_acl *acl;
484 int error, retries = 0;
485
486 if (!test_opt(inode->i_sb, POSIX_ACL))
487 return -EOPNOTSUPP;
488 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
489 return -EPERM;
490
491 if (value) {
492 acl = posix_acl_from_xattr(value, size);
493 if (IS_ERR(acl))
494 return PTR_ERR(acl);
495 else if (acl) {
496 error = posix_acl_valid(acl);
497 if (error)
498 goto release_and_out;
499 }
500 } else
501 acl = NULL;
502
503retry:
504 handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS);
505 if (IS_ERR(handle))
506 return PTR_ERR(handle);
507 error = ext3_set_acl(handle, inode, type, acl);
508 ext3_journal_stop(handle);
509 if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
510 goto retry;
511
512release_and_out:
513 posix_acl_release(acl);
514 return error;
515}
516
517static int
518ext3_xattr_set_acl_access(struct inode *inode, const char *name,
519 const void *value, size_t size, int flags)
520{
521 if (strcmp(name, "") != 0)
522 return -EINVAL;
523 return ext3_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
524}
525
526static int
527ext3_xattr_set_acl_default(struct inode *inode, const char *name,
528 const void *value, size_t size, int flags)
529{
530 if (strcmp(name, "") != 0)
531 return -EINVAL;
532 return ext3_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
533}
534
/* xattr handler table entry wiring the access-ACL list/get/set callbacks. */
535struct xattr_handler ext3_xattr_acl_access_handler = {
536 .prefix = XATTR_NAME_ACL_ACCESS,
537 .list = ext3_xattr_list_acl_access,
538 .get = ext3_xattr_get_acl_access,
539 .set = ext3_xattr_set_acl_access,
540};
541
/* xattr handler table entry wiring the default-ACL list/get/set callbacks. */
542struct xattr_handler ext3_xattr_acl_default_handler = {
543 .prefix = XATTR_NAME_ACL_DEFAULT,
544 .list = ext3_xattr_list_acl_default,
545 .get = ext3_xattr_get_acl_default,
546 .set = ext3_xattr_set_acl_default,
547};
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
new file mode 100644
index 000000000000..98af0c0d0ba9
--- /dev/null
+++ b/fs/ext3/acl.h
@@ -0,0 +1,84 @@
1/*
2 File: fs/ext3/acl.h
3
4 (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
5*/
6
7#include <linux/xattr_acl.h>
8
/* On-disk ACL format version, stored in ext3_acl_header.a_version. */
9#define EXT3_ACL_VERSION 0x0001
10
/* Full-size on-disk ACL entry: tag, permissions and an id (little-endian). */
11typedef struct {
12 __le16 e_tag;
13 __le16 e_perm;
14 __le32 e_id;
15} ext3_acl_entry;
16
/* Short on-disk ACL entry: same leading fields as ext3_acl_entry, no e_id. */
17typedef struct {
18 __le16 e_tag;
19 __le16 e_perm;
20} ext3_acl_entry_short;
21
/* Header found at the start of the on-disk ACL value. */
22typedef struct {
23 __le32 a_version;
24} ext3_acl_header;
25
26static inline size_t ext3_acl_size(int count)
27{
28 if (count <= 4) {
29 return sizeof(ext3_acl_header) +
30 count * sizeof(ext3_acl_entry_short);
31 } else {
32 return sizeof(ext3_acl_header) +
33 4 * sizeof(ext3_acl_entry_short) +
34 (count - 4) * sizeof(ext3_acl_entry);
35 }
36}
37
38static inline int ext3_acl_count(size_t size)
39{
40 ssize_t s;
41 size -= sizeof(ext3_acl_header);
42 s = size - 4 * sizeof(ext3_acl_entry_short);
43 if (s < 0) {
44 if (size % sizeof(ext3_acl_entry_short))
45 return -1;
46 return size / sizeof(ext3_acl_entry_short);
47 } else {
48 if (s % sizeof(ext3_acl_entry))
49 return -1;
50 return s / sizeof(ext3_acl_entry) + 4;
51 }
52}
53
54#ifdef CONFIG_EXT3_FS_POSIX_ACL
55
56/* Value for inode->u.ext3_i.i_acl and inode->u.ext3_i.i_default_acl
57 if the ACL has not been cached */
58#define EXT3_ACL_NOT_CACHED ((void *)-1)
59
60/* acl.c */
61extern int ext3_permission (struct inode *, int, struct nameidata *);
62extern int ext3_acl_chmod (struct inode *);
63extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
64
/*
 * NOTE(review): acl.c as added by this same diff contains no definition
 * of the two functions below -- confirm they are defined elsewhere.
 */
65extern int init_ext3_acl(void);
66extern void exit_ext3_acl(void);
67
68#else /* CONFIG_EXT3_FS_POSIX_ACL */
/*
 * ACL support compiled out: ext3_permission becomes NULL and the
 * chmod/init helpers are stubs that return 0 without doing anything.
 */
69#include <linux/sched.h>
70#define ext3_permission NULL
71
72static inline int
73ext3_acl_chmod(struct inode *inode)
74{
75 return 0;
76}
77
78static inline int
79ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
80{
81 return 0;
82}
83#endif /* CONFIG_EXT3_FS_POSIX_ACL */
84
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
new file mode 100644
index 000000000000..ccd632fcc6d8
--- /dev/null
+++ b/fs/ext3/balloc.c
@@ -0,0 +1,1600 @@
1/*
2 * linux/fs/ext3/balloc.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
10 * Big-endian to little-endian byte-swapping/bitmaps by
11 * David S. Miller (davem@caip.rutgers.edu), 1995
12 */
13
14#include <linux/config.h>
15#include <linux/time.h>
16#include <linux/fs.h>
17#include <linux/jbd.h>
18#include <linux/ext3_fs.h>
19#include <linux/ext3_jbd.h>
20#include <linux/quotaops.h>
21#include <linux/buffer_head.h>
22
23/*
24 * balloc.c contains the blocks allocation and deallocation routines
25 */
26
27/*
28 * The free blocks are managed by bitmaps. A file system contains several
29 * block groups. Each group contains 1 bitmap block for blocks, 1 bitmap
30 * block for inodes, N blocks for the inode table and data blocks.
31 *
32 * The file system contains group descriptors which are located after the
33 * super block. Each descriptor contains the number of the bitmap block and
34 * the free blocks count in the group. The descriptors are loaded in memory
35 * when a file system is mounted (see ext3_read_super).
36 */
37
38
/*
 * True when b lies in [first, first + len - 1].  b and first are each
 * evaluated twice, so the arguments must be free of side effects.
 */
39#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
40
41struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
42 unsigned int block_group,
43 struct buffer_head ** bh)
44{
45 unsigned long group_desc;
46 unsigned long offset;
47 struct ext3_group_desc * desc;
48 struct ext3_sb_info *sbi = EXT3_SB(sb);
49
50 if (block_group >= sbi->s_groups_count) {
51 ext3_error (sb, "ext3_get_group_desc",
52 "block_group >= groups_count - "
53 "block_group = %d, groups_count = %lu",
54 block_group, sbi->s_groups_count);
55
56 return NULL;
57 }
58 smp_rmb();
59
60 group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
61 offset = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
62 if (!sbi->s_group_desc[group_desc]) {
63 ext3_error (sb, "ext3_get_group_desc",
64 "Group descriptor not loaded - "
65 "block_group = %d, group_desc = %lu, desc = %lu",
66 block_group, group_desc, offset);
67 return NULL;
68 }
69
70 desc = (struct ext3_group_desc *) sbi->s_group_desc[group_desc]->b_data;
71 if (bh)
72 *bh = sbi->s_group_desc[group_desc];
73 return desc + offset;
74}
75
76/*
77 * Read the bitmap for a given block_group, reading into the specified
78 * slot in the superblock's bitmap cache.
79 *
80 * Return buffer_head on success or NULL in case of failure.
81 */
82static struct buffer_head *
83read_block_bitmap(struct super_block *sb, unsigned int block_group)
84{
85 struct ext3_group_desc * desc;
86 struct buffer_head * bh = NULL;
87
88 desc = ext3_get_group_desc (sb, block_group, NULL);
89 if (!desc)
90 goto error_out;
91 bh = sb_bread(sb, le32_to_cpu(desc->bg_block_bitmap));
92 if (!bh)
93 ext3_error (sb, "read_block_bitmap",
94 "Cannot read block bitmap - "
95 "block_group = %d, block_bitmap = %u",
96 block_group, le32_to_cpu(desc->bg_block_bitmap));
97error_out:
98 return bh;
99}
100/*
101 * The reservation window structure operations
102 * --------------------------------------------
103 * Operations include:
104 * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
105 *
106 * We use a sorted doubly linked list for the per-filesystem reservation
107 * window list (like in vm_region).
 * NOTE: the helpers below operate on a red-black tree (struct rb_root /
 * struct rb_node), not on a list.
108 *
109 * Initially, we keep those small operations in the abstract functions,
110 * so later if we need a better searching tree than a doubly linked list,
111 * we could easily switch to that without changing too much
112 * code.
113 */
114#if 0
/*
 * Debug helper, compiled out by the surrounding #if 0.  Walks the
 * reservation tree from rb_first() via rb_next(), printing each window
 * when verbose is set.  A window is flagged as bad when its nonzero
 * start is >= its end, or when the previous window's end is >= its
 * start.  The first bad window seen in non-verbose mode restarts the
 * walk in verbose mode; BUG() fires after the walk if any bad window
 * was seen in the final pass.
 */
115static void __rsv_window_dump(struct rb_root *root, int verbose,
116 const char *fn)
117{
118 struct rb_node *n;
119 struct ext3_reserve_window_node *rsv, *prev;
120 int bad;
121
122restart:
123 n = rb_first(root);
124 bad = 0;
125 prev = NULL;
126
127 printk("Block Allocation Reservation Windows Map (%s):\n", fn);
128 while (n) {
129 rsv = list_entry(n, struct ext3_reserve_window_node, rsv_node);
130 if (verbose)
131 printk("reservation window 0x%p "
132 "start: %d, end: %d\n",
133 rsv, rsv->rsv_start, rsv->rsv_end);
134 if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
135 printk("Bad reservation %p (start >= end)\n",
136 rsv);
137 bad = 1;
138 }
139 if (prev && prev->rsv_end >= rsv->rsv_start) {
140 printk("Bad reservation %p (prev->end >= start)\n",
141 rsv);
142 bad = 1;
143 }
144 if (bad) {
145 if (!verbose) {
146 printk("Restarting reservation walk in verbose mode\n");
147 verbose = 1;
148 goto restart;
149 }
150 }
151 n = rb_next(n);
152 prev = rsv;
153 }
154 printk("Window map complete.\n");
155 if (bad)
156 BUG();
157}
/* Convenience wrapper that passes the calling function's name. */
158#define rsv_window_dump(root, verbose) \
159 __rsv_window_dump((root), (verbose), __FUNCTION__)
160#else
/* Dumping disabled: the macro expands to an empty statement. */
161#define rsv_window_dump(root, verbose) do {} while (0)
162#endif
163
164static int
165goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal,
166 unsigned int group, struct super_block * sb)
167{
168 unsigned long group_first_block, group_last_block;
169
170 group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
171 group * EXT3_BLOCKS_PER_GROUP(sb);
172 group_last_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
173
174 if ((rsv->_rsv_start > group_last_block) ||
175 (rsv->_rsv_end < group_first_block))
176 return 0;
177 if ((goal >= 0) && ((goal + group_first_block < rsv->_rsv_start)
178 || (goal + group_first_block > rsv->_rsv_end)))
179 return 0;
180 return 1;
181}
182
183/*
184 * Find the reserved window which includes the goal, or the previous one
185 * if the goal is not in any window.
186 * Returns NULL if there are no windows or if all windows start after the goal.
187 */
188static struct ext3_reserve_window_node *
189search_reserve_window(struct rb_root *root, unsigned long goal)
190{
191 struct rb_node *n = root->rb_node;
192 struct ext3_reserve_window_node *rsv;
193
194 if (!n)
195 return NULL;
196
197 do {
198 rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
199
200 if (goal < rsv->rsv_start)
201 n = n->rb_left;
202 else if (goal > rsv->rsv_end)
203 n = n->rb_right;
204 else
205 return rsv;
206 } while (n);
207 /*
208 * We've fallen off the end of the tree: the goal wasn't inside
209 * any particular node. OK, the previous node must be to one
210 * side of the interval containing the goal. If it's the RHS,
211 * we need to back up one.
212 */
213 if (rsv->rsv_start > goal) {
214 n = rb_prev(&rsv->rsv_node);
215 rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
216 }
217 return rsv;
218}
219
220void ext3_rsv_window_add(struct super_block *sb,
221 struct ext3_reserve_window_node *rsv)
222{
223 struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
224 struct rb_node *node = &rsv->rsv_node;
225 unsigned int start = rsv->rsv_start;
226
227 struct rb_node ** p = &root->rb_node;
228 struct rb_node * parent = NULL;
229 struct ext3_reserve_window_node *this;
230
231 while (*p)
232 {
233 parent = *p;
234 this = rb_entry(parent, struct ext3_reserve_window_node, rsv_node);
235
236 if (start < this->rsv_start)
237 p = &(*p)->rb_left;
238 else if (start > this->rsv_end)
239 p = &(*p)->rb_right;
240 else
241 BUG();
242 }
243
244 rb_link_node(node, parent, p);
245 rb_insert_color(node, root);
246}
247
248static void rsv_window_remove(struct super_block *sb,
249 struct ext3_reserve_window_node *rsv)
250{
251 rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
252 rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
253 rsv->rsv_alloc_hit = 0;
254 rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root);
255}
256
257static inline int rsv_is_empty(struct ext3_reserve_window *rsv)
258{
259 /* a valid reservation end block could not be 0 */
260 return (rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED);
261}
262void ext3_init_block_alloc_info(struct inode *inode)
263{
264 struct ext3_inode_info *ei = EXT3_I(inode);
265 struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info;
266 struct super_block *sb = inode->i_sb;
267
268 block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
269 if (block_i) {
270 struct ext3_reserve_window_node *rsv = &block_i->rsv_window_node;
271
272 rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
273 rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
274
275 /*
276 * if filesystem is mounted with NORESERVATION, the goal
277 * reservation window size is set to zero to indicate
278 * block reservation is off
279 */
280 if (!test_opt(sb, RESERVATION))
281 rsv->rsv_goal_size = 0;
282 else
283 rsv->rsv_goal_size = EXT3_DEFAULT_RESERVE_BLOCKS;
284 rsv->rsv_alloc_hit = 0;
285 block_i->last_alloc_logical_block = 0;
286 block_i->last_alloc_physical_block = 0;
287 }
288 ei->i_block_alloc_info = block_i;
289}
290
291void ext3_discard_reservation(struct inode *inode)
292{
293 struct ext3_inode_info *ei = EXT3_I(inode);
294 struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info;
295 struct ext3_reserve_window_node *rsv;
296 spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock;
297
298 if (!block_i)
299 return;
300
301 rsv = &block_i->rsv_window_node;
302 if (!rsv_is_empty(&rsv->rsv_window)) {
303 spin_lock(rsv_lock);
304 if (!rsv_is_empty(&rsv->rsv_window))
305 rsv_window_remove(inode->i_sb, rsv);
306 spin_unlock(rsv_lock);
307 }
308}
309
310/*
 * Free `count` blocks starting at `block`, one block group at a time
 * (looping through do_more when the range crosses a group boundary).
 * For each block the bit is first set in the bitmap buffer's
 * b_committed_data and then cleared in the live bitmap; the group
 * descriptor's free count is raised by the number of bits that were
 * actually set beforehand (group_freed).
 *
 * *pdquot_freed_blocks is zeroed on entry and accumulates group_freed;
 * this function makes no quota call itself -- see ext3_free_blocks().
 * Nothing is returned: failures are reported through ext3_error() and
 * ext3_std_error().
 */
311void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
312 unsigned long block, unsigned long count,
313 int *pdquot_freed_blocks)
314{
315 struct buffer_head *bitmap_bh = NULL;
316 struct buffer_head *gd_bh;
317 unsigned long block_group;
318 unsigned long bit;
319 unsigned long i;
320 unsigned long overflow;
321 struct ext3_group_desc * desc;
322 struct ext3_super_block * es;
323 struct ext3_sb_info *sbi;
324 int err = 0, ret;
325 unsigned group_freed;
326
327 *pdquot_freed_blocks = 0;
328 sbi = EXT3_SB(sb);
329 es = sbi->s_es;
 /* Reject ranges below the first data block, wrapping, or past the end. */
330 if (block < le32_to_cpu(es->s_first_data_block) ||
331 block + count < block ||
332 block + count > le32_to_cpu(es->s_blocks_count)) {
333 ext3_error (sb, "ext3_free_blocks",
334 "Freeing blocks not in datazone - "
335 "block = %lu, count = %lu", block, count);
336 goto error_return;
337 }
338
339 ext3_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1);
340
341do_more:
342 overflow = 0;
343 block_group = (block - le32_to_cpu(es->s_first_data_block)) /
344 EXT3_BLOCKS_PER_GROUP(sb);
345 bit = (block - le32_to_cpu(es->s_first_data_block)) %
346 EXT3_BLOCKS_PER_GROUP(sb);
347 /*
348 * Check to see if we are freeing blocks across a group
349 * boundary.
350 */
351 if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
352 overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
353 count -= overflow;
354 }
 /* Drops the previous group's bitmap on a do_more pass (NULL first time). */
355 brelse(bitmap_bh);
356 bitmap_bh = read_block_bitmap(sb, block_group);
357 if (!bitmap_bh)
358 goto error_return;
359 desc = ext3_get_group_desc (sb, block_group, &gd_bh);
360 if (!desc)
361 goto error_return;
362
 /*
  * NOTE(review): a hit on the system-zone check below is only
  * reported; execution continues and the blocks are still freed.
  */
363 if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
364 in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
365 in_range (block, le32_to_cpu(desc->bg_inode_table),
366 sbi->s_itb_per_group) ||
367 in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
368 sbi->s_itb_per_group))
369 ext3_error (sb, "ext3_free_blocks",
370 "Freeing blocks in system zones - "
371 "Block = %lu, count = %lu",
372 block, count);
373
374 /*
375 * We are about to start releasing blocks in the bitmap,
376 * so we need undo access.
377 */
378 /* @@@ check errors */
379 BUFFER_TRACE(bitmap_bh, "getting undo access");
380 err = ext3_journal_get_undo_access(handle, bitmap_bh);
381 if (err)
382 goto error_return;
383
384 /*
385 * We are about to modify some metadata. Call the journal APIs
386 * to unshare ->b_data if a currently-committing transaction is
387 * using it
388 */
389 BUFFER_TRACE(gd_bh, "get_write_access");
390 err = ext3_journal_get_write_access(handle, gd_bh);
391 if (err)
392 goto error_return;
393
394 jbd_lock_bh_state(bitmap_bh);
395
396 for (i = 0, group_freed = 0; i < count; i++) {
397 /*
398 * An HJ special. This is expensive...
399 */
400#ifdef CONFIG_JBD_DEBUG
401 jbd_unlock_bh_state(bitmap_bh);
402 {
403 struct buffer_head *debug_bh;
404 debug_bh = sb_find_get_block(sb, block + i);
405 if (debug_bh) {
406 BUFFER_TRACE(debug_bh, "Deleted!");
407 if (!bh2jh(bitmap_bh)->b_committed_data)
408 BUFFER_TRACE(debug_bh,
409 "No commited data in bitmap");
410 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
411 __brelse(debug_bh);
412 }
413 }
414 jbd_lock_bh_state(bitmap_bh);
415#endif
 /* The bh state lock is dropped around the reschedule point. */
416 if (need_resched()) {
417 jbd_unlock_bh_state(bitmap_bh);
418 cond_resched();
419 jbd_lock_bh_state(bitmap_bh);
420 }
421 /* @@@ This prevents newly-allocated data from being
422 * freed and then reallocated within the same
423 * transaction.
424 *
425 * Ideally we would want to allow that to happen, but to
426 * do so requires making journal_forget() capable of
427 * revoking the queued write of a data block, which
428 * implies blocking on the journal lock. *forget()
429 * cannot block due to truncate races.
430 *
431 * Eventually we can fix this by making journal_forget()
432 * return a status indicating whether or not it was able
433 * to revoke the buffer. On successful revoke, it is
434 * safe not to set the allocation bit in the committed
435 * bitmap, because we know that there is no outstanding
436 * activity on the buffer any more and so it is safe to
437 * reallocate it.
438 */
439 BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
440 J_ASSERT_BH(bitmap_bh,
441 bh2jh(bitmap_bh)->b_committed_data != NULL);
442 ext3_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
443 bh2jh(bitmap_bh)->b_committed_data);
444
445 /*
446 * We clear the bit in the bitmap after setting the committed
447 * data bit, because this is the reverse order to that which
448 * the allocator uses.
449 */
450 BUFFER_TRACE(bitmap_bh, "clear bit");
451 if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
452 bit + i, bitmap_bh->b_data)) {
 /* The bh state lock is released while the error is reported. */
453 jbd_unlock_bh_state(bitmap_bh);
454 ext3_error(sb, __FUNCTION__,
455 "bit already cleared for block %lu", block + i);
456 jbd_lock_bh_state(bitmap_bh);
457 BUFFER_TRACE(bitmap_bh, "bit already cleared");
458 } else {
459 group_freed++;
460 }
461 }
462 jbd_unlock_bh_state(bitmap_bh);
463
464 spin_lock(sb_bgl_lock(sbi, block_group));
465 desc->bg_free_blocks_count =
466 cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
467 group_freed);
468 spin_unlock(sb_bgl_lock(sbi, block_group));
 /*
  * NOTE(review): the filesystem-wide counter is adjusted by count,
  * whereas the descriptor above uses group_freed; the two differ
  * whenever a bit was found already cleared.
  */
469 percpu_counter_mod(&sbi->s_freeblocks_counter, count);
470
471 /* We dirtied the bitmap block */
472 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
473 err = ext3_journal_dirty_metadata(handle, bitmap_bh);
474
475 /* And the group descriptor block */
476 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
477 ret = ext3_journal_dirty_metadata(handle, gd_bh);
 /* Keep the first error of the two dirty_metadata calls. */
478 if (!err) err = ret;
479 *pdquot_freed_blocks += group_freed;
480
 /* Continue with the part of the range that spilled into the next group. */
481 if (overflow && !err) {
482 block += count;
483 count = overflow;
484 goto do_more;
485 }
486 sb->s_dirt = 1;
487error_return:
488 brelse(bitmap_bh);
489 ext3_std_error(sb, err);
490 return;
491}
492
493/* Free given blocks, update quota and i_blocks field */
494void ext3_free_blocks(handle_t *handle, struct inode *inode,
495 unsigned long block, unsigned long count)
496{
497 struct super_block * sb;
498 int dquot_freed_blocks;
499
500 sb = inode->i_sb;
501 if (!sb) {
502 printk ("ext3_free_blocks: nonexistent device");
503 return;
504 }
505 ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
506 if (dquot_freed_blocks)
507 DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
508 return;
509}
510
511/*
512 * For ext3 allocations, we must not reuse any blocks which are
513 * allocated in the bitmap buffer's "last committed data" copy. This
514 * prevents deletes from freeing up the page for reuse until we have
515 * committed the delete transaction.
516 *
517 * If we didn't do this, then deleting something and reallocating it as
518 * data would allow the old block to be overwritten before the
519 * transaction committed (because we force data to disk before commit).
520 * This would lead to corruption if we crashed between overwriting the
521 * data and committing the delete.
522 *
523 * @@@ We may want to make this allocation behaviour conditional on
524 * data-writes at some point, and disable it for metadata allocations or
525 * sync-data inodes.
526 */
527static int ext3_test_allocatable(int nr, struct buffer_head *bh)
528{
529 int ret;
530 struct journal_head *jh = bh2jh(bh);
531
532 if (ext3_test_bit(nr, bh->b_data))
533 return 0;
534
535 jbd_lock_bh_state(bh);
536 if (!jh->b_committed_data)
537 ret = 1;
538 else
539 ret = !ext3_test_bit(nr, jh->b_committed_data);
540 jbd_unlock_bh_state(bh);
541 return ret;
542}
543
544static int
545bitmap_search_next_usable_block(int start, struct buffer_head *bh,
546 int maxblocks)
547{
548 int next;
549 struct journal_head *jh = bh2jh(bh);
550
551 /*
552 * The bitmap search --- search forward alternately through the actual
553 * bitmap and the last-committed copy until we find a bit free in
554 * both
555 */
556 while (start < maxblocks) {
557 next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start);
558 if (next >= maxblocks)
559 return -1;
560 if (ext3_test_allocatable(next, bh))
561 return next;
562 jbd_lock_bh_state(bh);
563 if (jh->b_committed_data)
564 start = ext3_find_next_zero_bit(jh->b_committed_data,
565 maxblocks, next);
566 jbd_unlock_bh_state(bh);
567 }
568 return -1;
569}
570
571/*
572 * Find an allocatable block in a bitmap. We honour both the bitmap and
573 * its last-committed copy (if that exists), and perform the "most
574 * appropriate allocation" algorithm of looking for a free block near
575 * the initial goal; then for a free byte somewhere in the bitmap; then
576 * for any free bit in the bitmap.
577 */
578static int
579find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
580{
581 int here, next;
582 char *p, *r;
583
584 if (start > 0) {
585 /*
586 * The goal was occupied; search forward for a free
587 * block within the next XX blocks.
588 *
589 * end_goal is more or less random, but it has to be
590 * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
591 * next 64-bit boundary is simple..
592 */
593 int end_goal = (start + 63) & ~63;
594 if (end_goal > maxblocks)
595 end_goal = maxblocks;
596 here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
597 if (here < end_goal && ext3_test_allocatable(here, bh))
598 return here;
599 ext3_debug("Bit not found near goal\n");
600 }
601
602 here = start;
603 if (here < 0)
604 here = 0;
605
606 p = ((char *)bh->b_data) + (here >> 3);
607 r = memscan(p, 0, (maxblocks - here + 7) >> 3);
608 next = (r - ((char *)bh->b_data)) << 3;
609
610 if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
611 return next;
612
613 /*
614 * The bitmap search --- search forward alternately through the actual
615 * bitmap and the last-committed copy until we find a bit free in
616 * both
617 */
618 here = bitmap_search_next_usable_block(here, bh, maxblocks);
619 return here;
620}
621
622/*
623 * We think we can allocate this block in this bitmap. Try to set the bit.
624 * If that succeeds then check that nobody has allocated and then freed the
625 * block since we saw that is was not marked in b_committed_data. If it _was_
626 * allocated and freed then clear the bit in the bitmap again and return
627 * zero (failure).
628 */
629static inline int
630claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
631{
632 struct journal_head *jh = bh2jh(bh);
633 int ret;
634
635 if (ext3_set_bit_atomic(lock, block, bh->b_data))
636 return 0;
637 jbd_lock_bh_state(bh);
638 if (jh->b_committed_data && ext3_test_bit(block,jh->b_committed_data)) {
639 ext3_clear_bit_atomic(lock, block, bh->b_data);
640 ret = 0;
641 } else {
642 ret = 1;
643 }
644 jbd_unlock_bh_state(bh);
645 return ret;
646}
647
648/*
649 * If we failed to allocate the desired block then we may end up crossing to a
650 * new bitmap. In that case we must release write access to the old one via
651 * ext3_journal_release_buffer(), else we'll run out of credits.
652 */
653static int
654ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
655 struct buffer_head *bitmap_bh, int goal, struct ext3_reserve_window *my_rsv)
656{
657 int group_first_block, start, end;
658
659 /* we do allocation within the reservation window if we have a window */
660 if (my_rsv) {
661 group_first_block =
662 le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
663 group * EXT3_BLOCKS_PER_GROUP(sb);
664 if (my_rsv->_rsv_start >= group_first_block)
665 start = my_rsv->_rsv_start - group_first_block;
666 else
667 /* reservation window cross group boundary */
668 start = 0;
669 end = my_rsv->_rsv_end - group_first_block + 1;
670 if (end > EXT3_BLOCKS_PER_GROUP(sb))
671 /* reservation window crosses group boundary */
672 end = EXT3_BLOCKS_PER_GROUP(sb);
673 if ((start <= goal) && (goal < end))
674 start = goal;
675 else
676 goal = -1;
677 } else {
678 if (goal > 0)
679 start = goal;
680 else
681 start = 0;
682 end = EXT3_BLOCKS_PER_GROUP(sb);
683 }
684
685 BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));
686
687repeat:
688 if (goal < 0 || !ext3_test_allocatable(goal, bitmap_bh)) {
689 goal = find_next_usable_block(start, bitmap_bh, end);
690 if (goal < 0)
691 goto fail_access;
692 if (!my_rsv) {
693 int i;
694
695 for (i = 0; i < 7 && goal > start &&
696 ext3_test_allocatable(goal - 1,
697 bitmap_bh);
698 i++, goal--)
699 ;
700 }
701 }
702 start = goal;
703
704 if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) {
705 /*
706 * The block was allocated by another thread, or it was
707 * allocated and then freed by another thread
708 */
709 start++;
710 goal++;
711 if (start >= end)
712 goto fail_access;
713 goto repeat;
714 }
715 return goal;
716fail_access:
717 return -1;
718}
719
720/**
721 * find_next_reservable_window():
722 * find a reservable space within the given range.
723 * It does not allocate the reservation window for now:
724 * alloc_new_reservation() will do the work later.
725 *
726 * @search_head: the head of the searching list;
727 * This is not necessarily the list head of the whole filesystem
728 *
729 * We have both head and start_block to assist the search
730 * for the reservable space. The list starts from head,
731 * but we will shift to the place where start_block is,
732 * then start from there, when looking for a reservable space.
733 *
734 * @size: the target new reservation window size
735 *
736 * @group_first_block: the first block we consider to start
737 * the real search from
738 *
739 * @last_block:
740 * the maximum block number that our goal reservable space
741 * could start from. This is normally the last block in this
742 * group. The search will end when we found the start of next
743 * possible reservable space is out of this boundary.
744 * This could handle the cross boundary reservation window
745 * request.
746 *
747 * basically we search from the given range, rather than the whole
748 * reservation double linked list, (start_block, last_block)
749 * to find a free region that is of my size and has not
750 * been reserved.
751 *
752 * on succeed, it returns the reservation window to be appended to.
753 * failed, return NULL.
754 */
755static struct ext3_reserve_window_node *find_next_reservable_window(
756 struct ext3_reserve_window_node *search_head,
757 unsigned long size, int *start_block,
758 int last_block)
759{
760 struct rb_node *next;
761 struct ext3_reserve_window_node *rsv, *prev;
762 int cur;
763
764 /* TODO: make the start of the reservation window byte-aligned */
765 /* cur = *start_block & ~7;*/
766 cur = *start_block;
767 rsv = search_head;
768 if (!rsv)
769 return NULL;
770
771 while (1) {
772 if (cur <= rsv->rsv_end)
773 cur = rsv->rsv_end + 1;
774
775 /* TODO?
776 * in the case we could not find a reservable space
777 * that is what is expected, during the re-search, we could
778 * remember what's the largest reservable space we could have
779 * and return that one.
780 *
781 * For now it will fail if we could not find the reservable
782 * space with expected-size (or more)...
783 */
784 if (cur > last_block)
785 return NULL; /* fail */
786
787 prev = rsv;
788 next = rb_next(&rsv->rsv_node);
789 rsv = list_entry(next, struct ext3_reserve_window_node, rsv_node);
790
791 /*
792 * Reached the last reservation, we can just append to the
793 * previous one.
794 */
795 if (!next)
796 break;
797
798 if (cur + size <= rsv->rsv_start) {
799 /*
800 * Found a reserveable space big enough. We could
801 * have a reservation across the group boundary here
802 */
803 break;
804 }
805 }
806 /*
807 * we come here either :
808 * when we reach the end of the whole list,
809 * and there is empty reservable space after last entry in the list.
810 * append it to the end of the list.
811 *
812 * or we found one reservable space in the middle of the list,
813 * return the reservation window that we could append to.
814 * succeed.
815 */
816 *start_block = cur;
817 return prev;
818}
819
820/**
821 * alloc_new_reservation()--allocate a new reservation window
822 *
823 * To make a new reservation, we search part of the filesystem
824 * reservation list (the list that inside the group). We try to
825 * allocate a new reservation window near the allocation goal,
826 * or the beginning of the group, if there is no goal.
827 *
828 * We first find a reservable space after the goal, then from
829 * there, we check the bitmap for the first free block after
830 * it. If there is no free block until the end of group, then the
831 * whole group is full, we failed. Otherwise, check if the free
832 * block is inside the expected reservable space, if so, we
833 * succeed.
834 * If the first free block is outside the reservable space, then
835 * start from the first free block, we search for next available
836 * space, and go on.
837 *
838 * on succeed, a new reservation will be found and inserted into the list
839 * It contains at least one free block, and it does not overlap with other
840 * reservation windows.
841 *
842 * failed: we failed to find a reservation window in this group
843 *
844 * @rsv: the reservation
845 *
846 * @goal: The goal (group-relative). It is where the search for a
847 * free reservable space should start from.
848 * if we have a goal(goal >0 ), then start from there,
849 * no goal(goal = -1), we start from the first block
850 * of the group.
851 *
852 * @sb: the super block
853 * @group: the group we are trying to allocate in
854 * @bitmap_bh: the block group block bitmap
855 */
856static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
857 int goal, struct super_block *sb,
858 unsigned int group, struct buffer_head *bitmap_bh)
859{
860 struct ext3_reserve_window_node *search_head;
861 int group_first_block, group_end_block, start_block;
862 int first_free_block;
863 int reservable_space_start;
864 struct ext3_reserve_window_node *prev_rsv;
865 struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
866 unsigned long size;
867
868 group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
869 group * EXT3_BLOCKS_PER_GROUP(sb);
870 group_end_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
871
872 if (goal < 0)
873 start_block = group_first_block;
874 else
875 start_block = goal + group_first_block;
876
877 size = my_rsv->rsv_goal_size;
878 if (!rsv_is_empty(&my_rsv->rsv_window)) {
879 /*
880 * if the old reservation is cross group boundary
881 * and if the goal is inside the old reservation window,
882 * we will come here when we just failed to allocate from
883 * the first part of the window. We still have another part
884 * that belongs to the next group. In this case, there is no
885 * point to discard our window and try to allocate a new one
886 * in this group(which will fail). we should
887 * keep the reservation window, just simply move on.
888 *
889 * Maybe we could shift the start block of the reservation
890 * window to the first block of next group.
891 */
892
893 if ((my_rsv->rsv_start <= group_end_block) &&
894 (my_rsv->rsv_end > group_end_block) &&
895 (start_block >= my_rsv->rsv_start))
896 return -1;
897
898 if ((my_rsv->rsv_alloc_hit >
899 (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
900 /*
901 * if we previously allocation hit ration is greater than half
902 * we double the size of reservation window next time
903 * otherwise keep the same
904 */
905 size = size * 2;
906 if (size > EXT3_MAX_RESERVE_BLOCKS)
907 size = EXT3_MAX_RESERVE_BLOCKS;
908 my_rsv->rsv_goal_size= size;
909 }
910 }
911 /*
912 * shift the search start to the window near the goal block
913 */
914 search_head = search_reserve_window(fs_rsv_root, start_block);
915
916 /*
917 * find_next_reservable_window() simply finds a reservable window
918 * inside the given range(start_block, group_end_block).
919 *
920 * To make sure the reservation window has a free bit inside it, we
921 * need to check the bitmap after we found a reservable window.
922 */
923retry:
924 prev_rsv = find_next_reservable_window(search_head, size,
925 &start_block, group_end_block);
926 if (prev_rsv == NULL)
927 goto failed;
928 reservable_space_start = start_block;
929 /*
930 * On success, find_next_reservable_window() returns the
931 * reservation window where there is a reservable space after it.
932 * Before we reserve this reservable space, we need
933 * to make sure there is at least a free block inside this region.
934 *
935 * searching the first free bit on the block bitmap and copy of
936 * last committed bitmap alternatively, until we found a allocatable
937 * block. Search start from the start block of the reservable space
938 * we just found.
939 */
940 first_free_block = bitmap_search_next_usable_block(
941 reservable_space_start - group_first_block,
942 bitmap_bh, group_end_block - group_first_block + 1);
943
944 if (first_free_block < 0) {
945 /*
946 * no free block left on the bitmap, no point
947 * to reserve the space. return failed.
948 */
949 goto failed;
950 }
951 start_block = first_free_block + group_first_block;
952 /*
953 * check if the first free block is within the
954 * free space we just found
955 */
956 if ((start_block >= reservable_space_start) &&
957 (start_block < reservable_space_start + size))
958 goto found_rsv_window;
959 /*
960 * if the first free bit we found is out of the reservable space
961 * this means there is no free block on the reservable space
962 * we should continue search for next reservable space,
963 * start from where the free block is,
964 * we also shift the list head to where we stopped last time
965 */
966 search_head = prev_rsv;
967 goto retry;
968
969found_rsv_window:
970 /*
971 * great! the reservable space contains some free blocks.
972 * if the search returns that we should add the new
973 * window just next to where the old window, we don't
974 * need to remove the old window first then add it to the
975 * same place, just update the new start and new end.
976 */
977 if (my_rsv != prev_rsv) {
978 if (!rsv_is_empty(&my_rsv->rsv_window))
979 rsv_window_remove(sb, my_rsv);
980 }
981 my_rsv->rsv_start = reservable_space_start;
982 my_rsv->rsv_end = my_rsv->rsv_start + size - 1;
983 my_rsv->rsv_alloc_hit = 0;
984 if (my_rsv != prev_rsv) {
985 ext3_rsv_window_add(sb, my_rsv);
986 }
987 return 0; /* succeed */
988failed:
989 /*
990 * failed to find a new reservation window in the current
991 * group, remove the current(stale) reservation window
992 * if there is any
993 */
994 if (!rsv_is_empty(&my_rsv->rsv_window))
995 rsv_window_remove(sb, my_rsv);
996 return -1; /* failed */
997}
998
999/*
1000 * This is the main function used to allocate a new block and its reservation
1001 * window.
1002 *
1003 * Each time when a new block allocation is need, first try to allocate from
1004 * its own reservation. If it does not have a reservation window, instead of
1005 * looking for a free bit on bitmap first, then look up the reservation list to
1006 * see if it is inside somebody else's reservation window, we try to allocate a
1007 * reservation window for it starting from the goal first. Then do the block
1008 * allocation within the reservation window.
1009 *
1010 * This will avoid keeping on searching the reservation list again and
1011 * again when someboday is looking for a free block (without
1012 * reservation), and there are lots of free blocks, but they are all
1013 * being reserved.
1014 *
1015 * We use a sorted double linked list for the per-filesystem reservation list.
1016 * The insert, remove and find a free space(non-reserved) operations for the
1017 * sorted double linked list should be fast.
1018 *
1019 */
1020static int
1021ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
1022 unsigned int group, struct buffer_head *bitmap_bh,
1023 int goal, struct ext3_reserve_window_node * my_rsv,
1024 int *errp)
1025{
1026 spinlock_t *rsv_lock;
1027 unsigned long group_first_block;
1028 int ret = 0;
1029 int fatal;
1030
1031 *errp = 0;
1032
1033 /*
1034 * Make sure we use undo access for the bitmap, because it is critical
1035 * that we do the frozen_data COW on bitmap buffers in all cases even
1036 * if the buffer is in BJ_Forget state in the committing transaction.
1037 */
1038 BUFFER_TRACE(bitmap_bh, "get undo access for new block");
1039 fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
1040 if (fatal) {
1041 *errp = fatal;
1042 return -1;
1043 }
1044
1045 /*
1046 * we don't deal with reservation when
1047 * filesystem is mounted without reservation
1048 * or the file is not a regular file
1049 * or last attempt to allocate a block with reservation turned on failed
1050 */
1051 if (my_rsv == NULL ) {
1052 ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, NULL);
1053 goto out;
1054 }
1055 rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
1056 /*
1057 * goal is a group relative block number (if there is a goal)
1058 * 0 < goal < EXT3_BLOCKS_PER_GROUP(sb)
1059 * first block is a filesystem wide block number
1060 * first block is the block number of the first block in this group
1061 */
1062 group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
1063 group * EXT3_BLOCKS_PER_GROUP(sb);
1064
1065 /*
1066 * Basically we will allocate a new block from inode's reservation
1067 * window.
1068 *
1069 * We need to allocate a new reservation window, if:
1070 * a) inode does not have a reservation window; or
1071 * b) last attempt to allocate a block from existing reservation
1072 * failed; or
1073 * c) we come here with a goal and with a reservation window
1074 *
1075 * We do not need to allocate a new reservation window if we come here
1076 * at the beginning with a goal and the goal is inside the window, or
1077 * we don't have a goal but already have a reservation window.
1078 * then we could go to allocate from the reservation window directly.
1079 */
1080 while (1) {
1081 struct ext3_reserve_window rsv_copy;
1082
1083 rsv_copy._rsv_start = my_rsv->rsv_start;
1084 rsv_copy._rsv_end = my_rsv->rsv_end;
1085
1086 if (rsv_is_empty(&rsv_copy) || (ret < 0) ||
1087 !goal_in_my_reservation(&rsv_copy, goal, group, sb)) {
1088 spin_lock(rsv_lock);
1089 ret = alloc_new_reservation(my_rsv, goal, sb,
1090 group, bitmap_bh);
1091 rsv_copy._rsv_start = my_rsv->rsv_start;
1092 rsv_copy._rsv_end = my_rsv->rsv_end;
1093 spin_unlock(rsv_lock);
1094 if (ret < 0)
1095 break; /* failed */
1096
1097 if (!goal_in_my_reservation(&rsv_copy, goal, group, sb))
1098 goal = -1;
1099 }
1100 if ((rsv_copy._rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb))
1101 || (rsv_copy._rsv_end < group_first_block))
1102 BUG();
1103 ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal,
1104 &rsv_copy);
1105 if (ret >= 0) {
1106 my_rsv->rsv_alloc_hit++;
1107 break; /* succeed */
1108 }
1109 }
1110out:
1111 if (ret >= 0) {
1112 BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
1113 "bitmap block");
1114 fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
1115 if (fatal) {
1116 *errp = fatal;
1117 return -1;
1118 }
1119 return ret;
1120 }
1121
1122 BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
1123 ext3_journal_release_buffer(handle, bitmap_bh);
1124 return ret;
1125}
1126
1127static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
1128{
1129 int free_blocks, root_blocks;
1130
1131 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
1132 root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
1133 if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
1134 sbi->s_resuid != current->fsuid &&
1135 (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
1136 return 0;
1137 }
1138 return 1;
1139}
1140
1141/*
1142 * ext3_should_retry_alloc() is called when ENOSPC is returned, and if
1143 * it is profitable to retry the operation, this function will wait
1144 * for the current or commiting transaction to complete, and then
1145 * return TRUE.
1146 */
1147int ext3_should_retry_alloc(struct super_block *sb, int *retries)
1148{
1149 if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3)
1150 return 0;
1151
1152 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
1153
1154 return journal_force_commit_nested(EXT3_SB(sb)->s_journal);
1155}
1156
1157/*
1158 * ext3_new_block uses a goal block to assist allocation. If the goal is
1159 * free, or there is a free block within 32 blocks of the goal, that block
1160 * is allocated. Otherwise a forward search is made for a free block; within
1161 * each block group the search first looks for an entire free byte in the block
1162 * bitmap, and then for any free bit if that fails.
1163 * This function also updates quota and i_blocks field.
1164 */
1165int ext3_new_block(handle_t *handle, struct inode *inode,
1166 unsigned long goal, int *errp)
1167{
1168 struct buffer_head *bitmap_bh = NULL;
1169 struct buffer_head *gdp_bh;
1170 int group_no;
1171 int goal_group;
1172 int ret_block;
1173 int bgi; /* blockgroup iteration index */
1174 int target_block;
1175 int fatal = 0, err;
1176 int performed_allocation = 0;
1177 int free_blocks;
1178 struct super_block *sb;
1179 struct ext3_group_desc *gdp;
1180 struct ext3_super_block *es;
1181 struct ext3_sb_info *sbi;
1182 struct ext3_reserve_window_node *my_rsv = NULL;
1183 struct ext3_block_alloc_info *block_i;
1184 unsigned short windowsz = 0;
1185#ifdef EXT3FS_DEBUG
1186 static int goal_hits, goal_attempts;
1187#endif
1188 unsigned long ngroups;
1189
1190 *errp = -ENOSPC;
1191 sb = inode->i_sb;
1192 if (!sb) {
1193 printk("ext3_new_block: nonexistent device");
1194 return 0;
1195 }
1196
1197 /*
1198 * Check quota for allocation of this block.
1199 */
1200 if (DQUOT_ALLOC_BLOCK(inode, 1)) {
1201 *errp = -EDQUOT;
1202 return 0;
1203 }
1204
1205 sbi = EXT3_SB(sb);
1206 es = EXT3_SB(sb)->s_es;
1207 ext3_debug("goal=%lu.\n", goal);
1208 /*
1209 * Allocate a block from reservation only when
1210 * filesystem is mounted with reservation(default,-o reservation), and
1211 * it's a regular file, and
1212 * the desired window size is greater than 0 (One could use ioctl
1213 * command EXT3_IOC_SETRSVSZ to set the window size to 0 to turn off
1214 * reservation on that particular file)
1215 */
1216 block_i = EXT3_I(inode)->i_block_alloc_info;
1217 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1218 my_rsv = &block_i->rsv_window_node;
1219
1220 if (!ext3_has_free_blocks(sbi)) {
1221 *errp = -ENOSPC;
1222 goto out;
1223 }
1224
1225 /*
1226 * First, test whether the goal block is free.
1227 */
1228 if (goal < le32_to_cpu(es->s_first_data_block) ||
1229 goal >= le32_to_cpu(es->s_blocks_count))
1230 goal = le32_to_cpu(es->s_first_data_block);
1231 group_no = (goal - le32_to_cpu(es->s_first_data_block)) /
1232 EXT3_BLOCKS_PER_GROUP(sb);
1233 gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
1234 if (!gdp)
1235 goto io_error;
1236
1237 goal_group = group_no;
1238retry:
1239 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1240 /*
1241 * if there is not enough free blocks to make a new resevation
1242 * turn off reservation for this allocation
1243 */
1244 if (my_rsv && (free_blocks < windowsz)
1245 && (rsv_is_empty(&my_rsv->rsv_window)))
1246 my_rsv = NULL;
1247
1248 if (free_blocks > 0) {
1249 ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
1250 EXT3_BLOCKS_PER_GROUP(sb));
1251 bitmap_bh = read_block_bitmap(sb, group_no);
1252 if (!bitmap_bh)
1253 goto io_error;
1254 ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
1255 bitmap_bh, ret_block, my_rsv, &fatal);
1256 if (fatal)
1257 goto out;
1258 if (ret_block >= 0)
1259 goto allocated;
1260 }
1261
1262 ngroups = EXT3_SB(sb)->s_groups_count;
1263 smp_rmb();
1264
1265 /*
1266 * Now search the rest of the groups. We assume that
1267 * i and gdp correctly point to the last group visited.
1268 */
1269 for (bgi = 0; bgi < ngroups; bgi++) {
1270 group_no++;
1271 if (group_no >= ngroups)
1272 group_no = 0;
1273 gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
1274 if (!gdp) {
1275 *errp = -EIO;
1276 goto out;
1277 }
1278 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1279 /*
1280 * skip this group if the number of
1281 * free blocks is less than half of the reservation
1282 * window size.
1283 */
1284 if (free_blocks <= (windowsz/2))
1285 continue;
1286
1287 brelse(bitmap_bh);
1288 bitmap_bh = read_block_bitmap(sb, group_no);
1289 if (!bitmap_bh)
1290 goto io_error;
1291 ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
1292 bitmap_bh, -1, my_rsv, &fatal);
1293 if (fatal)
1294 goto out;
1295 if (ret_block >= 0)
1296 goto allocated;
1297 }
1298 /*
1299 * We may end up a bogus ealier ENOSPC error due to
1300 * filesystem is "full" of reservations, but
1301 * there maybe indeed free blocks avaliable on disk
1302 * In this case, we just forget about the reservations
1303 * just do block allocation as without reservations.
1304 */
1305 if (my_rsv) {
1306 my_rsv = NULL;
1307 group_no = goal_group;
1308 goto retry;
1309 }
1310 /* No space left on the device */
1311 *errp = -ENOSPC;
1312 goto out;
1313
1314allocated:
1315
1316 ext3_debug("using block group %d(%d)\n",
1317 group_no, gdp->bg_free_blocks_count);
1318
1319 BUFFER_TRACE(gdp_bh, "get_write_access");
1320 fatal = ext3_journal_get_write_access(handle, gdp_bh);
1321 if (fatal)
1322 goto out;
1323
1324 target_block = ret_block + group_no * EXT3_BLOCKS_PER_GROUP(sb)
1325 + le32_to_cpu(es->s_first_data_block);
1326
1327 if (target_block == le32_to_cpu(gdp->bg_block_bitmap) ||
1328 target_block == le32_to_cpu(gdp->bg_inode_bitmap) ||
1329 in_range(target_block, le32_to_cpu(gdp->bg_inode_table),
1330 EXT3_SB(sb)->s_itb_per_group))
1331 ext3_error(sb, "ext3_new_block",
1332 "Allocating block in system zone - "
1333 "block = %u", target_block);
1334
1335 performed_allocation = 1;
1336
1337#ifdef CONFIG_JBD_DEBUG
1338 {
1339 struct buffer_head *debug_bh;
1340
1341 /* Record bitmap buffer state in the newly allocated block */
1342 debug_bh = sb_find_get_block(sb, target_block);
1343 if (debug_bh) {
1344 BUFFER_TRACE(debug_bh, "state when allocated");
1345 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
1346 brelse(debug_bh);
1347 }
1348 }
1349 jbd_lock_bh_state(bitmap_bh);
1350 spin_lock(sb_bgl_lock(sbi, group_no));
1351 if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
1352 if (ext3_test_bit(ret_block,
1353 bh2jh(bitmap_bh)->b_committed_data)) {
1354 printk("%s: block was unexpectedly set in "
1355 "b_committed_data\n", __FUNCTION__);
1356 }
1357 }
1358 ext3_debug("found bit %d\n", ret_block);
1359 spin_unlock(sb_bgl_lock(sbi, group_no));
1360 jbd_unlock_bh_state(bitmap_bh);
1361#endif
1362
1363 /* ret_block was blockgroup-relative. Now it becomes fs-relative */
1364 ret_block = target_block;
1365
1366 if (ret_block >= le32_to_cpu(es->s_blocks_count)) {
1367 ext3_error(sb, "ext3_new_block",
1368 "block(%d) >= blocks count(%d) - "
1369 "block_group = %d, es == %p ", ret_block,
1370 le32_to_cpu(es->s_blocks_count), group_no, es);
1371 goto out;
1372 }
1373
1374 /*
1375 * It is up to the caller to add the new buffer to a journal
1376 * list of some description. We don't know in advance whether
1377 * the caller wants to use it as metadata or data.
1378 */
1379 ext3_debug("allocating block %d. Goal hits %d of %d.\n",
1380 ret_block, goal_hits, goal_attempts);
1381
1382 spin_lock(sb_bgl_lock(sbi, group_no));
1383 gdp->bg_free_blocks_count =
1384 cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1);
1385 spin_unlock(sb_bgl_lock(sbi, group_no));
1386 percpu_counter_mod(&sbi->s_freeblocks_counter, -1);
1387
1388 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1389 err = ext3_journal_dirty_metadata(handle, gdp_bh);
1390 if (!fatal)
1391 fatal = err;
1392
1393 sb->s_dirt = 1;
1394 if (fatal)
1395 goto out;
1396
1397 *errp = 0;
1398 brelse(bitmap_bh);
1399 return ret_block;
1400
1401io_error:
1402 *errp = -EIO;
1403out:
1404 if (fatal) {
1405 *errp = fatal;
1406 ext3_std_error(sb, fatal);
1407 }
1408 /*
1409 * Undo the block allocation
1410 */
1411 if (!performed_allocation)
1412 DQUOT_FREE_BLOCK(inode, 1);
1413 brelse(bitmap_bh);
1414 return 0;
1415}
1416
1417unsigned long ext3_count_free_blocks(struct super_block *sb)
1418{
1419 unsigned long desc_count;
1420 struct ext3_group_desc *gdp;
1421 int i;
1422 unsigned long ngroups;
1423#ifdef EXT3FS_DEBUG
1424 struct ext3_super_block *es;
1425 unsigned long bitmap_count, x;
1426 struct buffer_head *bitmap_bh = NULL;
1427
1428 lock_super(sb);
1429 es = EXT3_SB(sb)->s_es;
1430 desc_count = 0;
1431 bitmap_count = 0;
1432 gdp = NULL;
1433 for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
1434 gdp = ext3_get_group_desc(sb, i, NULL);
1435 if (!gdp)
1436 continue;
1437 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1438 brelse(bitmap_bh);
1439 bitmap_bh = read_block_bitmap(sb, i);
1440 if (bitmap_bh == NULL)
1441 continue;
1442
1443 x = ext3_count_free(bitmap_bh, sb->s_blocksize);
1444 printk("group %d: stored = %d, counted = %lu\n",
1445 i, le16_to_cpu(gdp->bg_free_blocks_count), x);
1446 bitmap_count += x;
1447 }
1448 brelse(bitmap_bh);
1449 printk("ext3_count_free_blocks: stored = %u, computed = %lu, %lu\n",
1450 le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count);
1451 unlock_super(sb);
1452 return bitmap_count;
1453#else
1454 desc_count = 0;
1455 ngroups = EXT3_SB(sb)->s_groups_count;
1456 smp_rmb();
1457 for (i = 0; i < ngroups; i++) {
1458 gdp = ext3_get_group_desc(sb, i, NULL);
1459 if (!gdp)
1460 continue;
1461 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1462 }
1463
1464 return desc_count;
1465#endif
1466}
1467
1468static inline int
1469block_in_use(unsigned long block, struct super_block *sb, unsigned char *map)
1470{
1471 return ext3_test_bit ((block -
1472 le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) %
1473 EXT3_BLOCKS_PER_GROUP(sb), map);
1474}
1475
/*
 * Return 1 if @a is an exact positive power of @b (b, b*b, ...), else 0.
 *
 * The check divides @a down instead of multiplying a candidate up, so it
 * cannot overflow a signed int for large @a, and it terminates for any
 * base (a base below 2 only ever matches itself).
 */
static inline int test_root(int a, int b)
{
	if (b < 2)
		return a == b;

	while (a > b) {
		/* a non-multiple of b can never be a power of b */
		if (a % b != 0)
			return 0;
		a /= b;
	}
	return a == b;
}
1484
/*
 * Return 1 when @group is 0, 1, or an odd number that test_root()
 * accepts for base 7, 5 or 3; return 0 otherwise.
 */
static int ext3_group_sparse(int group)
{
	if (group <= 1)
		return 1;
	/* even groups above 1 never qualify */
	if ((group & 1) == 0)
		return 0;
	if (test_root(group, 7))
		return 1;
	if (test_root(group, 5))
		return 1;
	return test_root(group, 3) ? 1 : 0;
}
1494
1495/**
1496 * ext3_bg_has_super - number of blocks used by the superblock in group
1497 * @sb: superblock for filesystem
1498 * @group: group number to check
1499 *
1500 * Return the number of blocks used by the superblock (primary or backup)
1501 * in this group. Currently this will be only 0 or 1.
1502 */
1503int ext3_bg_has_super(struct super_block *sb, int group)
1504{
1505 if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&&
1506 !ext3_group_sparse(group))
1507 return 0;
1508 return 1;
1509}
1510
1511/**
1512 * ext3_bg_num_gdb - number of blocks used by the group table in group
1513 * @sb: superblock for filesystem
1514 * @group: group number to check
1515 *
1516 * Return the number of blocks used by the group descriptor table
1517 * (primary or backup) in this group. In the future there may be a
1518 * different number of descriptor blocks in each group.
1519 */
1520unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
1521{
1522 if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&&
1523 !ext3_group_sparse(group))
1524 return 0;
1525 return EXT3_SB(sb)->s_gdb_count;
1526}
1527
1528#ifdef CONFIG_EXT3_CHECK
1529/* Called at mount-time, super-block is locked */
1530void ext3_check_blocks_bitmap (struct super_block * sb)
1531{
1532 struct ext3_super_block *es;
1533 unsigned long desc_count, bitmap_count, x, j;
1534 unsigned long desc_blocks;
1535 struct buffer_head *bitmap_bh = NULL;
1536 struct ext3_group_desc *gdp;
1537 int i;
1538
1539 es = EXT3_SB(sb)->s_es;
1540 desc_count = 0;
1541 bitmap_count = 0;
1542 gdp = NULL;
1543 for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
1544 gdp = ext3_get_group_desc (sb, i, NULL);
1545 if (!gdp)
1546 continue;
1547 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1548 brelse(bitmap_bh);
1549 bitmap_bh = read_block_bitmap(sb, i);
1550 if (bitmap_bh == NULL)
1551 continue;
1552
1553 if (ext3_bg_has_super(sb, i) &&
1554 !ext3_test_bit(0, bitmap_bh->b_data))
1555 ext3_error(sb, __FUNCTION__,
1556 "Superblock in group %d is marked free", i);
1557
1558 desc_blocks = ext3_bg_num_gdb(sb, i);
1559 for (j = 0; j < desc_blocks; j++)
1560 if (!ext3_test_bit(j + 1, bitmap_bh->b_data))
1561 ext3_error(sb, __FUNCTION__,
1562 "Descriptor block #%ld in group "
1563 "%d is marked free", j, i);
1564
1565 if (!block_in_use (le32_to_cpu(gdp->bg_block_bitmap),
1566 sb, bitmap_bh->b_data))
1567 ext3_error (sb, "ext3_check_blocks_bitmap",
1568 "Block bitmap for group %d is marked free",
1569 i);
1570
1571 if (!block_in_use (le32_to_cpu(gdp->bg_inode_bitmap),
1572 sb, bitmap_bh->b_data))
1573 ext3_error (sb, "ext3_check_blocks_bitmap",
1574 "Inode bitmap for group %d is marked free",
1575 i);
1576
1577 for (j = 0; j < EXT3_SB(sb)->s_itb_per_group; j++)
1578 if (!block_in_use (le32_to_cpu(gdp->bg_inode_table) + j,
1579 sb, bitmap_bh->b_data))
1580 ext3_error (sb, "ext3_check_blocks_bitmap",
1581 "Block #%d of the inode table in "
1582 "group %d is marked free", j, i);
1583
1584 x = ext3_count_free(bitmap_bh, sb->s_blocksize);
1585 if (le16_to_cpu(gdp->bg_free_blocks_count) != x)
1586 ext3_error (sb, "ext3_check_blocks_bitmap",
1587 "Wrong free blocks count for group %d, "
1588 "stored = %d, counted = %lu", i,
1589 le16_to_cpu(gdp->bg_free_blocks_count), x);
1590 bitmap_count += x;
1591 }
1592 brelse(bitmap_bh);
1593 if (le32_to_cpu(es->s_free_blocks_count) != bitmap_count)
1594 ext3_error (sb, "ext3_check_blocks_bitmap",
1595 "Wrong free blocks count in super block, "
1596 "stored = %lu, counted = %lu",
1597 (unsigned long)le32_to_cpu(es->s_free_blocks_count),
1598 bitmap_count);
1599}
1600#endif
diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c
new file mode 100644
index 000000000000..6c419b9ab0e8
--- /dev/null
+++ b/fs/ext3/bitmap.c
@@ -0,0 +1,26 @@
1/*
2 * linux/fs/ext3/bitmap.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 */
9
10#include <linux/buffer_head.h>
11
12
13static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
14
15unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars)
16{
17 unsigned int i;
18 unsigned long sum = 0;
19
20 if (!map)
21 return (0);
22 for (i = 0; i < numchars; i++)
23 sum += nibblemap[map->b_data[i] & 0xf] +
24 nibblemap[(map->b_data[i] >> 4) & 0xf];
25 return (sum);
26}
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
new file mode 100644
index 000000000000..832867aef3dc
--- /dev/null
+++ b/fs/ext3/dir.c
@@ -0,0 +1,519 @@
1/*
2 * linux/fs/ext3/dir.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/dir.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * ext3 directory handling functions
16 *
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 *
20 * Hash Tree Directory indexing (c) 2001 Daniel Phillips
21 *
22 */
23
24#include <linux/fs.h>
25#include <linux/jbd.h>
26#include <linux/ext3_fs.h>
27#include <linux/buffer_head.h>
28#include <linux/smp_lock.h>
29#include <linux/slab.h>
30#include <linux/rbtree.h>
31
32static unsigned char ext3_filetype_table[] = {
33 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
34};
35
36static int ext3_readdir(struct file *, void *, filldir_t);
37static int ext3_dx_readdir(struct file * filp,
38 void * dirent, filldir_t filldir);
39static int ext3_release_dir (struct inode * inode,
40 struct file * filp);
41
42struct file_operations ext3_dir_operations = {
43 .llseek = generic_file_llseek,
44 .read = generic_read_dir,
45 .readdir = ext3_readdir, /* we take BKL. needed?*/
46 .ioctl = ext3_ioctl, /* BKL held */
47 .fsync = ext3_sync_file, /* BKL held */
48#ifdef CONFIG_EXT3_INDEX
49 .release = ext3_release_dir,
50#endif
51};
52
53
54static unsigned char get_dtype(struct super_block *sb, int filetype)
55{
56 if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) ||
57 (filetype >= EXT3_FT_MAX))
58 return DT_UNKNOWN;
59
60 return (ext3_filetype_table[filetype]);
61}
62
63
64int ext3_check_dir_entry (const char * function, struct inode * dir,
65 struct ext3_dir_entry_2 * de,
66 struct buffer_head * bh,
67 unsigned long offset)
68{
69 const char * error_msg = NULL;
70 const int rlen = le16_to_cpu(de->rec_len);
71
72 if (rlen < EXT3_DIR_REC_LEN(1))
73 error_msg = "rec_len is smaller than minimal";
74 else if (rlen % 4 != 0)
75 error_msg = "rec_len % 4 != 0";
76 else if (rlen < EXT3_DIR_REC_LEN(de->name_len))
77 error_msg = "rec_len is too small for name_len";
78 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
79 error_msg = "directory entry across blocks";
80 else if (le32_to_cpu(de->inode) >
81 le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))
82 error_msg = "inode out of bounds";
83
84 if (error_msg != NULL)
85 ext3_error (dir->i_sb, function,
86 "bad entry in directory #%lu: %s - "
87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
88 dir->i_ino, error_msg, offset,
89 (unsigned long) le32_to_cpu(de->inode),
90 rlen, de->name_len);
91 return error_msg == NULL ? 1 : 0;
92}
93
94static int ext3_readdir(struct file * filp,
95 void * dirent, filldir_t filldir)
96{
97 int error = 0;
98 unsigned long offset, blk;
99 int i, num, stored;
100 struct buffer_head * bh, * tmp, * bha[16];
101 struct ext3_dir_entry_2 * de;
102 struct super_block * sb;
103 int err;
104 struct inode *inode = filp->f_dentry->d_inode;
105 int ret = 0;
106
107 sb = inode->i_sb;
108
109#ifdef CONFIG_EXT3_INDEX
110 if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
111 EXT3_FEATURE_COMPAT_DIR_INDEX) &&
112 ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
113 ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
114 err = ext3_dx_readdir(filp, dirent, filldir);
115 if (err != ERR_BAD_DX_DIR) {
116 ret = err;
117 goto out;
118 }
119 /*
120 * We don't set the inode dirty flag since it's not
121 * critical that it get flushed back to the disk.
122 */
123 EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL;
124 }
125#endif
126 stored = 0;
127 bh = NULL;
128 offset = filp->f_pos & (sb->s_blocksize - 1);
129
130 while (!error && !stored && filp->f_pos < inode->i_size) {
131 blk = (filp->f_pos) >> EXT3_BLOCK_SIZE_BITS(sb);
132 bh = ext3_bread(NULL, inode, blk, 0, &err);
133 if (!bh) {
134 ext3_error (sb, "ext3_readdir",
135 "directory #%lu contains a hole at offset %lu",
136 inode->i_ino, (unsigned long)filp->f_pos);
137 filp->f_pos += sb->s_blocksize - offset;
138 continue;
139 }
140
141 /*
142 * Do the readahead
143 */
144 if (!offset) {
145 for (i = 16 >> (EXT3_BLOCK_SIZE_BITS(sb) - 9), num = 0;
146 i > 0; i--) {
147 tmp = ext3_getblk (NULL, inode, ++blk, 0, &err);
148 if (tmp && !buffer_uptodate(tmp) &&
149 !buffer_locked(tmp))
150 bha[num++] = tmp;
151 else
152 brelse (tmp);
153 }
154 if (num) {
155 ll_rw_block (READA, num, bha);
156 for (i = 0; i < num; i++)
157 brelse (bha[i]);
158 }
159 }
160
161revalidate:
162 /* If the dir block has changed since the last call to
163 * readdir(2), then we might be pointing to an invalid
164 * dirent right now. Scan from the start of the block
165 * to make sure. */
166 if (filp->f_version != inode->i_version) {
167 for (i = 0; i < sb->s_blocksize && i < offset; ) {
168 de = (struct ext3_dir_entry_2 *)
169 (bh->b_data + i);
170 /* It's too expensive to do a full
171 * dirent test each time round this
172 * loop, but we do have to test at
173 * least that it is non-zero. A
174 * failure will be detected in the
175 * dirent test below. */
176 if (le16_to_cpu(de->rec_len) <
177 EXT3_DIR_REC_LEN(1))
178 break;
179 i += le16_to_cpu(de->rec_len);
180 }
181 offset = i;
182 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
183 | offset;
184 filp->f_version = inode->i_version;
185 }
186
187 while (!error && filp->f_pos < inode->i_size
188 && offset < sb->s_blocksize) {
189 de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
190 if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
191 bh, offset)) {
192 /* On error, skip the f_pos to the
193 next block. */
194 filp->f_pos = (filp->f_pos |
195 (sb->s_blocksize - 1)) + 1;
196 brelse (bh);
197 ret = stored;
198 goto out;
199 }
200 offset += le16_to_cpu(de->rec_len);
201 if (le32_to_cpu(de->inode)) {
202 /* We might block in the next section
203 * if the data destination is
204 * currently swapped out. So, use a
205 * version stamp to detect whether or
206 * not the directory has been modified
207 * during the copy operation.
208 */
209 unsigned long version = filp->f_version;
210
211 error = filldir(dirent, de->name,
212 de->name_len,
213 filp->f_pos,
214 le32_to_cpu(de->inode),
215 get_dtype(sb, de->file_type));
216 if (error)
217 break;
218 if (version != filp->f_version)
219 goto revalidate;
220 stored ++;
221 }
222 filp->f_pos += le16_to_cpu(de->rec_len);
223 }
224 offset = 0;
225 brelse (bh);
226 }
227out:
228 return ret;
229}
230
231#ifdef CONFIG_EXT3_INDEX
/*
 * These macros convert between the major/minor hash and an f_pos
 * value.
 *
 * Currently we only use the major hash number.  This is unfortunate, but
 * on 32-bit machines, the same VFS interface is used for lseek and
 * llseek, so if we use the 64 bit offset, then the 32-bit versions of
 * lseek/telldir/seekdir will blow out spectacularly, and from within
 * the low-level routine, we don't know if we're being called by
 * a 64-bit version of the system call or the 32-bit version of the
 * system call.  Worse yet, NFSv2 only allows for a 32-bit readdir
 * cookie.  Sigh.
 *
 * Every argument is parenthesized so that expressions such as
 * hash2pos(a | b, 0) expand with the intended precedence.
 */
#define hash2pos(major, minor)	((major) >> 1)
#define pos2maj_hash(pos)	(((pos) << 1) & 0xffffffff)
#define pos2min_hash(pos)	(0)
248
249/*
250 * This structure holds the nodes of the red-black tree used to store
251 * the directory entry in hash order.
252 */
253struct fname {
254 __u32 hash;
255 __u32 minor_hash;
256 struct rb_node rb_hash;
257 struct fname *next;
258 __u32 inode;
259 __u8 name_len;
260 __u8 file_type;
261 char name[0];
262};
263
264/*
265 * This functoin implements a non-recursive way of freeing all of the
266 * nodes in the red-black tree.
267 */
268static void free_rb_tree_fname(struct rb_root *root)
269{
270 struct rb_node *n = root->rb_node;
271 struct rb_node *parent;
272 struct fname *fname;
273
274 while (n) {
275 /* Do the node's children first */
276 if ((n)->rb_left) {
277 n = n->rb_left;
278 continue;
279 }
280 if (n->rb_right) {
281 n = n->rb_right;
282 continue;
283 }
284 /*
285 * The node has no children; free it, and then zero
286 * out parent's link to it. Finally go to the
287 * beginning of the loop and try to free the parent
288 * node.
289 */
290 parent = n->rb_parent;
291 fname = rb_entry(n, struct fname, rb_hash);
292 while (fname) {
293 struct fname * old = fname;
294 fname = fname->next;
295 kfree (old);
296 }
297 if (!parent)
298 root->rb_node = NULL;
299 else if (parent->rb_left == n)
300 parent->rb_left = NULL;
301 else if (parent->rb_right == n)
302 parent->rb_right = NULL;
303 n = parent;
304 }
305 root->rb_node = NULL;
306}
307
308
309static struct dir_private_info *create_dir_info(loff_t pos)
310{
311 struct dir_private_info *p;
312
313 p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
314 if (!p)
315 return NULL;
316 p->root.rb_node = NULL;
317 p->curr_node = NULL;
318 p->extra_fname = NULL;
319 p->last_pos = 0;
320 p->curr_hash = pos2maj_hash(pos);
321 p->curr_minor_hash = pos2min_hash(pos);
322 p->next_hash = 0;
323 return p;
324}
325
326void ext3_htree_free_dir_info(struct dir_private_info *p)
327{
328 free_rb_tree_fname(&p->root);
329 kfree(p);
330}
331
332/*
333 * Given a directory entry, enter it into the fname rb tree.
334 */
335int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
336 __u32 minor_hash,
337 struct ext3_dir_entry_2 *dirent)
338{
339 struct rb_node **p, *parent = NULL;
340 struct fname * fname, *new_fn;
341 struct dir_private_info *info;
342 int len;
343
344 info = (struct dir_private_info *) dir_file->private_data;
345 p = &info->root.rb_node;
346
347 /* Create and allocate the fname structure */
348 len = sizeof(struct fname) + dirent->name_len + 1;
349 new_fn = kmalloc(len, GFP_KERNEL);
350 if (!new_fn)
351 return -ENOMEM;
352 memset(new_fn, 0, len);
353 new_fn->hash = hash;
354 new_fn->minor_hash = minor_hash;
355 new_fn->inode = le32_to_cpu(dirent->inode);
356 new_fn->name_len = dirent->name_len;
357 new_fn->file_type = dirent->file_type;
358 memcpy(new_fn->name, dirent->name, dirent->name_len);
359 new_fn->name[dirent->name_len] = 0;
360
361 while (*p) {
362 parent = *p;
363 fname = rb_entry(parent, struct fname, rb_hash);
364
365 /*
366 * If the hash and minor hash match up, then we put
367 * them on a linked list. This rarely happens...
368 */
369 if ((new_fn->hash == fname->hash) &&
370 (new_fn->minor_hash == fname->minor_hash)) {
371 new_fn->next = fname->next;
372 fname->next = new_fn;
373 return 0;
374 }
375
376 if (new_fn->hash < fname->hash)
377 p = &(*p)->rb_left;
378 else if (new_fn->hash > fname->hash)
379 p = &(*p)->rb_right;
380 else if (new_fn->minor_hash < fname->minor_hash)
381 p = &(*p)->rb_left;
382 else /* if (new_fn->minor_hash > fname->minor_hash) */
383 p = &(*p)->rb_right;
384 }
385
386 rb_link_node(&new_fn->rb_hash, parent, p);
387 rb_insert_color(&new_fn->rb_hash, &info->root);
388 return 0;
389}
390
391
392
393/*
394 * This is a helper function for ext3_dx_readdir. It calls filldir
395 * for all entres on the fname linked list. (Normally there is only
396 * one entry on the linked list, unless there are 62 bit hash collisions.)
397 */
398static int call_filldir(struct file * filp, void * dirent,
399 filldir_t filldir, struct fname *fname)
400{
401 struct dir_private_info *info = filp->private_data;
402 loff_t curr_pos;
403 struct inode *inode = filp->f_dentry->d_inode;
404 struct super_block * sb;
405 int error;
406
407 sb = inode->i_sb;
408
409 if (!fname) {
410 printk("call_filldir: called with null fname?!?\n");
411 return 0;
412 }
413 curr_pos = hash2pos(fname->hash, fname->minor_hash);
414 while (fname) {
415 error = filldir(dirent, fname->name,
416 fname->name_len, curr_pos,
417 fname->inode,
418 get_dtype(sb, fname->file_type));
419 if (error) {
420 filp->f_pos = curr_pos;
421 info->extra_fname = fname->next;
422 return error;
423 }
424 fname = fname->next;
425 }
426 return 0;
427}
428
429static int ext3_dx_readdir(struct file * filp,
430 void * dirent, filldir_t filldir)
431{
432 struct dir_private_info *info = filp->private_data;
433 struct inode *inode = filp->f_dentry->d_inode;
434 struct fname *fname;
435 int ret;
436
437 if (!info) {
438 info = create_dir_info(filp->f_pos);
439 if (!info)
440 return -ENOMEM;
441 filp->private_data = info;
442 }
443
444 if (filp->f_pos == EXT3_HTREE_EOF)
445 return 0; /* EOF */
446
447 /* Some one has messed with f_pos; reset the world */
448 if (info->last_pos != filp->f_pos) {
449 free_rb_tree_fname(&info->root);
450 info->curr_node = NULL;
451 info->extra_fname = NULL;
452 info->curr_hash = pos2maj_hash(filp->f_pos);
453 info->curr_minor_hash = pos2min_hash(filp->f_pos);
454 }
455
456 /*
457 * If there are any leftover names on the hash collision
458 * chain, return them first.
459 */
460 if (info->extra_fname &&
461 call_filldir(filp, dirent, filldir, info->extra_fname))
462 goto finished;
463
464 if (!info->curr_node)
465 info->curr_node = rb_first(&info->root);
466
467 while (1) {
468 /*
469 * Fill the rbtree if we have no more entries,
470 * or the inode has changed since we last read in the
471 * cached entries.
472 */
473 if ((!info->curr_node) ||
474 (filp->f_version != inode->i_version)) {
475 info->curr_node = NULL;
476 free_rb_tree_fname(&info->root);
477 filp->f_version = inode->i_version;
478 ret = ext3_htree_fill_tree(filp, info->curr_hash,
479 info->curr_minor_hash,
480 &info->next_hash);
481 if (ret < 0)
482 return ret;
483 if (ret == 0) {
484 filp->f_pos = EXT3_HTREE_EOF;
485 break;
486 }
487 info->curr_node = rb_first(&info->root);
488 }
489
490 fname = rb_entry(info->curr_node, struct fname, rb_hash);
491 info->curr_hash = fname->hash;
492 info->curr_minor_hash = fname->minor_hash;
493 if (call_filldir(filp, dirent, filldir, fname))
494 break;
495
496 info->curr_node = rb_next(info->curr_node);
497 if (!info->curr_node) {
498 if (info->next_hash == ~0) {
499 filp->f_pos = EXT3_HTREE_EOF;
500 break;
501 }
502 info->curr_hash = info->next_hash;
503 info->curr_minor_hash = 0;
504 }
505 }
506finished:
507 info->last_pos = filp->f_pos;
508 return 0;
509}
510
511static int ext3_release_dir (struct inode * inode, struct file * filp)
512{
513 if (filp->private_data)
514 ext3_htree_free_dir_info(filp->private_data);
515
516 return 0;
517}
518
519#endif
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
new file mode 100644
index 000000000000..5ad8cf0292df
--- /dev/null
+++ b/fs/ext3/file.c
@@ -0,0 +1,131 @@
1/*
2 * linux/fs/ext3/file.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/file.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * ext3 fs regular file handling primitives
16 *
17 * 64-bit file support on 64-bit platforms by Jakub Jelinek
18 * (jj@sunsite.ms.mff.cuni.cz)
19 */
20
21#include <linux/time.h>
22#include <linux/fs.h>
23#include <linux/jbd.h>
24#include <linux/ext3_fs.h>
25#include <linux/ext3_jbd.h>
26#include "xattr.h"
27#include "acl.h"
28
29/*
30 * Called when an inode is released. Note that this is different
31 * from ext3_file_open: open gets called at every open, but release
32 * gets called only when /all/ the files are closed.
33 */
34static int ext3_release_file (struct inode * inode, struct file * filp)
35{
36 /* if we are the last writer on the inode, drop the block reservation */
37 if ((filp->f_mode & FMODE_WRITE) &&
38 (atomic_read(&inode->i_writecount) == 1))
39 ext3_discard_reservation(inode);
40 if (is_dx(inode) && filp->private_data)
41 ext3_htree_free_dir_info(filp->private_data);
42
43 return 0;
44}
45
46static ssize_t
47ext3_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
48{
49 struct file *file = iocb->ki_filp;
50 struct inode *inode = file->f_dentry->d_inode;
51 ssize_t ret;
52 int err;
53
54 ret = generic_file_aio_write(iocb, buf, count, pos);
55
56 /*
57 * Skip flushing if there was an error, or if nothing was written.
58 */
59 if (ret <= 0)
60 return ret;
61
62 /*
63 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
64 * journalling then we need to make sure that we force the transaction
65 * to disk to keep all metadata uptodate synchronously.
66 */
67 if (file->f_flags & O_SYNC) {
68 /*
69 * If we are non-data-journaled, then the dirty data has
70 * already been flushed to backing store by generic_osync_inode,
71 * and the inode has been flushed too if there have been any
72 * modifications other than mere timestamp updates.
73 *
74 * Open question --- do we care about flushing timestamps too
75 * if the inode is IS_SYNC?
76 */
77 if (!ext3_should_journal_data(inode))
78 return ret;
79
80 goto force_commit;
81 }
82
83 /*
84 * So we know that there has been no forced data flush. If the inode
85 * is marked IS_SYNC, we need to force one ourselves.
86 */
87 if (!IS_SYNC(inode))
88 return ret;
89
90 /*
91 * Open question #2 --- should we force data to disk here too? If we
92 * don't, the only impact is that data=writeback filesystems won't
93 * flush data to disk automatically on IS_SYNC, only metadata (but
94 * historically, that is what ext2 has done.)
95 */
96
97force_commit:
98 err = ext3_force_commit(inode->i_sb);
99 if (err)
100 return err;
101 return ret;
102}
103
104struct file_operations ext3_file_operations = {
105 .llseek = generic_file_llseek,
106 .read = do_sync_read,
107 .write = do_sync_write,
108 .aio_read = generic_file_aio_read,
109 .aio_write = ext3_file_write,
110 .readv = generic_file_readv,
111 .writev = generic_file_writev,
112 .ioctl = ext3_ioctl,
113 .mmap = generic_file_mmap,
114 .open = generic_file_open,
115 .release = ext3_release_file,
116 .fsync = ext3_sync_file,
117 .sendfile = generic_file_sendfile,
118};
119
120struct inode_operations ext3_file_inode_operations = {
121 .truncate = ext3_truncate,
122 .setattr = ext3_setattr,
123#ifdef CONFIG_EXT3_FS_XATTR
124 .setxattr = generic_setxattr,
125 .getxattr = generic_getxattr,
126 .listxattr = ext3_listxattr,
127 .removexattr = generic_removexattr,
128#endif
129 .permission = ext3_permission,
130};
131
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
new file mode 100644
index 000000000000..49382a208e05
--- /dev/null
+++ b/fs/ext3/fsync.c
@@ -0,0 +1,88 @@
1/*
2 * linux/fs/ext3/fsync.c
3 *
4 * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com)
5 * from
6 * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
7 * Laboratoire MASI - Institut Blaise Pascal
8 * Universite Pierre et Marie Curie (Paris VI)
9 * from
10 * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds
11 *
12 * ext3fs fsync primitive
13 *
14 * Big-endian to little-endian byte-swapping/bitmaps by
15 * David S. Miller (davem@caip.rutgers.edu), 1995
16 *
17 * Removed unnecessary code duplication for little endian machines
18 * and excessive __inline__s.
19 * Andi Kleen, 1997
20 *
21 * Major simplifications and cleanup - we only need to do the metadata, because
22 * we can depend on generic_block_fdatasync() to sync the data blocks.
23 */
24
25#include <linux/time.h>
26#include <linux/fs.h>
27#include <linux/sched.h>
28#include <linux/writeback.h>
29#include <linux/jbd.h>
30#include <linux/ext3_fs.h>
31#include <linux/ext3_jbd.h>
32
33/*
34 * akpm: A new design for ext3_sync_file().
35 *
36 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
37 * There cannot be a transaction open by this task.
38 * Another task could have dirtied this inode. Its data can be in any
39 * state in the journalling system.
40 *
41 * What we do is just kick off a commit and wait on it. This will snapshot the
42 * inode to disk.
43 */
44
45int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
46{
47 struct inode *inode = dentry->d_inode;
48 int ret = 0;
49
50 J_ASSERT(ext3_journal_current_handle() == 0);
51
52 /*
53 * data=writeback:
54 * The caller's filemap_fdatawrite()/wait will sync the data.
55 * sync_inode() will sync the metadata
56 *
57 * data=ordered:
58 * The caller's filemap_fdatawrite() will write the data and
59 * sync_inode() will write the inode if it is dirty. Then the caller's
60 * filemap_fdatawait() will wait on the pages.
61 *
62 * data=journal:
63 * filemap_fdatawrite won't do anything (the buffers are clean).
64 * ext3_force_commit will write the file data into the journal and
65 * will wait on that.
66 * filemap_fdatawait() will encounter a ton of newly-dirtied pages
67 * (they were dirtied by commit). But that's OK - the blocks are
68 * safe in-journal, which is all fsync() needs to ensure.
69 */
70 if (ext3_should_journal_data(inode)) {
71 ret = ext3_force_commit(inode->i_sb);
72 goto out;
73 }
74
75 /*
76 * The VFS has written the file data. If the inode is unaltered
77 * then we need not start a commit.
78 */
79 if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
80 struct writeback_control wbc = {
81 .sync_mode = WB_SYNC_ALL,
82 .nr_to_write = 0, /* sys_fsync did this */
83 };
84 ret = sync_inode(inode, &wbc);
85 }
86out:
87 return ret;
88}
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
new file mode 100644
index 000000000000..5a2d1235ead0
--- /dev/null
+++ b/fs/ext3/hash.c
@@ -0,0 +1,152 @@
1/*
2 * linux/fs/ext3/hash.c
3 *
4 * Copyright (C) 2002 by Theodore Ts'o
5 *
6 * This file is released under the GPL v2.
7 *
8 * This file may be redistributed under the terms of the GNU Public
9 * License.
10 */
11
12#include <linux/fs.h>
13#include <linux/jbd.h>
14#include <linux/sched.h>
15#include <linux/ext3_fs.h>
16#include <linux/cryptohash.h>
17
18#define DELTA 0x9E3779B9
19
20static void TEA_transform(__u32 buf[4], __u32 const in[])
21{
22 __u32 sum = 0;
23 __u32 b0 = buf[0], b1 = buf[1];
24 __u32 a = in[0], b = in[1], c = in[2], d = in[3];
25 int n = 16;
26
27 do {
28 sum += DELTA;
29 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
30 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
31 } while(--n);
32
33 buf[0] += b0;
34 buf[1] += b1;
35}
36
37
38/* The old legacy hash */
39static __u32 dx_hack_hash (const char *name, int len)
40{
41 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
42 while (len--) {
43 __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
44
45 if (hash & 0x80000000) hash -= 0x7fffffff;
46 hash1 = hash0;
47 hash0 = hash;
48 }
49 return (hash0 << 1);
50}
51
52static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
53{
54 __u32 pad, val;
55 int i;
56
57 pad = (__u32)len | ((__u32)len << 8);
58 pad |= pad << 16;
59
60 val = pad;
61 if (len > num*4)
62 len = num * 4;
63 for (i=0; i < len; i++) {
64 if ((i % 4) == 0)
65 val = pad;
66 val = msg[i] + (val << 8);
67 if ((i % 4) == 3) {
68 *buf++ = val;
69 val = pad;
70 num--;
71 }
72 }
73 if (--num >= 0)
74 *buf++ = val;
75 while (--num >= 0)
76 *buf++ = pad;
77}
78
79/*
80 * Returns the hash of a filename. If len is 0 and name is NULL, then
81 * this function can be used to test whether or not a hash version is
82 * supported.
83 *
84 * The seed is an 4 longword (32 bits) "secret" which can be used to
85 * uniquify a hash. If the seed is all zero's, then some default seed
86 * may be used.
87 *
88 * A particular hash version specifies whether or not the seed is
89 * represented, and whether or not the returned hash is 32 bits or 64
90 * bits. 32 bit hashes will return 0 for the minor hash.
91 */
92int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
93{
94 __u32 hash;
95 __u32 minor_hash = 0;
96 const char *p;
97 int i;
98 __u32 in[8], buf[4];
99
100 /* Initialize the default seed for the hash checksum functions */
101 buf[0] = 0x67452301;
102 buf[1] = 0xefcdab89;
103 buf[2] = 0x98badcfe;
104 buf[3] = 0x10325476;
105
106 /* Check to see if the seed is all zero's */
107 if (hinfo->seed) {
108 for (i=0; i < 4; i++) {
109 if (hinfo->seed[i])
110 break;
111 }
112 if (i < 4)
113 memcpy(buf, hinfo->seed, sizeof(buf));
114 }
115
116 switch (hinfo->hash_version) {
117 case DX_HASH_LEGACY:
118 hash = dx_hack_hash(name, len);
119 break;
120 case DX_HASH_HALF_MD4:
121 p = name;
122 while (len > 0) {
123 str2hashbuf(p, len, in, 8);
124 half_md4_transform(buf, in);
125 len -= 32;
126 p += 32;
127 }
128 minor_hash = buf[2];
129 hash = buf[1];
130 break;
131 case DX_HASH_TEA:
132 p = name;
133 while (len > 0) {
134 str2hashbuf(p, len, in, 4);
135 TEA_transform(buf, in);
136 len -= 16;
137 p += 16;
138 }
139 hash = buf[0];
140 minor_hash = buf[1];
141 break;
142 default:
143 hinfo->hash = 0;
144 return -1;
145 }
146 hash = hash & ~1;
147 if (hash == (EXT3_HTREE_EOF << 1))
148 hash = (EXT3_HTREE_EOF-1) << 1;
149 hinfo->hash = hash;
150 hinfo->minor_hash = minor_hash;
151 return 0;
152}
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
new file mode 100644
index 000000000000..1e6f3ea28713
--- /dev/null
+++ b/fs/ext3/ialloc.c
@@ -0,0 +1,794 @@
1/*
2 * linux/fs/ext3/ialloc.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * BSD ufs-inspired inode and directory allocation by
10 * Stephen Tweedie (sct@redhat.com), 1993
11 * Big-endian to little-endian byte-swapping/bitmaps by
12 * David S. Miller (davem@caip.rutgers.edu), 1995
13 */
14
15#include <linux/time.h>
16#include <linux/fs.h>
17#include <linux/jbd.h>
18#include <linux/ext3_fs.h>
19#include <linux/ext3_jbd.h>
20#include <linux/stat.h>
21#include <linux/string.h>
22#include <linux/quotaops.h>
23#include <linux/buffer_head.h>
24#include <linux/random.h>
25#include <linux/bitops.h>
26
27#include <asm/byteorder.h>
28
29#include "xattr.h"
30#include "acl.h"
31
32/*
33 * ialloc.c contains the inodes allocation and deallocation routines
34 */
35
36/*
37 * The free inodes are managed by bitmaps. A file system contains several
38 * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap
39 * block for inodes, N blocks for the inode table and data blocks.
40 *
41 * The file system contains group descriptors which are located after the
42 * super block. Each descriptor contains the number of the bitmap block and
43 * the free blocks count in the block.
44 */
45
46
47/*
48 * Read the inode allocation bitmap for a given block_group, reading
49 * into the specified slot in the superblock's bitmap cache.
50 *
51 * Return buffer_head of bitmap on success or NULL.
52 */
53static struct buffer_head *
54read_inode_bitmap(struct super_block * sb, unsigned long block_group)
55{
56 struct ext3_group_desc *desc;
57 struct buffer_head *bh = NULL;
58
59 desc = ext3_get_group_desc(sb, block_group, NULL);
60 if (!desc)
61 goto error_out;
62
63 bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap));
64 if (!bh)
65 ext3_error(sb, "read_inode_bitmap",
66 "Cannot read inode bitmap - "
67 "block_group = %lu, inode_bitmap = %u",
68 block_group, le32_to_cpu(desc->bg_inode_bitmap));
69error_out:
70 return bh;
71}
72
73/*
74 * NOTE! When we get the inode, we're the only people
75 * that have access to it, and as such there are no
76 * race conditions we have to worry about. The inode
77 * is not on the hash-lists, and it cannot be reached
78 * through the filesystem because the directory entry
79 * has been deleted earlier.
80 *
81 * HOWEVER: we must make sure that we get no aliases,
82 * which means that we have to call "clear_inode()"
83 * _before_ we mark the inode not in use in the inode
84 * bitmaps. Otherwise a newly created file might use
85 * the same inode number (not actually the same pointer
86 * though), and then we'd have two inodes sharing the
87 * same inode number and space on the harddisk.
88 */
89void ext3_free_inode (handle_t *handle, struct inode * inode)
90{
91 struct super_block * sb = inode->i_sb;
92 int is_directory;
93 unsigned long ino;
94 struct buffer_head *bitmap_bh = NULL;
95 struct buffer_head *bh2;
96 unsigned long block_group;
97 unsigned long bit;
98 struct ext3_group_desc * gdp;
99 struct ext3_super_block * es;
100 struct ext3_sb_info *sbi;
101 int fatal = 0, err;
102
103 if (atomic_read(&inode->i_count) > 1) {
104 printk ("ext3_free_inode: inode has count=%d\n",
105 atomic_read(&inode->i_count));
106 return;
107 }
108 if (inode->i_nlink) {
109 printk ("ext3_free_inode: inode has nlink=%d\n",
110 inode->i_nlink);
111 return;
112 }
113 if (!sb) {
114 printk("ext3_free_inode: inode on nonexistent device\n");
115 return;
116 }
117 sbi = EXT3_SB(sb);
118
119 ino = inode->i_ino;
120 ext3_debug ("freeing inode %lu\n", ino);
121
122 /*
123 * Note: we must free any quota before locking the superblock,
124 * as writing the quota to disk may need the lock as well.
125 */
126 DQUOT_INIT(inode);
127 ext3_xattr_delete_inode(handle, inode);
128 DQUOT_FREE_INODE(inode);
129 DQUOT_DROP(inode);
130
131 is_directory = S_ISDIR(inode->i_mode);
132
133 /* Do this BEFORE marking the inode not in use or returning an error */
134 clear_inode (inode);
135
136 es = EXT3_SB(sb)->s_es;
137 if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
138 ext3_error (sb, "ext3_free_inode",
139 "reserved or nonexistent inode %lu", ino);
140 goto error_return;
141 }
142 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
143 bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
144 bitmap_bh = read_inode_bitmap(sb, block_group);
145 if (!bitmap_bh)
146 goto error_return;
147
148 BUFFER_TRACE(bitmap_bh, "get_write_access");
149 fatal = ext3_journal_get_write_access(handle, bitmap_bh);
150 if (fatal)
151 goto error_return;
152
153 /* Ok, now we can actually update the inode bitmaps.. */
154 if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
155 bit, bitmap_bh->b_data))
156 ext3_error (sb, "ext3_free_inode",
157 "bit already cleared for inode %lu", ino);
158 else {
159 gdp = ext3_get_group_desc (sb, block_group, &bh2);
160
161 BUFFER_TRACE(bh2, "get_write_access");
162 fatal = ext3_journal_get_write_access(handle, bh2);
163 if (fatal) goto error_return;
164
165 if (gdp) {
166 spin_lock(sb_bgl_lock(sbi, block_group));
167 gdp->bg_free_inodes_count = cpu_to_le16(
168 le16_to_cpu(gdp->bg_free_inodes_count) + 1);
169 if (is_directory)
170 gdp->bg_used_dirs_count = cpu_to_le16(
171 le16_to_cpu(gdp->bg_used_dirs_count) - 1);
172 spin_unlock(sb_bgl_lock(sbi, block_group));
173 percpu_counter_inc(&sbi->s_freeinodes_counter);
174 if (is_directory)
175 percpu_counter_dec(&sbi->s_dirs_counter);
176
177 }
178 BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
179 err = ext3_journal_dirty_metadata(handle, bh2);
180 if (!fatal) fatal = err;
181 }
182 BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata");
183 err = ext3_journal_dirty_metadata(handle, bitmap_bh);
184 if (!fatal)
185 fatal = err;
186 sb->s_dirt = 1;
187error_return:
188 brelse(bitmap_bh);
189 ext3_std_error(sb, fatal);
190}
191
192/*
193 * There are two policies for allocating an inode. If the new inode is
194 * a directory, then a forward search is made for a block group with both
195 * free space and a low directory-to-inode ratio; if that fails, then of
196 * the groups with above-average free space, that group with the fewest
197 * directories already is chosen.
198 *
199 * For other inodes, search forward from the parent directory\'s block
200 * group to find a free inode.
201 */
202static int find_group_dir(struct super_block *sb, struct inode *parent)
203{
204 int ngroups = EXT3_SB(sb)->s_groups_count;
205 int freei, avefreei;
206 struct ext3_group_desc *desc, *best_desc = NULL;
207 struct buffer_head *bh;
208 int group, best_group = -1;
209
210 freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter);
211 avefreei = freei / ngroups;
212
213 for (group = 0; group < ngroups; group++) {
214 desc = ext3_get_group_desc (sb, group, &bh);
215 if (!desc || !desc->bg_free_inodes_count)
216 continue;
217 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
218 continue;
219 if (!best_desc ||
220 (le16_to_cpu(desc->bg_free_blocks_count) >
221 le16_to_cpu(best_desc->bg_free_blocks_count))) {
222 best_group = group;
223 best_desc = desc;
224 }
225 }
226 return best_group;
227}
228
229/*
230 * Orlov's allocator for directories.
231 *
232 * We always try to spread first-level directories.
233 *
234 * If there are blockgroups with both free inodes and free blocks counts
235 * not worse than average we return one with smallest directory count.
236 * Otherwise we simply return a random group.
237 *
238 * For the rest rules look so:
239 *
240 * It's OK to put directory into a group unless
241 * it has too many directories already (max_dirs) or
242 * it has too few free inodes left (min_inodes) or
243 * it has too few free blocks left (min_blocks) or
244 * it's already running too large debt (max_debt).
245 * Parent's group is prefered, if it doesn't satisfy these
246 * conditions we search cyclically through the rest. If none
247 * of the groups look good we just look for a group with more
248 * free inodes than average (starting at parent's group).
249 *
250 * Debt is incremented each time we allocate a directory and decremented
251 * when we allocate an inode, within 0--255.
252 */
253
254#define INODE_COST 64
255#define BLOCK_COST 256
256
257static int find_group_orlov(struct super_block *sb, struct inode *parent)
258{
259 int parent_group = EXT3_I(parent)->i_block_group;
260 struct ext3_sb_info *sbi = EXT3_SB(sb);
261 struct ext3_super_block *es = sbi->s_es;
262 int ngroups = sbi->s_groups_count;
263 int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
264 int freei, avefreei;
265 int freeb, avefreeb;
266 int blocks_per_dir, ndirs;
267 int max_debt, max_dirs, min_blocks, min_inodes;
268 int group = -1, i;
269 struct ext3_group_desc *desc;
270 struct buffer_head *bh;
271
272 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
273 avefreei = freei / ngroups;
274 freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
275 avefreeb = freeb / ngroups;
276 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
277
278 if ((parent == sb->s_root->d_inode) ||
279 (EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) {
280 int best_ndir = inodes_per_group;
281 int best_group = -1;
282
283 get_random_bytes(&group, sizeof(group));
284 parent_group = (unsigned)group % ngroups;
285 for (i = 0; i < ngroups; i++) {
286 group = (parent_group + i) % ngroups;
287 desc = ext3_get_group_desc (sb, group, &bh);
288 if (!desc || !desc->bg_free_inodes_count)
289 continue;
290 if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
291 continue;
292 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
293 continue;
294 if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
295 continue;
296 best_group = group;
297 best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
298 }
299 if (best_group >= 0)
300 return best_group;
301 goto fallback;
302 }
303
304 blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - freeb) / ndirs;
305
306 max_dirs = ndirs / ngroups + inodes_per_group / 16;
307 min_inodes = avefreei - inodes_per_group / 4;
308 min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4;
309
310 max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, BLOCK_COST);
311 if (max_debt * INODE_COST > inodes_per_group)
312 max_debt = inodes_per_group / INODE_COST;
313 if (max_debt > 255)
314 max_debt = 255;
315 if (max_debt == 0)
316 max_debt = 1;
317
318 for (i = 0; i < ngroups; i++) {
319 group = (parent_group + i) % ngroups;
320 desc = ext3_get_group_desc (sb, group, &bh);
321 if (!desc || !desc->bg_free_inodes_count)
322 continue;
323 if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
324 continue;
325 if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
326 continue;
327 if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
328 continue;
329 return group;
330 }
331
332fallback:
333 for (i = 0; i < ngroups; i++) {
334 group = (parent_group + i) % ngroups;
335 desc = ext3_get_group_desc (sb, group, &bh);
336 if (!desc || !desc->bg_free_inodes_count)
337 continue;
338 if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
339 return group;
340 }
341
342 if (avefreei) {
343 /*
344 * The free-inodes counter is approximate, and for really small
345 * filesystems the above test can fail to find any blockgroups
346 */
347 avefreei = 0;
348 goto fallback;
349 }
350
351 return -1;
352}
353
354static int find_group_other(struct super_block *sb, struct inode *parent)
355{
356 int parent_group = EXT3_I(parent)->i_block_group;
357 int ngroups = EXT3_SB(sb)->s_groups_count;
358 struct ext3_group_desc *desc;
359 struct buffer_head *bh;
360 int group, i;
361
362 /*
363 * Try to place the inode in its parent directory
364 */
365 group = parent_group;
366 desc = ext3_get_group_desc (sb, group, &bh);
367 if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
368 le16_to_cpu(desc->bg_free_blocks_count))
369 return group;
370
371 /*
372 * We're going to place this inode in a different blockgroup from its
373 * parent. We want to cause files in a common directory to all land in
374 * the same blockgroup. But we want files which are in a different
375 * directory which shares a blockgroup with our parent to land in a
376 * different blockgroup.
377 *
378 * So add our directory's i_ino into the starting point for the hash.
379 */
380 group = (group + parent->i_ino) % ngroups;
381
382 /*
383 * Use a quadratic hash to find a group with a free inode and some free
384 * blocks.
385 */
386 for (i = 1; i < ngroups; i <<= 1) {
387 group += i;
388 if (group >= ngroups)
389 group -= ngroups;
390 desc = ext3_get_group_desc (sb, group, &bh);
391 if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
392 le16_to_cpu(desc->bg_free_blocks_count))
393 return group;
394 }
395
396 /*
397 * That failed: try linear search for a free inode, even if that group
398 * has no free blocks.
399 */
400 group = parent_group;
401 for (i = 0; i < ngroups; i++) {
402 if (++group >= ngroups)
403 group = 0;
404 desc = ext3_get_group_desc (sb, group, &bh);
405 if (desc && le16_to_cpu(desc->bg_free_inodes_count))
406 return group;
407 }
408
409 return -1;
410}
411
412/*
413 * There are two policies for allocating an inode. If the new inode is
414 * a directory, then a forward search is made for a block group with both
415 * free space and a low directory-to-inode ratio; if that fails, then of
416 * the groups with above-average free space, that group with the fewest
417 * directories already is chosen.
418 *
419 * For other inodes, search forward from the parent directory's block
420 * group to find a free inode.
421 */
422struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
423{
424 struct super_block *sb;
425 struct buffer_head *bitmap_bh = NULL;
426 struct buffer_head *bh2;
427 int group;
428 unsigned long ino = 0;
429 struct inode * inode;
430 struct ext3_group_desc * gdp = NULL;
431 struct ext3_super_block * es;
432 struct ext3_inode_info *ei;
433 struct ext3_sb_info *sbi;
434 int err = 0;
435 struct inode *ret;
436 int i;
437
438 /* Cannot create files in a deleted directory */
439 if (!dir || !dir->i_nlink)
440 return ERR_PTR(-EPERM);
441
442 sb = dir->i_sb;
443 inode = new_inode(sb);
444 if (!inode)
445 return ERR_PTR(-ENOMEM);
446 ei = EXT3_I(inode);
447
448 sbi = EXT3_SB(sb);
449 es = sbi->s_es;
450 if (S_ISDIR(mode)) {
451 if (test_opt (sb, OLDALLOC))
452 group = find_group_dir(sb, dir);
453 else
454 group = find_group_orlov(sb, dir);
455 } else
456 group = find_group_other(sb, dir);
457
458 err = -ENOSPC;
459 if (group == -1)
460 goto out;
461
462 for (i = 0; i < sbi->s_groups_count; i++) {
463 err = -EIO;
464
465 gdp = ext3_get_group_desc(sb, group, &bh2);
466 if (!gdp)
467 goto fail;
468
469 brelse(bitmap_bh);
470 bitmap_bh = read_inode_bitmap(sb, group);
471 if (!bitmap_bh)
472 goto fail;
473
474 ino = 0;
475
476repeat_in_this_group:
477 ino = ext3_find_next_zero_bit((unsigned long *)
478 bitmap_bh->b_data, EXT3_INODES_PER_GROUP(sb), ino);
479 if (ino < EXT3_INODES_PER_GROUP(sb)) {
480
481 BUFFER_TRACE(bitmap_bh, "get_write_access");
482 err = ext3_journal_get_write_access(handle, bitmap_bh);
483 if (err)
484 goto fail;
485
486 if (!ext3_set_bit_atomic(sb_bgl_lock(sbi, group),
487 ino, bitmap_bh->b_data)) {
488 /* we won it */
489 BUFFER_TRACE(bitmap_bh,
490 "call ext3_journal_dirty_metadata");
491 err = ext3_journal_dirty_metadata(handle,
492 bitmap_bh);
493 if (err)
494 goto fail;
495 goto got;
496 }
497 /* we lost it */
498 journal_release_buffer(handle, bitmap_bh);
499
500 if (++ino < EXT3_INODES_PER_GROUP(sb))
501 goto repeat_in_this_group;
502 }
503
504 /*
505 * This case is possible in concurrent environment. It is very
506 * rare. We cannot repeat the find_group_xxx() call because
507 * that will simply return the same blockgroup, because the
508 * group descriptor metadata has not yet been updated.
509 * So we just go onto the next blockgroup.
510 */
511 if (++group == sbi->s_groups_count)
512 group = 0;
513 }
514 err = -ENOSPC;
515 goto out;
516
517got:
518 ino += group * EXT3_INODES_PER_GROUP(sb) + 1;
519 if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
520 ext3_error (sb, "ext3_new_inode",
521 "reserved inode or inode > inodes count - "
522 "block_group = %d, inode=%lu", group, ino);
523 err = -EIO;
524 goto fail;
525 }
526
527 BUFFER_TRACE(bh2, "get_write_access");
528 err = ext3_journal_get_write_access(handle, bh2);
529 if (err) goto fail;
530 spin_lock(sb_bgl_lock(sbi, group));
531 gdp->bg_free_inodes_count =
532 cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
533 if (S_ISDIR(mode)) {
534 gdp->bg_used_dirs_count =
535 cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
536 }
537 spin_unlock(sb_bgl_lock(sbi, group));
538 BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
539 err = ext3_journal_dirty_metadata(handle, bh2);
540 if (err) goto fail;
541
542 percpu_counter_dec(&sbi->s_freeinodes_counter);
543 if (S_ISDIR(mode))
544 percpu_counter_inc(&sbi->s_dirs_counter);
545 sb->s_dirt = 1;
546
547 inode->i_uid = current->fsuid;
548 if (test_opt (sb, GRPID))
549 inode->i_gid = dir->i_gid;
550 else if (dir->i_mode & S_ISGID) {
551 inode->i_gid = dir->i_gid;
552 if (S_ISDIR(mode))
553 mode |= S_ISGID;
554 } else
555 inode->i_gid = current->fsgid;
556 inode->i_mode = mode;
557
558 inode->i_ino = ino;
559 /* This is the optimal IO size (for stat), not the fs block size */
560 inode->i_blksize = PAGE_SIZE;
561 inode->i_blocks = 0;
562 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
563
564 memset(ei->i_data, 0, sizeof(ei->i_data));
565 ei->i_dir_start_lookup = 0;
566 ei->i_disksize = 0;
567
568 ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL;
569 if (S_ISLNK(mode))
570 ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
571 /* dirsync only applies to directories */
572 if (!S_ISDIR(mode))
573 ei->i_flags &= ~EXT3_DIRSYNC_FL;
574#ifdef EXT3_FRAGMENTS
575 ei->i_faddr = 0;
576 ei->i_frag_no = 0;
577 ei->i_frag_size = 0;
578#endif
579 ei->i_file_acl = 0;
580 ei->i_dir_acl = 0;
581 ei->i_dtime = 0;
582 ei->i_block_alloc_info = NULL;
583 ei->i_block_group = group;
584
585 ext3_set_inode_flags(inode);
586 if (IS_DIRSYNC(inode))
587 handle->h_sync = 1;
588 insert_inode_hash(inode);
589 spin_lock(&sbi->s_next_gen_lock);
590 inode->i_generation = sbi->s_next_generation++;
591 spin_unlock(&sbi->s_next_gen_lock);
592
593 ei->i_state = EXT3_STATE_NEW;
594 ei->i_extra_isize =
595 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
596 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
597
598 ret = inode;
599 if(DQUOT_ALLOC_INODE(inode)) {
600 DQUOT_DROP(inode);
601 err = -EDQUOT;
602 goto fail2;
603 }
604 err = ext3_init_acl(handle, inode, dir);
605 if (err) {
606 DQUOT_FREE_INODE(inode);
607 goto fail2;
608 }
609 err = ext3_mark_inode_dirty(handle, inode);
610 if (err) {
611 ext3_std_error(sb, err);
612 DQUOT_FREE_INODE(inode);
613 goto fail2;
614 }
615
616 ext3_debug("allocating inode %lu\n", inode->i_ino);
617 goto really_out;
618fail:
619 ext3_std_error(sb, err);
620out:
621 iput(inode);
622 ret = ERR_PTR(err);
623really_out:
624 brelse(bitmap_bh);
625 return ret;
626
627fail2:
628 inode->i_flags |= S_NOQUOTA;
629 inode->i_nlink = 0;
630 iput(inode);
631 brelse(bitmap_bh);
632 return ERR_PTR(err);
633}
634
635/* Verify that we are loading a valid orphan from disk */
636struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino)
637{
638 unsigned long max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count);
639 unsigned long block_group;
640 int bit;
641 struct buffer_head *bitmap_bh = NULL;
642 struct inode *inode = NULL;
643
644 /* Error cases - e2fsck has already cleaned up for us */
645 if (ino > max_ino) {
646 ext3_warning(sb, __FUNCTION__,
647 "bad orphan ino %lu! e2fsck was run?\n", ino);
648 goto out;
649 }
650
651 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
652 bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
653 bitmap_bh = read_inode_bitmap(sb, block_group);
654 if (!bitmap_bh) {
655 ext3_warning(sb, __FUNCTION__,
656 "inode bitmap error for orphan %lu\n", ino);
657 goto out;
658 }
659
660 /* Having the inode bit set should be a 100% indicator that this
661 * is a valid orphan (no e2fsck run on fs). Orphans also include
662 * inodes that were being truncated, so we can't check i_nlink==0.
663 */
664 if (!ext3_test_bit(bit, bitmap_bh->b_data) ||
665 !(inode = iget(sb, ino)) || is_bad_inode(inode) ||
666 NEXT_ORPHAN(inode) > max_ino) {
667 ext3_warning(sb, __FUNCTION__,
668 "bad orphan inode %lu! e2fsck was run?\n", ino);
669 printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n",
670 bit, (unsigned long long)bitmap_bh->b_blocknr,
671 ext3_test_bit(bit, bitmap_bh->b_data));
672 printk(KERN_NOTICE "inode=%p\n", inode);
673 if (inode) {
674 printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
675 is_bad_inode(inode));
676 printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
677 NEXT_ORPHAN(inode));
678 printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
679 }
680 /* Avoid freeing blocks if we got a bad deleted inode */
681 if (inode && inode->i_nlink == 0)
682 inode->i_blocks = 0;
683 iput(inode);
684 inode = NULL;
685 }
686out:
687 brelse(bitmap_bh);
688 return inode;
689}
690
691unsigned long ext3_count_free_inodes (struct super_block * sb)
692{
693 unsigned long desc_count;
694 struct ext3_group_desc *gdp;
695 int i;
696#ifdef EXT3FS_DEBUG
697 struct ext3_super_block *es;
698 unsigned long bitmap_count, x;
699 struct buffer_head *bitmap_bh = NULL;
700
701 lock_super (sb);
702 es = EXT3_SB(sb)->s_es;
703 desc_count = 0;
704 bitmap_count = 0;
705 gdp = NULL;
706 for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
707 gdp = ext3_get_group_desc (sb, i, NULL);
708 if (!gdp)
709 continue;
710 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
711 brelse(bitmap_bh);
712 bitmap_bh = read_inode_bitmap(sb, i);
713 if (!bitmap_bh)
714 continue;
715
716 x = ext3_count_free(bitmap_bh, EXT3_INODES_PER_GROUP(sb) / 8);
717 printk("group %d: stored = %d, counted = %lu\n",
718 i, le16_to_cpu(gdp->bg_free_inodes_count), x);
719 bitmap_count += x;
720 }
721 brelse(bitmap_bh);
722 printk("ext3_count_free_inodes: stored = %u, computed = %lu, %lu\n",
723 le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
724 unlock_super(sb);
725 return desc_count;
726#else
727 desc_count = 0;
728 for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
729 gdp = ext3_get_group_desc (sb, i, NULL);
730 if (!gdp)
731 continue;
732 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
733 cond_resched();
734 }
735 return desc_count;
736#endif
737}
738
739/* Called at mount-time, super-block is locked */
740unsigned long ext3_count_dirs (struct super_block * sb)
741{
742 unsigned long count = 0;
743 int i;
744
745 for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
746 struct ext3_group_desc *gdp = ext3_get_group_desc (sb, i, NULL);
747 if (!gdp)
748 continue;
749 count += le16_to_cpu(gdp->bg_used_dirs_count);
750 }
751 return count;
752}
753
#ifdef CONFIG_EXT3_CHECK
/*
 * Cross-check the free-inode accounting: for every group compare the
 * count stored in its descriptor with the number of free bits in its
 * inode bitmap, then compare the bitmap total with the count stored in
 * the superblock.  Every mismatch is reported through ext3_error().
 *
 * Called at mount-time, super-block is locked.
 */
void ext3_check_inodes_bitmap (struct super_block * sb)
{
	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
	struct ext3_group_desc *gdp = NULL;
	struct buffer_head *bitmap_bh = NULL;
	unsigned long desc_count = 0;
	unsigned long bitmap_count = 0;
	unsigned long counted;
	int group;

	for (group = 0; group < EXT3_SB(sb)->s_groups_count; group++) {
		gdp = ext3_get_group_desc (sb, group, NULL);
		if (!gdp)
			continue;
		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);

		/* swap the previous group's bitmap for this one */
		brelse(bitmap_bh);
		bitmap_bh = read_inode_bitmap(sb, group);
		if (!bitmap_bh)
			continue;

		counted = ext3_count_free(bitmap_bh,
					  EXT3_INODES_PER_GROUP(sb) / 8);
		if (le16_to_cpu(gdp->bg_free_inodes_count) != counted)
			ext3_error (sb, "ext3_check_inodes_bitmap",
				    "Wrong free inodes count in group %d, "
				    "stored = %d, counted = %lu", group,
				    le16_to_cpu(gdp->bg_free_inodes_count),
				    counted);
		bitmap_count += counted;
	}
	brelse(bitmap_bh);

	if (le32_to_cpu(es->s_free_inodes_count) != bitmap_count)
		ext3_error (sb, "ext3_check_inodes_bitmap",
			    "Wrong free inodes count in super block, "
			    "stored = %lu, counted = %lu",
			    (unsigned long)le32_to_cpu(es->s_free_inodes_count),
			    bitmap_count);
}
#endif
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
new file mode 100644
index 000000000000..040eb288bb1c
--- /dev/null
+++ b/fs/ext3/inode.c
@@ -0,0 +1,3132 @@
1/*
2 * linux/fs/ext3/inode.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/inode.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Goal-directed block allocation by Stephen Tweedie
16 * (sct@redhat.com), 1993, 1998
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 64-bit file support on 64-bit platforms by Jakub Jelinek
20 * (jj@sunsite.ms.mff.cuni.cz)
21 *
22 * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
23 */
24
25#include <linux/module.h>
26#include <linux/fs.h>
27#include <linux/time.h>
28#include <linux/ext3_jbd.h>
29#include <linux/jbd.h>
30#include <linux/smp_lock.h>
31#include <linux/highuid.h>
32#include <linux/pagemap.h>
33#include <linux/quotaops.h>
34#include <linux/string.h>
35#include <linux/buffer_head.h>
36#include <linux/writeback.h>
37#include <linux/mpage.h>
38#include <linux/uio.h>
39#include "xattr.h"
40#include "acl.h"
41
42static int ext3_writepage_trans_blocks(struct inode *inode);
43
44/*
45 * Test whether an inode is a fast symlink.
46 */
47static inline int ext3_inode_is_fast_symlink(struct inode *inode)
48{
49 int ea_blocks = EXT3_I(inode)->i_file_acl ?
50 (inode->i_sb->s_blocksize >> 9) : 0;
51
52 return (S_ISLNK(inode->i_mode) &&
53 inode->i_blocks - ea_blocks == 0);
54}
55
56/* The ext3 forget function must perform a revoke if we are freeing data
57 * which has been journaled. Metadata (eg. indirect blocks) must be
58 * revoked in all cases.
59 *
60 * "bh" may be NULL: a metadata block may have been freed from memory
61 * but there may still be a record of it in the journal, and that record
62 * still needs to be revoked.
63 */
64
65int ext3_forget(handle_t *handle, int is_metadata,
66 struct inode *inode, struct buffer_head *bh,
67 int blocknr)
68{
69 int err;
70
71 might_sleep();
72
73 BUFFER_TRACE(bh, "enter");
74
75 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
76 "data mode %lx\n",
77 bh, is_metadata, inode->i_mode,
78 test_opt(inode->i_sb, DATA_FLAGS));
79
80 /* Never use the revoke function if we are doing full data
81 * journaling: there is no need to, and a V1 superblock won't
82 * support it. Otherwise, only skip the revoke on un-journaled
83 * data blocks. */
84
85 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
86 (!is_metadata && !ext3_should_journal_data(inode))) {
87 if (bh) {
88 BUFFER_TRACE(bh, "call journal_forget");
89 return ext3_journal_forget(handle, bh);
90 }
91 return 0;
92 }
93
94 /*
95 * data!=journal && (is_metadata || should_journal_data(inode))
96 */
97 BUFFER_TRACE(bh, "call ext3_journal_revoke");
98 err = ext3_journal_revoke(handle, blocknr, bh);
99 if (err)
100 ext3_abort(inode->i_sb, __FUNCTION__,
101 "error %d when attempting revoke", err);
102 BUFFER_TRACE(bh, "exit");
103 return err;
104}
105
106/*
107 * Work out how many blocks we need to progress with the next chunk of a
108 * truncate transaction.
109 */
110
111static unsigned long blocks_for_truncate(struct inode *inode)
112{
113 unsigned long needed;
114
115 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
116
117 /* Give ourselves just enough room to cope with inodes in which
118 * i_blocks is corrupt: we've seen disk corruptions in the past
119 * which resulted in random data in an inode which looked enough
120 * like a regular file for ext3 to try to delete it. Things
121 * will go a bit crazy if that happens, but at least we should
122 * try not to panic the whole kernel. */
123 if (needed < 2)
124 needed = 2;
125
126 /* But we need to bound the transaction so we don't overflow the
127 * journal. */
128 if (needed > EXT3_MAX_TRANS_DATA)
129 needed = EXT3_MAX_TRANS_DATA;
130
131 return EXT3_DATA_TRANS_BLOCKS + needed;
132}
133
134/*
135 * Truncate transactions can be complex and absolutely huge. So we need to
136 * be able to restart the transaction at a conventient checkpoint to make
137 * sure we don't overflow the journal.
138 *
139 * start_transaction gets us a new handle for a truncate transaction,
140 * and extend_transaction tries to extend the existing one a bit. If
141 * extend fails, we need to propagate the failure up and restart the
142 * transaction in the top-level truncate loop. --sct
143 */
144
145static handle_t *start_transaction(struct inode *inode)
146{
147 handle_t *result;
148
149 result = ext3_journal_start(inode, blocks_for_truncate(inode));
150 if (!IS_ERR(result))
151 return result;
152
153 ext3_std_error(inode->i_sb, PTR_ERR(result));
154 return result;
155}
156
157/*
158 * Try to extend this transaction for the purposes of truncation.
159 *
160 * Returns 0 if we managed to create more room. If we can't create more
161 * room, and the transaction must be restarted we return 1.
162 */
163static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
164{
165 if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
166 return 0;
167 if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
168 return 0;
169 return 1;
170}
171
172/*
173 * Restart the transaction associated with *handle. This does a commit,
174 * so before we call here everything must be consistently dirtied against
175 * this transaction.
176 */
177static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
178{
179 jbd_debug(2, "restarting handle %p\n", handle);
180 return ext3_journal_restart(handle, blocks_for_truncate(inode));
181}
182
183/*
184 * Called at the last iput() if i_nlink is zero.
185 */
186void ext3_delete_inode (struct inode * inode)
187{
188 handle_t *handle;
189
190 if (is_bad_inode(inode))
191 goto no_delete;
192
193 handle = start_transaction(inode);
194 if (IS_ERR(handle)) {
195 /* If we're going to skip the normal cleanup, we still
196 * need to make sure that the in-core orphan linked list
197 * is properly cleaned up. */
198 ext3_orphan_del(NULL, inode);
199 goto no_delete;
200 }
201
202 if (IS_SYNC(inode))
203 handle->h_sync = 1;
204 inode->i_size = 0;
205 if (inode->i_blocks)
206 ext3_truncate(inode);
207 /*
208 * Kill off the orphan record which ext3_truncate created.
209 * AKPM: I think this can be inside the above `if'.
210 * Note that ext3_orphan_del() has to be able to cope with the
211 * deletion of a non-existent orphan - this is because we don't
212 * know if ext3_truncate() actually created an orphan record.
213 * (Well, we could do this if we need to, but heck - it works)
214 */
215 ext3_orphan_del(handle, inode);
216 EXT3_I(inode)->i_dtime = get_seconds();
217
218 /*
219 * One subtle ordering requirement: if anything has gone wrong
220 * (transaction abort, IO errors, whatever), then we can still
221 * do these next steps (the fs will already have been marked as
222 * having errors), but we can't free the inode if the mark_dirty
223 * fails.
224 */
225 if (ext3_mark_inode_dirty(handle, inode))
226 /* If that failed, just do the required in-core inode clear. */
227 clear_inode(inode);
228 else
229 ext3_free_inode(handle, inode);
230 ext3_journal_stop(handle);
231 return;
232no_delete:
233 clear_inode(inode); /* We must guarantee clearing of inode... */
234}
235
236static int ext3_alloc_block (handle_t *handle,
237 struct inode * inode, unsigned long goal, int *err)
238{
239 unsigned long result;
240
241 result = ext3_new_block(handle, inode, goal, err);
242 return result;
243}
244
245
/*
 * One step of a block-lookup chain: a block number together with the
 * place it was read from, so the chain can later be re-validated
 * (see verify_chain()).
 */
typedef struct {
	__le32 *p;		/* address the block number was loaded from */
	__le32 key;		/* value found at *p when the step was recorded */
	struct buffer_head *bh;	/* buffer containing p, or NULL when p is in the inode */
} Indirect;
251
252static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
253{
254 p->key = *(p->p = v);
255 p->bh = bh;
256}
257
258static inline int verify_chain(Indirect *from, Indirect *to)
259{
260 while (from <= to && from->key == *from->p)
261 from++;
262 return (from > to);
263}
264
265/**
266 * ext3_block_to_path - parse the block number into array of offsets
267 * @inode: inode in question (we are only interested in its superblock)
268 * @i_block: block number to be parsed
269 * @offsets: array to store the offsets in
270 * @boundary: set this non-zero if the referred-to block is likely to be
271 * followed (on disk) by an indirect block.
272 *
273 * To store the locations of file's data ext3 uses a data structure common
274 * for UNIX filesystems - tree of pointers anchored in the inode, with
275 * data blocks at leaves and indirect blocks in intermediate nodes.
276 * This function translates the block number into path in that tree -
277 * return value is the path length and @offsets[n] is the offset of
278 * pointer to (n+1)th node in the nth one. If @block is out of range
279 * (negative or too large) warning is printed and zero returned.
280 *
281 * Note: function doesn't find node addresses, so no IO is needed. All
282 * we need to know is the capacity of indirect blocks (taken from the
283 * inode->i_sb).
284 */
285
286/*
287 * Portability note: the last comparison (check that we fit into triple
288 * indirect block) is spelled differently, because otherwise on an
289 * architecture with 32-bit longs and 8Kb pages we might get into trouble
290 * if our filesystem had 8Kb blocks. We might use long long, but that would
291 * kill us on x86. Oh, well, at least the sign propagation does not matter -
292 * i_block would have to be negative in the very beginning, so we would not
293 * get there at all.
294 */
295
296static int ext3_block_to_path(struct inode *inode,
297 long i_block, int offsets[4], int *boundary)
298{
299 int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
300 int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
301 const long direct_blocks = EXT3_NDIR_BLOCKS,
302 indirect_blocks = ptrs,
303 double_blocks = (1 << (ptrs_bits * 2));
304 int n = 0;
305 int final = 0;
306
307 if (i_block < 0) {
308 ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
309 } else if (i_block < direct_blocks) {
310 offsets[n++] = i_block;
311 final = direct_blocks;
312 } else if ( (i_block -= direct_blocks) < indirect_blocks) {
313 offsets[n++] = EXT3_IND_BLOCK;
314 offsets[n++] = i_block;
315 final = ptrs;
316 } else if ((i_block -= indirect_blocks) < double_blocks) {
317 offsets[n++] = EXT3_DIND_BLOCK;
318 offsets[n++] = i_block >> ptrs_bits;
319 offsets[n++] = i_block & (ptrs - 1);
320 final = ptrs;
321 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
322 offsets[n++] = EXT3_TIND_BLOCK;
323 offsets[n++] = i_block >> (ptrs_bits * 2);
324 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
325 offsets[n++] = i_block & (ptrs - 1);
326 final = ptrs;
327 } else {
328 ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
329 }
330 if (boundary)
331 *boundary = (i_block & (ptrs - 1)) == (final - 1);
332 return n;
333}
334
335/**
336 * ext3_get_branch - read the chain of indirect blocks leading to data
337 * @inode: inode in question
338 * @depth: depth of the chain (1 - direct pointer, etc.)
339 * @offsets: offsets of pointers in inode/indirect blocks
340 * @chain: place to store the result
341 * @err: here we store the error value
342 *
343 * Function fills the array of triples <key, p, bh> and returns %NULL
344 * if everything went OK or the pointer to the last filled triple
345 * (incomplete one) otherwise. Upon the return chain[i].key contains
346 * the number of (i+1)-th block in the chain (as it is stored in memory,
347 * i.e. little-endian 32-bit), chain[i].p contains the address of that
348 * number (it points into struct inode for i==0 and into the bh->b_data
349 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
350 * block for i>0 and NULL for i==0. In other words, it holds the block
351 * numbers of the chain, addresses they were taken from (and where we can
352 * verify that chain did not change) and buffer_heads hosting these
353 * numbers.
354 *
355 * Function stops when it stumbles upon zero pointer (absent block)
356 * (pointer to last triple returned, *@err == 0)
357 * or when it gets an IO error reading an indirect block
358 * (ditto, *@err == -EIO)
359 * or when it notices that chain had been changed while it was reading
360 * (ditto, *@err == -EAGAIN)
361 * or when it reads all @depth-1 indirect blocks successfully and finds
362 * the whole chain, all way to the data (returns %NULL, *err == 0).
363 */
364static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
365 Indirect chain[4], int *err)
366{
367 struct super_block *sb = inode->i_sb;
368 Indirect *p = chain;
369 struct buffer_head *bh;
370
371 *err = 0;
372 /* i_data is not going away, no lock needed */
373 add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
374 if (!p->key)
375 goto no_block;
376 while (--depth) {
377 bh = sb_bread(sb, le32_to_cpu(p->key));
378 if (!bh)
379 goto failure;
380 /* Reader: pointers */
381 if (!verify_chain(chain, p))
382 goto changed;
383 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
384 /* Reader: end */
385 if (!p->key)
386 goto no_block;
387 }
388 return NULL;
389
390changed:
391 brelse(bh);
392 *err = -EAGAIN;
393 goto no_block;
394failure:
395 *err = -EIO;
396no_block:
397 return p;
398}
399
400/**
401 * ext3_find_near - find a place for allocation with sufficient locality
402 * @inode: owner
403 * @ind: descriptor of indirect block.
404 *
405 * This function returns the prefered place for block allocation.
406 * It is used when heuristic for sequential allocation fails.
407 * Rules are:
408 * + if there is a block to the left of our position - allocate near it.
409 * + if pointer will live in indirect block - allocate near that block.
410 * + if pointer will live in inode - allocate in the same
411 * cylinder group.
412 *
413 * In the latter case we colour the starting block by the callers PID to
414 * prevent it from clashing with concurrent allocations for a different inode
415 * in the same block group. The PID is used here so that functionally related
416 * files will be close-by on-disk.
417 *
418 * Caller must make sure that @ind is valid and will stay that way.
419 */
420
421static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
422{
423 struct ext3_inode_info *ei = EXT3_I(inode);
424 __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
425 __le32 *p;
426 unsigned long bg_start;
427 unsigned long colour;
428
429 /* Try to find previous block */
430 for (p = ind->p - 1; p >= start; p--)
431 if (*p)
432 return le32_to_cpu(*p);
433
434 /* No such thing, so let's try location of indirect block */
435 if (ind->bh)
436 return ind->bh->b_blocknr;
437
438 /*
439 * It is going to be refered from inode itself? OK, just put it into
440 * the same cylinder group then.
441 */
442 bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
443 le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
444 colour = (current->pid % 16) *
445 (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
446 return bg_start + colour;
447}
448
449/**
450 * ext3_find_goal - find a prefered place for allocation.
451 * @inode: owner
452 * @block: block we want
453 * @chain: chain of indirect blocks
454 * @partial: pointer to the last triple within a chain
455 * @goal: place to store the result.
456 *
457 * Normally this function find the prefered place for block allocation,
458 * stores it in *@goal and returns zero. If the branch had been changed
459 * under us we return -EAGAIN.
460 */
461
462static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4],
463 Indirect *partial, unsigned long *goal)
464{
465 struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info;
466
467 /*
468 * try the heuristic for sequential allocation,
469 * failing that at least try to get decent locality.
470 */
471 if (block_i && (block == block_i->last_alloc_logical_block + 1)
472 && (block_i->last_alloc_physical_block != 0)) {
473 *goal = block_i->last_alloc_physical_block + 1;
474 return 0;
475 }
476
477 if (verify_chain(chain, partial)) {
478 *goal = ext3_find_near(inode, partial);
479 return 0;
480 }
481 return -EAGAIN;
482}
483
484/**
485 * ext3_alloc_branch - allocate and set up a chain of blocks.
486 * @inode: owner
487 * @num: depth of the chain (number of blocks to allocate)
488 * @offsets: offsets (in the blocks) to store the pointers to next.
489 * @branch: place to store the chain in.
490 *
491 * This function allocates @num blocks, zeroes out all but the last one,
492 * links them into chain and (if we are synchronous) writes them to disk.
493 * In other words, it prepares a branch that can be spliced onto the
494 * inode. It stores the information about that chain in the branch[], in
495 * the same format as ext3_get_branch() would do. We are calling it after
496 * we had read the existing part of chain and partial points to the last
497 * triple of that (one with zero ->key). Upon the exit we have the same
498 * picture as after the successful ext3_get_block(), excpet that in one
499 * place chain is disconnected - *branch->p is still zero (we did not
500 * set the last link), but branch->key contains the number that should
501 * be placed into *branch->p to fill that gap.
502 *
503 * If allocation fails we free all blocks we've allocated (and forget
504 * their buffer_heads) and return the error value the from failed
505 * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
506 * as described above and return 0.
507 */
508
509static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
510 int num,
511 unsigned long goal,
512 int *offsets,
513 Indirect *branch)
514{
515 int blocksize = inode->i_sb->s_blocksize;
516 int n = 0, keys = 0;
517 int err = 0;
518 int i;
519 int parent = ext3_alloc_block(handle, inode, goal, &err);
520
521 branch[0].key = cpu_to_le32(parent);
522 if (parent) {
523 for (n = 1; n < num; n++) {
524 struct buffer_head *bh;
525 /* Allocate the next block */
526 int nr = ext3_alloc_block(handle, inode, parent, &err);
527 if (!nr)
528 break;
529 branch[n].key = cpu_to_le32(nr);
530 keys = n+1;
531
532 /*
533 * Get buffer_head for parent block, zero it out
534 * and set the pointer to new one, then send
535 * parent to disk.
536 */
537 bh = sb_getblk(inode->i_sb, parent);
538 branch[n].bh = bh;
539 lock_buffer(bh);
540 BUFFER_TRACE(bh, "call get_create_access");
541 err = ext3_journal_get_create_access(handle, bh);
542 if (err) {
543 unlock_buffer(bh);
544 brelse(bh);
545 break;
546 }
547
548 memset(bh->b_data, 0, blocksize);
549 branch[n].p = (__le32*) bh->b_data + offsets[n];
550 *branch[n].p = branch[n].key;
551 BUFFER_TRACE(bh, "marking uptodate");
552 set_buffer_uptodate(bh);
553 unlock_buffer(bh);
554
555 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
556 err = ext3_journal_dirty_metadata(handle, bh);
557 if (err)
558 break;
559
560 parent = nr;
561 }
562 }
563 if (n == num)
564 return 0;
565
566 /* Allocation failed, free what we already allocated */
567 for (i = 1; i < keys; i++) {
568 BUFFER_TRACE(branch[i].bh, "call journal_forget");
569 ext3_journal_forget(handle, branch[i].bh);
570 }
571 for (i = 0; i < keys; i++)
572 ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
573 return err;
574}
575
576/**
577 * ext3_splice_branch - splice the allocated branch onto inode.
578 * @inode: owner
579 * @block: (logical) number of block we are adding
580 * @chain: chain of indirect blocks (with a missing link - see
581 * ext3_alloc_branch)
582 * @where: location of missing link
583 * @num: number of blocks we are adding
584 *
585 * This function verifies that chain (up to the missing link) had not
586 * changed, fills the missing link and does all housekeeping needed in
587 * inode (->i_blocks, etc.). In case of success we end up with the full
588 * chain to new block and return 0. Otherwise (== chain had been changed)
589 * we free the new blocks (forgetting their buffer_heads, indeed) and
590 * return -EAGAIN.
591 */
592
593static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
594 Indirect chain[4], Indirect *where, int num)
595{
596 int i;
597 int err = 0;
598 struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info;
599
600 /*
601 * If we're splicing into a [td]indirect block (as opposed to the
602 * inode) then we need to get write access to the [td]indirect block
603 * before the splice.
604 */
605 if (where->bh) {
606 BUFFER_TRACE(where->bh, "get_write_access");
607 err = ext3_journal_get_write_access(handle, where->bh);
608 if (err)
609 goto err_out;
610 }
611 /* Verify that place we are splicing to is still there and vacant */
612
613 if (!verify_chain(chain, where-1) || *where->p)
614 /* Writer: end */
615 goto changed;
616
617 /* That's it */
618
619 *where->p = where->key;
620
621 /*
622 * update the most recently allocated logical & physical block
623 * in i_block_alloc_info, to assist find the proper goal block for next
624 * allocation
625 */
626 if (block_i) {
627 block_i->last_alloc_logical_block = block;
628 block_i->last_alloc_physical_block = le32_to_cpu(where[num-1].key);
629 }
630
631 /* We are done with atomic stuff, now do the rest of housekeeping */
632
633 inode->i_ctime = CURRENT_TIME_SEC;
634 ext3_mark_inode_dirty(handle, inode);
635
636 /* had we spliced it onto indirect block? */
637 if (where->bh) {
638 /*
639 * akpm: If we spliced it onto an indirect block, we haven't
640 * altered the inode. Note however that if it is being spliced
641 * onto an indirect block at the very end of the file (the
642 * file is growing) then we *will* alter the inode to reflect
643 * the new i_size. But that is not done here - it is done in
644 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
645 */
646 jbd_debug(5, "splicing indirect only\n");
647 BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
648 err = ext3_journal_dirty_metadata(handle, where->bh);
649 if (err)
650 goto err_out;
651 } else {
652 /*
653 * OK, we spliced it into the inode itself on a direct block.
654 * Inode was dirtied above.
655 */
656 jbd_debug(5, "splicing direct\n");
657 }
658 return err;
659
660changed:
661 /*
662 * AKPM: if where[i].bh isn't part of the current updating
663 * transaction then we explode nastily. Test this code path.
664 */
665 jbd_debug(1, "the chain changed: try again\n");
666 err = -EAGAIN;
667
668err_out:
669 for (i = 1; i < num; i++) {
670 BUFFER_TRACE(where[i].bh, "call journal_forget");
671 ext3_journal_forget(handle, where[i].bh);
672 }
673 /* For the normal collision cleanup case, we free up the blocks.
674 * On genuine filesystem errors we don't even think about doing
675 * that. */
676 if (err == -EAGAIN)
677 for (i = 0; i < num; i++)
678 ext3_free_blocks(handle, inode,
679 le32_to_cpu(where[i].key), 1);
680 return err;
681}
682
683/*
684 * Allocation strategy is simple: if we have to allocate something, we will
685 * have to go the whole way to leaf. So let's do it before attaching anything
686 * to tree, set linkage between the newborn blocks, write them if sync is
687 * required, recheck the path, free and repeat if check fails, otherwise
688 * set the last missing link (that will protect us from any truncate-generated
689 * removals - all blocks on the path are immune now) and possibly force the
690 * write on the parent block.
691 * That has a nice additional property: no special recovery from the failed
692 * allocations is needed - we simply release blocks and do not touch anything
693 * reachable from inode.
694 *
695 * akpm: `handle' can be NULL if create == 0.
696 *
697 * The BKL may not be held on entry here. Be sure to take it early.
698 */
699
700static int
701ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
702 struct buffer_head *bh_result, int create, int extend_disksize)
703{
704 int err = -EIO;
705 int offsets[4];
706 Indirect chain[4];
707 Indirect *partial;
708 unsigned long goal;
709 int left;
710 int boundary = 0;
711 int depth = ext3_block_to_path(inode, iblock, offsets, &boundary);
712 struct ext3_inode_info *ei = EXT3_I(inode);
713
714 J_ASSERT(handle != NULL || create == 0);
715
716 if (depth == 0)
717 goto out;
718
719reread:
720 partial = ext3_get_branch(inode, depth, offsets, chain, &err);
721
722 /* Simplest case - block found, no allocation needed */
723 if (!partial) {
724 clear_buffer_new(bh_result);
725got_it:
726 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
727 if (boundary)
728 set_buffer_boundary(bh_result);
729 /* Clean up and exit */
730 partial = chain+depth-1; /* the whole chain */
731 goto cleanup;
732 }
733
734 /* Next simple case - plain lookup or failed read of indirect block */
735 if (!create || err == -EIO) {
736cleanup:
737 while (partial > chain) {
738 BUFFER_TRACE(partial->bh, "call brelse");
739 brelse(partial->bh);
740 partial--;
741 }
742 BUFFER_TRACE(bh_result, "returned");
743out:
744 return err;
745 }
746
747 /*
748 * Indirect block might be removed by truncate while we were
749 * reading it. Handling of that case (forget what we've got and
750 * reread) is taken out of the main path.
751 */
752 if (err == -EAGAIN)
753 goto changed;
754
755 goal = 0;
756 down(&ei->truncate_sem);
757
758 /* lazy initialize the block allocation info here if necessary */
759 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) {
760 ext3_init_block_alloc_info(inode);
761 }
762
763 if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0) {
764 up(&ei->truncate_sem);
765 goto changed;
766 }
767
768 left = (chain + depth) - partial;
769
770 /*
771 * Block out ext3_truncate while we alter the tree
772 */
773 err = ext3_alloc_branch(handle, inode, left, goal,
774 offsets+(partial-chain), partial);
775
776 /* The ext3_splice_branch call will free and forget any buffers
777 * on the new chain if there is a failure, but that risks using
778 * up transaction credits, especially for bitmaps where the
779 * credits cannot be returned. Can we handle this somehow? We
780 * may need to return -EAGAIN upwards in the worst case. --sct */
781 if (!err)
782 err = ext3_splice_branch(handle, inode, iblock, chain,
783 partial, left);
784 /* i_disksize growing is protected by truncate_sem
785 * don't forget to protect it if you're about to implement
786 * concurrent ext3_get_block() -bzzz */
787 if (!err && extend_disksize && inode->i_size > ei->i_disksize)
788 ei->i_disksize = inode->i_size;
789 up(&ei->truncate_sem);
790 if (err == -EAGAIN)
791 goto changed;
792 if (err)
793 goto cleanup;
794
795 set_buffer_new(bh_result);
796 goto got_it;
797
798changed:
799 while (partial > chain) {
800 jbd_debug(1, "buffer chain changed, retrying\n");
801 BUFFER_TRACE(partial->bh, "brelsing");
802 brelse(partial->bh);
803 partial--;
804 }
805 goto reread;
806}
807
808static int ext3_get_block(struct inode *inode, sector_t iblock,
809 struct buffer_head *bh_result, int create)
810{
811 handle_t *handle = NULL;
812 int ret;
813
814 if (create) {
815 handle = ext3_journal_current_handle();
816 J_ASSERT(handle != 0);
817 }
818 ret = ext3_get_block_handle(handle, inode, iblock,
819 bh_result, create, 1);
820 return ret;
821}
822
823#define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
824
825static int
826ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
827 unsigned long max_blocks, struct buffer_head *bh_result,
828 int create)
829{
830 handle_t *handle = journal_current_handle();
831 int ret = 0;
832
833 if (!handle)
834 goto get_block; /* A read */
835
836 if (handle->h_transaction->t_state == T_LOCKED) {
837 /*
838 * Huge direct-io writes can hold off commits for long
839 * periods of time. Let this commit run.
840 */
841 ext3_journal_stop(handle);
842 handle = ext3_journal_start(inode, DIO_CREDITS);
843 if (IS_ERR(handle))
844 ret = PTR_ERR(handle);
845 goto get_block;
846 }
847
848 if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
849 /*
850 * Getting low on buffer credits...
851 */
852 ret = ext3_journal_extend(handle, DIO_CREDITS);
853 if (ret > 0) {
854 /*
855 * Couldn't extend the transaction. Start a new one.
856 */
857 ret = ext3_journal_restart(handle, DIO_CREDITS);
858 }
859 }
860
861get_block:
862 if (ret == 0)
863 ret = ext3_get_block_handle(handle, inode, iblock,
864 bh_result, create, 0);
865 bh_result->b_size = (1 << inode->i_blkbits);
866 return ret;
867}
868
869static int ext3_writepages_get_block(struct inode *inode, sector_t iblock,
870 struct buffer_head *bh, int create)
871{
872 return ext3_direct_io_get_blocks(inode, iblock, 1, bh, create);
873}
874
875/*
876 * `handle' can be NULL if create is zero
877 */
878struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
879 long block, int create, int * errp)
880{
881 struct buffer_head dummy;
882 int fatal = 0, err;
883
884 J_ASSERT(handle != NULL || create == 0);
885
886 dummy.b_state = 0;
887 dummy.b_blocknr = -1000;
888 buffer_trace_init(&dummy.b_history);
889 *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
890 if (!*errp && buffer_mapped(&dummy)) {
891 struct buffer_head *bh;
892 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
893 if (buffer_new(&dummy)) {
894 J_ASSERT(create != 0);
895 J_ASSERT(handle != 0);
896
897 /* Now that we do not always journal data, we
898 should keep in mind whether this should
899 always journal the new buffer as metadata.
900 For now, regular file writes use
901 ext3_get_block instead, so it's not a
902 problem. */
903 lock_buffer(bh);
904 BUFFER_TRACE(bh, "call get_create_access");
905 fatal = ext3_journal_get_create_access(handle, bh);
906 if (!fatal && !buffer_uptodate(bh)) {
907 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
908 set_buffer_uptodate(bh);
909 }
910 unlock_buffer(bh);
911 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
912 err = ext3_journal_dirty_metadata(handle, bh);
913 if (!fatal)
914 fatal = err;
915 } else {
916 BUFFER_TRACE(bh, "not a new buffer");
917 }
918 if (fatal) {
919 *errp = fatal;
920 brelse(bh);
921 bh = NULL;
922 }
923 return bh;
924 }
925 return NULL;
926}
927
928struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
929 int block, int create, int *err)
930{
931 struct buffer_head * bh;
932
933 bh = ext3_getblk(handle, inode, block, create, err);
934 if (!bh)
935 return bh;
936 if (buffer_uptodate(bh))
937 return bh;
938 ll_rw_block(READ, 1, &bh);
939 wait_on_buffer(bh);
940 if (buffer_uptodate(bh))
941 return bh;
942 put_bh(bh);
943 *err = -EIO;
944 return NULL;
945}
946
947static int walk_page_buffers( handle_t *handle,
948 struct buffer_head *head,
949 unsigned from,
950 unsigned to,
951 int *partial,
952 int (*fn)( handle_t *handle,
953 struct buffer_head *bh))
954{
955 struct buffer_head *bh;
956 unsigned block_start, block_end;
957 unsigned blocksize = head->b_size;
958 int err, ret = 0;
959 struct buffer_head *next;
960
961 for ( bh = head, block_start = 0;
962 ret == 0 && (bh != head || !block_start);
963 block_start = block_end, bh = next)
964 {
965 next = bh->b_this_page;
966 block_end = block_start + blocksize;
967 if (block_end <= from || block_start >= to) {
968 if (partial && !buffer_uptodate(bh))
969 *partial = 1;
970 continue;
971 }
972 err = (*fn)(handle, bh);
973 if (!ret)
974 ret = err;
975 }
976 return ret;
977}
978
979/*
980 * To preserve ordering, it is essential that the hole instantiation and
981 * the data write be encapsulated in a single transaction. We cannot
982 * close off a transaction and start a new one between the ext3_get_block()
983 * and the commit_write(). So doing the journal_start at the start of
984 * prepare_write() is the right place.
985 *
986 * Also, this function can nest inside ext3_writepage() ->
987 * block_write_full_page(). In that case, we *know* that ext3_writepage()
988 * has generated enough buffer credits to do the whole page. So we won't
989 * block on the journal in that case, which is good, because the caller may
990 * be PF_MEMALLOC.
991 *
992 * By accident, ext3 can be reentered when a transaction is open via
993 * quota file writes. If we were to commit the transaction while thus
994 * reentered, there can be a deadlock - we would be holding a quota
995 * lock, and the commit would never complete if another thread had a
996 * transaction open and was blocking on the quota lock - a ranking
997 * violation.
998 *
999 * So what we do is to rely on the fact that journal_stop/journal_start
1000 * will _not_ run commit under these circumstances because handle->h_ref
1001 * is elevated. We'll still have enough credits for the tiny quotafile
1002 * write.
1003 */
1004
1005static int do_journal_get_write_access(handle_t *handle,
1006 struct buffer_head *bh)
1007{
1008 if (!buffer_mapped(bh) || buffer_freed(bh))
1009 return 0;
1010 return ext3_journal_get_write_access(handle, bh);
1011}
1012
1013static int ext3_prepare_write(struct file *file, struct page *page,
1014 unsigned from, unsigned to)
1015{
1016 struct inode *inode = page->mapping->host;
1017 int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
1018 handle_t *handle;
1019 int retries = 0;
1020
1021retry:
1022 handle = ext3_journal_start(inode, needed_blocks);
1023 if (IS_ERR(handle)) {
1024 ret = PTR_ERR(handle);
1025 goto out;
1026 }
1027 if (test_opt(inode->i_sb, NOBH))
1028 ret = nobh_prepare_write(page, from, to, ext3_get_block);
1029 else
1030 ret = block_prepare_write(page, from, to, ext3_get_block);
1031 if (ret)
1032 goto prepare_write_failed;
1033
1034 if (ext3_should_journal_data(inode)) {
1035 ret = walk_page_buffers(handle, page_buffers(page),
1036 from, to, NULL, do_journal_get_write_access);
1037 }
1038prepare_write_failed:
1039 if (ret)
1040 ext3_journal_stop(handle);
1041 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1042 goto retry;
1043out:
1044 return ret;
1045}
1046
1047int
1048ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1049{
1050 int err = journal_dirty_data(handle, bh);
1051 if (err)
1052 ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
1053 bh, handle,err);
1054 return err;
1055}
1056
1057/* For commit_write() in data=journal mode */
1058static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
1059{
1060 if (!buffer_mapped(bh) || buffer_freed(bh))
1061 return 0;
1062 set_buffer_uptodate(bh);
1063 return ext3_journal_dirty_metadata(handle, bh);
1064}
1065
1066/*
1067 * We need to pick up the new inode size which generic_commit_write gave us
1068 * `file' can be NULL - eg, when called from page_symlink().
1069 *
1070 * ext3 never places buffers on inode->i_mapping->private_list. metadata
1071 * buffers are managed internally.
1072 */
1073
1074static int ext3_ordered_commit_write(struct file *file, struct page *page,
1075 unsigned from, unsigned to)
1076{
1077 handle_t *handle = ext3_journal_current_handle();
1078 struct inode *inode = page->mapping->host;
1079 int ret = 0, ret2;
1080
1081 ret = walk_page_buffers(handle, page_buffers(page),
1082 from, to, NULL, ext3_journal_dirty_data);
1083
1084 if (ret == 0) {
1085 /*
1086 * generic_commit_write() will run mark_inode_dirty() if i_size
1087 * changes. So let's piggyback the i_disksize mark_inode_dirty
1088 * into that.
1089 */
1090 loff_t new_i_size;
1091
1092 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1093 if (new_i_size > EXT3_I(inode)->i_disksize)
1094 EXT3_I(inode)->i_disksize = new_i_size;
1095 ret = generic_commit_write(file, page, from, to);
1096 }
1097 ret2 = ext3_journal_stop(handle);
1098 if (!ret)
1099 ret = ret2;
1100 return ret;
1101}
1102
1103static int ext3_writeback_commit_write(struct file *file, struct page *page,
1104 unsigned from, unsigned to)
1105{
1106 handle_t *handle = ext3_journal_current_handle();
1107 struct inode *inode = page->mapping->host;
1108 int ret = 0, ret2;
1109 loff_t new_i_size;
1110
1111 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1112 if (new_i_size > EXT3_I(inode)->i_disksize)
1113 EXT3_I(inode)->i_disksize = new_i_size;
1114
1115 if (test_opt(inode->i_sb, NOBH))
1116 ret = nobh_commit_write(file, page, from, to);
1117 else
1118 ret = generic_commit_write(file, page, from, to);
1119
1120 ret2 = ext3_journal_stop(handle);
1121 if (!ret)
1122 ret = ret2;
1123 return ret;
1124}
1125
1126static int ext3_journalled_commit_write(struct file *file,
1127 struct page *page, unsigned from, unsigned to)
1128{
1129 handle_t *handle = ext3_journal_current_handle();
1130 struct inode *inode = page->mapping->host;
1131 int ret = 0, ret2;
1132 int partial = 0;
1133 loff_t pos;
1134
1135 /*
1136 * Here we duplicate the generic_commit_write() functionality
1137 */
1138 pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1139
1140 ret = walk_page_buffers(handle, page_buffers(page), from,
1141 to, &partial, commit_write_fn);
1142 if (!partial)
1143 SetPageUptodate(page);
1144 if (pos > inode->i_size)
1145 i_size_write(inode, pos);
1146 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1147 if (inode->i_size > EXT3_I(inode)->i_disksize) {
1148 EXT3_I(inode)->i_disksize = inode->i_size;
1149 ret2 = ext3_mark_inode_dirty(handle, inode);
1150 if (!ret)
1151 ret = ret2;
1152 }
1153 ret2 = ext3_journal_stop(handle);
1154 if (!ret)
1155 ret = ret2;
1156 return ret;
1157}
1158
1159/*
1160 * bmap() is special. It gets used by applications such as lilo and by
1161 * the swapper to find the on-disk block of a specific piece of data.
1162 *
1163 * Naturally, this is dangerous if the block concerned is still in the
1164 * journal. If somebody makes a swapfile on an ext3 data-journaling
1165 * filesystem and enables swap, then they may get a nasty shock when the
1166 * data getting swapped to that swapfile suddenly gets overwritten by
1167 * the original zero's written out previously to the journal and
1168 * awaiting writeback in the kernel's buffer cache.
1169 *
1170 * So, if we see any bmap calls here on a modified, data-journaled file,
1171 * take extra steps to flush any blocks which might be in the cache.
1172 */
1173static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1174{
1175 struct inode *inode = mapping->host;
1176 journal_t *journal;
1177 int err;
1178
1179 if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
1180 /*
1181 * This is a REALLY heavyweight approach, but the use of
1182 * bmap on dirty files is expected to be extremely rare:
1183 * only if we run lilo or swapon on a freshly made file
1184 * do we expect this to happen.
1185 *
1186 * (bmap requires CAP_SYS_RAWIO so this does not
1187 * represent an unprivileged user DOS attack --- we'd be
1188 * in trouble if mortal users could trigger this path at
1189 * will.)
1190 *
1191 * NB. EXT3_STATE_JDATA is not set on files other than
1192 * regular files. If somebody wants to bmap a directory
1193 * or symlink and gets confused because the buffer
1194 * hasn't yet been flushed to disk, they deserve
1195 * everything they get.
1196 */
1197
1198 EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
1199 journal = EXT3_JOURNAL(inode);
1200 journal_lock_updates(journal);
1201 err = journal_flush(journal);
1202 journal_unlock_updates(journal);
1203
1204 if (err)
1205 return 0;
1206 }
1207
1208 return generic_block_bmap(mapping,block,ext3_get_block);
1209}
1210
1211static int bget_one(handle_t *handle, struct buffer_head *bh)
1212{
1213 get_bh(bh);
1214 return 0;
1215}
1216
1217static int bput_one(handle_t *handle, struct buffer_head *bh)
1218{
1219 put_bh(bh);
1220 return 0;
1221}
1222
1223static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1224{
1225 if (buffer_mapped(bh))
1226 return ext3_journal_dirty_data(handle, bh);
1227 return 0;
1228}
1229
1230/*
1231 * Note that we always start a transaction even if we're not journalling
1232 * data. This is to preserve ordering: any hole instantiation within
1233 * __block_write_full_page -> ext3_get_block() should be journalled
1234 * along with the data so we don't crash and then get metadata which
1235 * refers to old data.
1236 *
1237 * In all journalling modes block_write_full_page() will start the I/O.
1238 *
1239 * Problem:
1240 *
1241 * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1242 * ext3_writepage()
1243 *
1244 * Similar for:
1245 *
1246 * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1247 *
1248 * Same applies to ext3_get_block(). We will deadlock on various things like
1249 * lock_journal and i_truncate_sem.
1250 *
1251 * Setting PF_MEMALLOC here doesn't work - too many internal memory
1252 * allocations fail.
1253 *
1254 * 16May01: If we're reentered then journal_current_handle() will be
1255 * non-zero. We simply *return*.
1256 *
1257 * 1 July 2001: @@@ FIXME:
1258 * In journalled data mode, a data buffer may be metadata against the
1259 * current transaction. But the same file is part of a shared mapping
1260 * and someone does a writepage() on it.
1261 *
1262 * We will move the buffer onto the async_data list, but *after* it has
1263 * been dirtied. So there's a small window where we have dirty data on
1264 * BJ_Metadata.
1265 *
1266 * Note that this only applies to the last partial page in the file. The
1267 * bit which block_write_full_page() uses prepare/commit for. (That's
1268 * broken code anyway: it's wrong for msync()).
1269 *
1270 * It's a rare case: affects the final partial page, for journalled data
1271 * where the file is subject to both write() and writepage() in the same
1272 * transaction. To fix it we'll need a custom block_write_full_page().
1273 * We'll probably need that anyway for journalling writepage() output.
1274 *
1275 * We don't honour synchronous mounts for writepage(). That would be
1276 * disastrous. Any write() or metadata operation will sync the fs for
1277 * us.
1278 *
1279 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1280 * we don't need to open a transaction here.
1281 */
1282static int ext3_ordered_writepage(struct page *page,
1283 struct writeback_control *wbc)
1284{
1285 struct inode *inode = page->mapping->host;
1286 struct buffer_head *page_bufs;
1287 handle_t *handle = NULL;
1288 int ret = 0;
1289 int err;
1290
1291 J_ASSERT(PageLocked(page));
1292
1293 /*
1294 * We give up here if we're reentered, because it might be for a
1295 * different filesystem.
1296 */
1297 if (ext3_journal_current_handle())
1298 goto out_fail;
1299
1300 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1301
1302 if (IS_ERR(handle)) {
1303 ret = PTR_ERR(handle);
1304 goto out_fail;
1305 }
1306
1307 if (!page_has_buffers(page)) {
1308 create_empty_buffers(page, inode->i_sb->s_blocksize,
1309 (1 << BH_Dirty)|(1 << BH_Uptodate));
1310 }
1311 page_bufs = page_buffers(page);
1312 walk_page_buffers(handle, page_bufs, 0,
1313 PAGE_CACHE_SIZE, NULL, bget_one);
1314
1315 ret = block_write_full_page(page, ext3_get_block, wbc);
1316
1317 /*
1318 * The page can become unlocked at any point now, and
1319 * truncate can then come in and change things. So we
1320 * can't touch *page from now on. But *page_bufs is
1321 * safe due to elevated refcount.
1322 */
1323
1324 /*
1325 * And attach them to the current transaction. But only if
1326 * block_write_full_page() succeeded. Otherwise they are unmapped,
1327 * and generally junk.
1328 */
1329 if (ret == 0) {
1330 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1331 NULL, journal_dirty_data_fn);
1332 if (!ret)
1333 ret = err;
1334 }
1335 walk_page_buffers(handle, page_bufs, 0,
1336 PAGE_CACHE_SIZE, NULL, bput_one);
1337 err = ext3_journal_stop(handle);
1338 if (!ret)
1339 ret = err;
1340 return ret;
1341
1342out_fail:
1343 redirty_page_for_writepage(wbc, page);
1344 unlock_page(page);
1345 return ret;
1346}
1347
1348static int
1349ext3_writeback_writepage_helper(struct page *page,
1350 struct writeback_control *wbc)
1351{
1352 return block_write_full_page(page, ext3_get_block, wbc);
1353}
1354
1355static int
1356ext3_writeback_writepages(struct address_space *mapping,
1357 struct writeback_control *wbc)
1358{
1359 struct inode *inode = mapping->host;
1360 handle_t *handle = NULL;
1361 int err, ret = 0;
1362
1363 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
1364 return ret;
1365
1366 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1367 if (IS_ERR(handle)) {
1368 ret = PTR_ERR(handle);
1369 return ret;
1370 }
1371
1372 ret = __mpage_writepages(mapping, wbc, ext3_writepages_get_block,
1373 ext3_writeback_writepage_helper);
1374
1375 /*
1376 * Need to reaquire the handle since ext3_writepages_get_block()
1377 * can restart the handle
1378 */
1379 handle = journal_current_handle();
1380
1381 err = ext3_journal_stop(handle);
1382 if (!ret)
1383 ret = err;
1384 return ret;
1385}
1386
1387static int ext3_writeback_writepage(struct page *page,
1388 struct writeback_control *wbc)
1389{
1390 struct inode *inode = page->mapping->host;
1391 handle_t *handle = NULL;
1392 int ret = 0;
1393 int err;
1394
1395 if (ext3_journal_current_handle())
1396 goto out_fail;
1397
1398 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1399 if (IS_ERR(handle)) {
1400 ret = PTR_ERR(handle);
1401 goto out_fail;
1402 }
1403
1404 if (test_opt(inode->i_sb, NOBH))
1405 ret = nobh_writepage(page, ext3_get_block, wbc);
1406 else
1407 ret = block_write_full_page(page, ext3_get_block, wbc);
1408
1409 err = ext3_journal_stop(handle);
1410 if (!ret)
1411 ret = err;
1412 return ret;
1413
1414out_fail:
1415 redirty_page_for_writepage(wbc, page);
1416 unlock_page(page);
1417 return ret;
1418}
1419
1420static int ext3_journalled_writepage(struct page *page,
1421 struct writeback_control *wbc)
1422{
1423 struct inode *inode = page->mapping->host;
1424 handle_t *handle = NULL;
1425 int ret = 0;
1426 int err;
1427
1428 if (ext3_journal_current_handle())
1429 goto no_write;
1430
1431 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1432 if (IS_ERR(handle)) {
1433 ret = PTR_ERR(handle);
1434 goto no_write;
1435 }
1436
1437 if (!page_has_buffers(page) || PageChecked(page)) {
1438 /*
1439 * It's mmapped pagecache. Add buffers and journal it. There
1440 * doesn't seem much point in redirtying the page here.
1441 */
1442 ClearPageChecked(page);
1443 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1444 ext3_get_block);
1445 if (ret != 0)
1446 goto out_unlock;
1447 ret = walk_page_buffers(handle, page_buffers(page), 0,
1448 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1449
1450 err = walk_page_buffers(handle, page_buffers(page), 0,
1451 PAGE_CACHE_SIZE, NULL, commit_write_fn);
1452 if (ret == 0)
1453 ret = err;
1454 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1455 unlock_page(page);
1456 } else {
1457 /*
1458 * It may be a page full of checkpoint-mode buffers. We don't
1459 * really know unless we go poke around in the buffer_heads.
1460 * But block_write_full_page will do the right thing.
1461 */
1462 ret = block_write_full_page(page, ext3_get_block, wbc);
1463 }
1464 err = ext3_journal_stop(handle);
1465 if (!ret)
1466 ret = err;
1467out:
1468 return ret;
1469
1470no_write:
1471 redirty_page_for_writepage(wbc, page);
1472out_unlock:
1473 unlock_page(page);
1474 goto out;
1475}
1476
1477static int ext3_readpage(struct file *file, struct page *page)
1478{
1479 return mpage_readpage(page, ext3_get_block);
1480}
1481
1482static int
1483ext3_readpages(struct file *file, struct address_space *mapping,
1484 struct list_head *pages, unsigned nr_pages)
1485{
1486 return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
1487}
1488
1489static int ext3_invalidatepage(struct page *page, unsigned long offset)
1490{
1491 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1492
1493 /*
1494 * If it's a full truncate we just forget about the pending dirtying
1495 */
1496 if (offset == 0)
1497 ClearPageChecked(page);
1498
1499 return journal_invalidatepage(journal, page, offset);
1500}
1501
1502static int ext3_releasepage(struct page *page, int wait)
1503{
1504 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1505
1506 WARN_ON(PageChecked(page));
1507 if (!page_has_buffers(page))
1508 return 0;
1509 return journal_try_to_free_buffers(journal, page, wait);
1510}
1511
1512/*
1513 * If the O_DIRECT write will extend the file then add this inode to the
1514 * orphan list. So recovery will truncate it back to the original size
1515 * if the machine crashes during the write.
1516 *
1517 * If the O_DIRECT write is instantiating holes inside i_size and the machine
1518 * crashes then stale disk data _may_ be exposed inside the file.
1519 */
1520static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1521 const struct iovec *iov, loff_t offset,
1522 unsigned long nr_segs)
1523{
1524 struct file *file = iocb->ki_filp;
1525 struct inode *inode = file->f_mapping->host;
1526 struct ext3_inode_info *ei = EXT3_I(inode);
1527 handle_t *handle = NULL;
1528 ssize_t ret;
1529 int orphan = 0;
1530 size_t count = iov_length(iov, nr_segs);
1531
1532 if (rw == WRITE) {
1533 loff_t final_size = offset + count;
1534
1535 handle = ext3_journal_start(inode, DIO_CREDITS);
1536 if (IS_ERR(handle)) {
1537 ret = PTR_ERR(handle);
1538 goto out;
1539 }
1540 if (final_size > inode->i_size) {
1541 ret = ext3_orphan_add(handle, inode);
1542 if (ret)
1543 goto out_stop;
1544 orphan = 1;
1545 ei->i_disksize = inode->i_size;
1546 }
1547 }
1548
1549 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1550 offset, nr_segs,
1551 ext3_direct_io_get_blocks, NULL);
1552
1553 /*
1554 * Reacquire the handle: ext3_direct_io_get_block() can restart the
1555 * transaction
1556 */
1557 handle = journal_current_handle();
1558
1559out_stop:
1560 if (handle) {
1561 int err;
1562
1563 if (orphan && inode->i_nlink)
1564 ext3_orphan_del(handle, inode);
1565 if (orphan && ret > 0) {
1566 loff_t end = offset + ret;
1567 if (end > inode->i_size) {
1568 ei->i_disksize = end;
1569 i_size_write(inode, end);
1570 /*
1571 * We're going to return a positive `ret'
1572 * here due to non-zero-length I/O, so there's
1573 * no way of reporting error returns from
1574 * ext3_mark_inode_dirty() to userspace. So
1575 * ignore it.
1576 */
1577 ext3_mark_inode_dirty(handle, inode);
1578 }
1579 }
1580 err = ext3_journal_stop(handle);
1581 if (ret == 0)
1582 ret = err;
1583 }
1584out:
1585 return ret;
1586}
1587
1588/*
1589 * Pages can be marked dirty completely asynchronously from ext3's journalling
1590 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
1591 * much here because ->set_page_dirty is called under VFS locks. The page is
1592 * not necessarily locked.
1593 *
1594 * We cannot just dirty the page and leave attached buffers clean, because the
1595 * buffers' dirty state is "definitive". We cannot just set the buffers dirty
1596 * or jbddirty because all the journalling code will explode.
1597 *
1598 * So what we do is to mark the page "pending dirty" and next time writepage
1599 * is called, propagate that into the buffers appropriately.
1600 */
static int ext3_journalled_set_page_dirty(struct page *page)
{
	/* Mark the page "pending dirty"; writepage checks PageChecked. */
	SetPageChecked(page);

	/* Dirty the page itself without touching any attached buffers. */
	return __set_page_dirty_nobuffers(page);
}
1606
1607static struct address_space_operations ext3_ordered_aops = {
1608 .readpage = ext3_readpage,
1609 .readpages = ext3_readpages,
1610 .writepage = ext3_ordered_writepage,
1611 .sync_page = block_sync_page,
1612 .prepare_write = ext3_prepare_write,
1613 .commit_write = ext3_ordered_commit_write,
1614 .bmap = ext3_bmap,
1615 .invalidatepage = ext3_invalidatepage,
1616 .releasepage = ext3_releasepage,
1617 .direct_IO = ext3_direct_IO,
1618};
1619
1620static struct address_space_operations ext3_writeback_aops = {
1621 .readpage = ext3_readpage,
1622 .readpages = ext3_readpages,
1623 .writepage = ext3_writeback_writepage,
1624 .writepages = ext3_writeback_writepages,
1625 .sync_page = block_sync_page,
1626 .prepare_write = ext3_prepare_write,
1627 .commit_write = ext3_writeback_commit_write,
1628 .bmap = ext3_bmap,
1629 .invalidatepage = ext3_invalidatepage,
1630 .releasepage = ext3_releasepage,
1631 .direct_IO = ext3_direct_IO,
1632};
1633
1634static struct address_space_operations ext3_journalled_aops = {
1635 .readpage = ext3_readpage,
1636 .readpages = ext3_readpages,
1637 .writepage = ext3_journalled_writepage,
1638 .sync_page = block_sync_page,
1639 .prepare_write = ext3_prepare_write,
1640 .commit_write = ext3_journalled_commit_write,
1641 .set_page_dirty = ext3_journalled_set_page_dirty,
1642 .bmap = ext3_bmap,
1643 .invalidatepage = ext3_invalidatepage,
1644 .releasepage = ext3_releasepage,
1645};
1646
1647void ext3_set_aops(struct inode *inode)
1648{
1649 if (ext3_should_order_data(inode))
1650 inode->i_mapping->a_ops = &ext3_ordered_aops;
1651 else if (ext3_should_writeback_data(inode))
1652 inode->i_mapping->a_ops = &ext3_writeback_aops;
1653 else
1654 inode->i_mapping->a_ops = &ext3_journalled_aops;
1655}
1656
1657/*
1658 * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1659 * up to the end of the block which corresponds to `from'.
1660 * This is required during truncate. We need to physically zero the tail end
1661 * of that block so it doesn't yield old data if the file is later grown.
1662 */
1663static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1664 struct address_space *mapping, loff_t from)
1665{
1666 unsigned long index = from >> PAGE_CACHE_SHIFT;
1667 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1668 unsigned blocksize, iblock, length, pos;
1669 struct inode *inode = mapping->host;
1670 struct buffer_head *bh;
1671 int err = 0;
1672 void *kaddr;
1673
1674 blocksize = inode->i_sb->s_blocksize;
1675 length = blocksize - (offset & (blocksize - 1));
1676 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1677
1678 /*
1679 * For "nobh" option, we can only work if we don't need to
1680 * read-in the page - otherwise we create buffers to do the IO.
1681 */
1682 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH)) {
1683 if (PageUptodate(page)) {
1684 kaddr = kmap_atomic(page, KM_USER0);
1685 memset(kaddr + offset, 0, length);
1686 flush_dcache_page(page);
1687 kunmap_atomic(kaddr, KM_USER0);
1688 set_page_dirty(page);
1689 goto unlock;
1690 }
1691 }
1692
1693 if (!page_has_buffers(page))
1694 create_empty_buffers(page, blocksize, 0);
1695
1696 /* Find the buffer that contains "offset" */
1697 bh = page_buffers(page);
1698 pos = blocksize;
1699 while (offset >= pos) {
1700 bh = bh->b_this_page;
1701 iblock++;
1702 pos += blocksize;
1703 }
1704
1705 err = 0;
1706 if (buffer_freed(bh)) {
1707 BUFFER_TRACE(bh, "freed: skip");
1708 goto unlock;
1709 }
1710
1711 if (!buffer_mapped(bh)) {
1712 BUFFER_TRACE(bh, "unmapped");
1713 ext3_get_block(inode, iblock, bh, 0);
1714 /* unmapped? It's a hole - nothing to do */
1715 if (!buffer_mapped(bh)) {
1716 BUFFER_TRACE(bh, "still unmapped");
1717 goto unlock;
1718 }
1719 }
1720
1721 /* Ok, it's mapped. Make sure it's up-to-date */
1722 if (PageUptodate(page))
1723 set_buffer_uptodate(bh);
1724
1725 if (!buffer_uptodate(bh)) {
1726 err = -EIO;
1727 ll_rw_block(READ, 1, &bh);
1728 wait_on_buffer(bh);
1729 /* Uhhuh. Read error. Complain and punt. */
1730 if (!buffer_uptodate(bh))
1731 goto unlock;
1732 }
1733
1734 if (ext3_should_journal_data(inode)) {
1735 BUFFER_TRACE(bh, "get write access");
1736 err = ext3_journal_get_write_access(handle, bh);
1737 if (err)
1738 goto unlock;
1739 }
1740
1741 kaddr = kmap_atomic(page, KM_USER0);
1742 memset(kaddr + offset, 0, length);
1743 flush_dcache_page(page);
1744 kunmap_atomic(kaddr, KM_USER0);
1745
1746 BUFFER_TRACE(bh, "zeroed end of block");
1747
1748 err = 0;
1749 if (ext3_should_journal_data(inode)) {
1750 err = ext3_journal_dirty_metadata(handle, bh);
1751 } else {
1752 if (ext3_should_order_data(inode))
1753 err = ext3_journal_dirty_data(handle, bh);
1754 mark_buffer_dirty(bh);
1755 }
1756
1757unlock:
1758 unlock_page(page);
1759 page_cache_release(page);
1760 return err;
1761}
1762
1763/*
1764 * Probably it should be a library function... search for first non-zero word
1765 * or memcmp with zero_page, whatever is better for particular architecture.
1766 * Linus?
1767 */
/*
 * Return 1 if every word in the half-open range [p, q) is zero (an empty
 * range counts as all-zero), otherwise 0.
 */
static inline int all_zeroes(__le32 *p, __le32 *q)
{
	for (; p < q; p++) {
		if (*p != 0)
			return 0;
	}
	return 1;
}
1775
1776/**
1777 * ext3_find_shared - find the indirect blocks for partial truncation.
1778 * @inode: inode in question
1779 * @depth: depth of the affected branch
1780 * @offsets: offsets of pointers in that branch (see ext3_block_to_path)
1781 * @chain: place to store the pointers to partial indirect blocks
1782 * @top: place to store the (detached) top of branch
1783 *
1784 * This is a helper function used by ext3_truncate().
1785 *
1786 * When we do truncate() we may have to clean the ends of several
1787 * indirect blocks but leave the blocks themselves alive. Block is
1788 * partially truncated if some data below the new i_size is referred
1789 * from it (and it is on the path to the first completely truncated
1790 * data block, indeed). We have to free the top of that path along
1791 * with everything to the right of the path. Since no allocation
1792 * past the truncation point is possible until ext3_truncate()
1793 * finishes, we may safely do the latter, but top of branch may
1794 * require special attention - pageout below the truncation point
1795 * might try to populate it.
1796 *
1797 * We atomically detach the top of branch from the tree, store the
1798 * block number of its root in *@top, pointers to buffer_heads of
1799 * partially truncated blocks - in @chain[].bh and pointers to
1800 * their last elements that should not be removed - in
1801 * @chain[].p. Return value is the pointer to last filled element
1802 * of @chain.
1803 *
1804 * The work left to caller to do the actual freeing of subtrees:
1805 * a) free the subtree starting from *@top
1806 * b) free the subtrees whose roots are stored in
1807 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1808 * c) free the subtrees growing from the inode past the @chain[0].
1809 * (no partially truncated stuff there). */
1810
1811static Indirect *ext3_find_shared(struct inode *inode,
1812 int depth,
1813 int offsets[4],
1814 Indirect chain[4],
1815 __le32 *top)
1816{
1817 Indirect *partial, *p;
1818 int k, err;
1819
1820 *top = 0;
1821 /* Make k index the deepest non-null offest + 1 */
1822 for (k = depth; k > 1 && !offsets[k-1]; k--)
1823 ;
1824 partial = ext3_get_branch(inode, k, offsets, chain, &err);
1825 /* Writer: pointers */
1826 if (!partial)
1827 partial = chain + k-1;
1828 /*
1829 * If the branch acquired continuation since we've looked at it -
1830 * fine, it should all survive and (new) top doesn't belong to us.
1831 */
1832 if (!partial->key && *partial->p)
1833 /* Writer: end */
1834 goto no_top;
1835 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
1836 ;
1837 /*
1838 * OK, we've found the last block that must survive. The rest of our
1839 * branch should be detached before unlocking. However, if that rest
1840 * of branch is all ours and does not grow immediately from the inode
1841 * it's easier to cheat and just decrement partial->p.
1842 */
1843 if (p == chain + k - 1 && p > chain) {
1844 p->p--;
1845 } else {
1846 *top = *p->p;
1847 /* Nope, don't do this in ext3. Must leave the tree intact */
1848#if 0
1849 *p->p = 0;
1850#endif
1851 }
1852 /* Writer: end */
1853
1854 while(partial > p)
1855 {
1856 brelse(partial->bh);
1857 partial--;
1858 }
1859no_top:
1860 return partial;
1861}
1862
1863/*
1864 * Zero a number of block pointers in either an inode or an indirect block.
1865 * If we restart the transaction we must again get write access to the
1866 * indirect block for further modification.
1867 *
1868 * We release `count' blocks on disk, but (last - first) may be greater
1869 * than `count' because there can be holes in there.
1870 */
1871static void
1872ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
1873 unsigned long block_to_free, unsigned long count,
1874 __le32 *first, __le32 *last)
1875{
1876 __le32 *p;
1877 if (try_to_extend_transaction(handle, inode)) {
1878 if (bh) {
1879 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1880 ext3_journal_dirty_metadata(handle, bh);
1881 }
1882 ext3_mark_inode_dirty(handle, inode);
1883 ext3_journal_test_restart(handle, inode);
1884 if (bh) {
1885 BUFFER_TRACE(bh, "retaking write access");
1886 ext3_journal_get_write_access(handle, bh);
1887 }
1888 }
1889
1890 /*
1891 * Any buffers which are on the journal will be in memory. We find
1892 * them on the hash table so journal_revoke() will run journal_forget()
1893 * on them. We've already detached each block from the file, so
1894 * bforget() in journal_forget() should be safe.
1895 *
1896 * AKPM: turn on bforget in journal_forget()!!!
1897 */
1898 for (p = first; p < last; p++) {
1899 u32 nr = le32_to_cpu(*p);
1900 if (nr) {
1901 struct buffer_head *bh;
1902
1903 *p = 0;
1904 bh = sb_find_get_block(inode->i_sb, nr);
1905 ext3_forget(handle, 0, inode, bh, nr);
1906 }
1907 }
1908
1909 ext3_free_blocks(handle, inode, block_to_free, count);
1910}
1911
1912/**
1913 * ext3_free_data - free a list of data blocks
1914 * @handle: handle for this transaction
1915 * @inode: inode we are dealing with
1916 * @this_bh: indirect buffer_head which contains *@first and *@last
1917 * @first: array of block numbers
1918 * @last: points immediately past the end of array
1919 *
1920 * We are freeing all blocks referred from that array (numbers are stored as
1921 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
1922 *
1923 * We accumulate contiguous runs of blocks to free. Conveniently, if these
1924 * blocks are contiguous then releasing them at one time will only affect one
1925 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
1926 * actually use a lot of journal space.
1927 *
1928 * @this_bh will be %NULL if @first and @last point into the inode's direct
1929 * block pointers.
1930 */
1931static void ext3_free_data(handle_t *handle, struct inode *inode,
1932 struct buffer_head *this_bh,
1933 __le32 *first, __le32 *last)
1934{
1935 unsigned long block_to_free = 0; /* Starting block # of a run */
1936 unsigned long count = 0; /* Number of blocks in the run */
1937 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
1938 corresponding to
1939 block_to_free */
1940 unsigned long nr; /* Current block # */
1941 __le32 *p; /* Pointer into inode/ind
1942 for current block */
1943 int err;
1944
1945 if (this_bh) { /* For indirect block */
1946 BUFFER_TRACE(this_bh, "get_write_access");
1947 err = ext3_journal_get_write_access(handle, this_bh);
1948 /* Important: if we can't update the indirect pointers
1949 * to the blocks, we can't free them. */
1950 if (err)
1951 return;
1952 }
1953
1954 for (p = first; p < last; p++) {
1955 nr = le32_to_cpu(*p);
1956 if (nr) {
1957 /* accumulate blocks to free if they're contiguous */
1958 if (count == 0) {
1959 block_to_free = nr;
1960 block_to_free_p = p;
1961 count = 1;
1962 } else if (nr == block_to_free + count) {
1963 count++;
1964 } else {
1965 ext3_clear_blocks(handle, inode, this_bh,
1966 block_to_free,
1967 count, block_to_free_p, p);
1968 block_to_free = nr;
1969 block_to_free_p = p;
1970 count = 1;
1971 }
1972 }
1973 }
1974
1975 if (count > 0)
1976 ext3_clear_blocks(handle, inode, this_bh, block_to_free,
1977 count, block_to_free_p, p);
1978
1979 if (this_bh) {
1980 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
1981 ext3_journal_dirty_metadata(handle, this_bh);
1982 }
1983}
1984
1985/**
1986 * ext3_free_branches - free an array of branches
1987 * @handle: JBD handle for this transaction
1988 * @inode: inode we are dealing with
1989 * @parent_bh: the buffer_head which contains *@first and *@last
1990 * @first: array of block numbers
1991 * @last: pointer immediately past the end of array
1992 * @depth: depth of the branches to free
1993 *
1994 * We are freeing all blocks referred from these branches (numbers are
1995 * stored as little-endian 32-bit) and updating @inode->i_blocks
1996 * appropriately.
1997 */
1998static void ext3_free_branches(handle_t *handle, struct inode *inode,
1999 struct buffer_head *parent_bh,
2000 __le32 *first, __le32 *last, int depth)
2001{
2002 unsigned long nr;
2003 __le32 *p;
2004
2005 if (is_handle_aborted(handle))
2006 return;
2007
2008 if (depth--) {
2009 struct buffer_head *bh;
2010 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2011 p = last;
2012 while (--p >= first) {
2013 nr = le32_to_cpu(*p);
2014 if (!nr)
2015 continue; /* A hole */
2016
2017 /* Go read the buffer for the next level down */
2018 bh = sb_bread(inode->i_sb, nr);
2019
2020 /*
2021 * A read failure? Report error and clear slot
2022 * (should be rare).
2023 */
2024 if (!bh) {
2025 ext3_error(inode->i_sb, "ext3_free_branches",
2026 "Read failure, inode=%ld, block=%ld",
2027 inode->i_ino, nr);
2028 continue;
2029 }
2030
2031 /* This zaps the entire block. Bottom up. */
2032 BUFFER_TRACE(bh, "free child branches");
2033 ext3_free_branches(handle, inode, bh,
2034 (__le32*)bh->b_data,
2035 (__le32*)bh->b_data + addr_per_block,
2036 depth);
2037
2038 /*
2039 * We've probably journalled the indirect block several
2040 * times during the truncate. But it's no longer
2041 * needed and we now drop it from the transaction via
2042 * journal_revoke().
2043 *
2044 * That's easy if it's exclusively part of this
2045 * transaction. But if it's part of the committing
2046 * transaction then journal_forget() will simply
2047 * brelse() it. That means that if the underlying
2048 * block is reallocated in ext3_get_block(),
2049 * unmap_underlying_metadata() will find this block
2050 * and will try to get rid of it. damn, damn.
2051 *
2052 * If this block has already been committed to the
2053 * journal, a revoke record will be written. And
2054 * revoke records must be emitted *before* clearing
2055 * this block's bit in the bitmaps.
2056 */
2057 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2058
2059 /*
2060 * Everything below this this pointer has been
2061 * released. Now let this top-of-subtree go.
2062 *
2063 * We want the freeing of this indirect block to be
2064 * atomic in the journal with the updating of the
2065 * bitmap block which owns it. So make some room in
2066 * the journal.
2067 *
2068 * We zero the parent pointer *after* freeing its
2069 * pointee in the bitmaps, so if extend_transaction()
2070 * for some reason fails to put the bitmap changes and
2071 * the release into the same transaction, recovery
2072 * will merely complain about releasing a free block,
2073 * rather than leaking blocks.
2074 */
2075 if (is_handle_aborted(handle))
2076 return;
2077 if (try_to_extend_transaction(handle, inode)) {
2078 ext3_mark_inode_dirty(handle, inode);
2079 ext3_journal_test_restart(handle, inode);
2080 }
2081
2082 ext3_free_blocks(handle, inode, nr, 1);
2083
2084 if (parent_bh) {
2085 /*
2086 * The block which we have just freed is
2087 * pointed to by an indirect block: journal it
2088 */
2089 BUFFER_TRACE(parent_bh, "get_write_access");
2090 if (!ext3_journal_get_write_access(handle,
2091 parent_bh)){
2092 *p = 0;
2093 BUFFER_TRACE(parent_bh,
2094 "call ext3_journal_dirty_metadata");
2095 ext3_journal_dirty_metadata(handle,
2096 parent_bh);
2097 }
2098 }
2099 }
2100 } else {
2101 /* We have reached the bottom of the tree. */
2102 BUFFER_TRACE(parent_bh, "free data blocks");
2103 ext3_free_data(handle, inode, parent_bh, first, last);
2104 }
2105}
2106
2107/*
2108 * ext3_truncate()
2109 *
2110 * We block out ext3_get_block() block instantiations across the entire
2111 * transaction, and VFS/VM ensures that ext3_truncate() cannot run
2112 * simultaneously on behalf of the same inode.
2113 *
2114 * As we work through the truncate and commit bits of it to the journal there
2115 * is one core, guiding principle: the file's tree must always be consistent on
2116 * disk. We must be able to restart the truncate after a crash.
2117 *
2118 * The file's tree may be transiently inconsistent in memory (although it
2119 * probably isn't), but whenever we close off and commit a journal transaction,
2120 * the contents of (the filesystem + the journal) must be consistent and
2121 * restartable. It's pretty simple, really: bottom up, right to left (although
2122 * left-to-right works OK too).
2123 *
2124 * Note that at recovery time, journal replay occurs *before* the restart of
2125 * truncate against the orphan inode list.
2126 *
2127 * The committed inode has the new, desired i_size (which is the same as
2128 * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see
2129 * that this inode's truncate did not complete and it will again call
2130 * ext3_truncate() to have another go. So there will be instantiated blocks
2131 * to the right of the truncation point in a crashed ext3 filesystem. But
2132 * that's fine - as long as they are linked from the inode, the post-crash
2133 * ext3_truncate() run will find them and release them.
2134 */
2135
2136void ext3_truncate(struct inode * inode)
2137{
2138 handle_t *handle;
2139 struct ext3_inode_info *ei = EXT3_I(inode);
2140 __le32 *i_data = ei->i_data;
2141 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2142 struct address_space *mapping = inode->i_mapping;
2143 int offsets[4];
2144 Indirect chain[4];
2145 Indirect *partial;
2146 __le32 nr = 0;
2147 int n;
2148 long last_block;
2149 unsigned blocksize = inode->i_sb->s_blocksize;
2150 struct page *page;
2151
2152 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2153 S_ISLNK(inode->i_mode)))
2154 return;
2155 if (ext3_inode_is_fast_symlink(inode))
2156 return;
2157 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2158 return;
2159
2160 /*
2161 * We have to lock the EOF page here, because lock_page() nests
2162 * outside journal_start().
2163 */
2164 if ((inode->i_size & (blocksize - 1)) == 0) {
2165 /* Block boundary? Nothing to do */
2166 page = NULL;
2167 } else {
2168 page = grab_cache_page(mapping,
2169 inode->i_size >> PAGE_CACHE_SHIFT);
2170 if (!page)
2171 return;
2172 }
2173
2174 handle = start_transaction(inode);
2175 if (IS_ERR(handle)) {
2176 if (page) {
2177 clear_highpage(page);
2178 flush_dcache_page(page);
2179 unlock_page(page);
2180 page_cache_release(page);
2181 }
2182 return; /* AKPM: return what? */
2183 }
2184
2185 last_block = (inode->i_size + blocksize-1)
2186 >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
2187
2188 if (page)
2189 ext3_block_truncate_page(handle, page, mapping, inode->i_size);
2190
2191 n = ext3_block_to_path(inode, last_block, offsets, NULL);
2192 if (n == 0)
2193 goto out_stop; /* error */
2194
2195 /*
2196 * OK. This truncate is going to happen. We add the inode to the
2197 * orphan list, so that if this truncate spans multiple transactions,
2198 * and we crash, we will resume the truncate when the filesystem
2199 * recovers. It also marks the inode dirty, to catch the new size.
2200 *
2201 * Implication: the file must always be in a sane, consistent
2202 * truncatable state while each transaction commits.
2203 */
2204 if (ext3_orphan_add(handle, inode))
2205 goto out_stop;
2206
2207 /*
2208 * The orphan list entry will now protect us from any crash which
2209 * occurs before the truncate completes, so it is now safe to propagate
2210 * the new, shorter inode size (held for now in i_size) into the
2211 * on-disk inode. We do this via i_disksize, which is the value which
2212 * ext3 *really* writes onto the disk inode.
2213 */
2214 ei->i_disksize = inode->i_size;
2215
2216 /*
2217 * From here we block out all ext3_get_block() callers who want to
2218 * modify the block allocation tree.
2219 */
2220 down(&ei->truncate_sem);
2221
2222 if (n == 1) { /* direct blocks */
2223 ext3_free_data(handle, inode, NULL, i_data+offsets[0],
2224 i_data + EXT3_NDIR_BLOCKS);
2225 goto do_indirects;
2226 }
2227
2228 partial = ext3_find_shared(inode, n, offsets, chain, &nr);
2229 /* Kill the top of shared branch (not detached) */
2230 if (nr) {
2231 if (partial == chain) {
2232 /* Shared branch grows from the inode */
2233 ext3_free_branches(handle, inode, NULL,
2234 &nr, &nr+1, (chain+n-1) - partial);
2235 *partial->p = 0;
2236 /*
2237 * We mark the inode dirty prior to restart,
2238 * and prior to stop. No need for it here.
2239 */
2240 } else {
2241 /* Shared branch grows from an indirect block */
2242 BUFFER_TRACE(partial->bh, "get_write_access");
2243 ext3_free_branches(handle, inode, partial->bh,
2244 partial->p,
2245 partial->p+1, (chain+n-1) - partial);
2246 }
2247 }
2248 /* Clear the ends of indirect blocks on the shared branch */
2249 while (partial > chain) {
2250 ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
2251 (__le32*)partial->bh->b_data+addr_per_block,
2252 (chain+n-1) - partial);
2253 BUFFER_TRACE(partial->bh, "call brelse");
2254 brelse (partial->bh);
2255 partial--;
2256 }
2257do_indirects:
2258 /* Kill the remaining (whole) subtrees */
2259 switch (offsets[0]) {
2260 default:
2261 nr = i_data[EXT3_IND_BLOCK];
2262 if (nr) {
2263 ext3_free_branches(handle, inode, NULL,
2264 &nr, &nr+1, 1);
2265 i_data[EXT3_IND_BLOCK] = 0;
2266 }
2267 case EXT3_IND_BLOCK:
2268 nr = i_data[EXT3_DIND_BLOCK];
2269 if (nr) {
2270 ext3_free_branches(handle, inode, NULL,
2271 &nr, &nr+1, 2);
2272 i_data[EXT3_DIND_BLOCK] = 0;
2273 }
2274 case EXT3_DIND_BLOCK:
2275 nr = i_data[EXT3_TIND_BLOCK];
2276 if (nr) {
2277 ext3_free_branches(handle, inode, NULL,
2278 &nr, &nr+1, 3);
2279 i_data[EXT3_TIND_BLOCK] = 0;
2280 }
2281 case EXT3_TIND_BLOCK:
2282 ;
2283 }
2284
2285 ext3_discard_reservation(inode);
2286
2287 up(&ei->truncate_sem);
2288 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
2289 ext3_mark_inode_dirty(handle, inode);
2290
2291 /* In a multi-transaction truncate, we only make the final
2292 * transaction synchronous */
2293 if (IS_SYNC(inode))
2294 handle->h_sync = 1;
2295out_stop:
2296 /*
2297 * If this was a simple ftruncate(), and the file will remain alive
2298 * then we need to clear up the orphan record which we created above.
2299 * However, if this was a real unlink then we were called by
2300 * ext3_delete_inode(), and we allow that function to clean up the
2301 * orphan info for us.
2302 */
2303 if (inode->i_nlink)
2304 ext3_orphan_del(handle, inode);
2305
2306 ext3_journal_stop(handle);
2307}
2308
2309static unsigned long ext3_get_inode_block(struct super_block *sb,
2310 unsigned long ino, struct ext3_iloc *iloc)
2311{
2312 unsigned long desc, group_desc, block_group;
2313 unsigned long offset, block;
2314 struct buffer_head *bh;
2315 struct ext3_group_desc * gdp;
2316
2317
2318 if ((ino != EXT3_ROOT_INO &&
2319 ino != EXT3_JOURNAL_INO &&
2320 ino != EXT3_RESIZE_INO &&
2321 ino < EXT3_FIRST_INO(sb)) ||
2322 ino > le32_to_cpu(
2323 EXT3_SB(sb)->s_es->s_inodes_count)) {
2324 ext3_error (sb, "ext3_get_inode_block",
2325 "bad inode number: %lu", ino);
2326 return 0;
2327 }
2328 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
2329 if (block_group >= EXT3_SB(sb)->s_groups_count) {
2330 ext3_error (sb, "ext3_get_inode_block",
2331 "group >= groups count");
2332 return 0;
2333 }
2334 smp_rmb();
2335 group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
2336 desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
2337 bh = EXT3_SB(sb)->s_group_desc[group_desc];
2338 if (!bh) {
2339 ext3_error (sb, "ext3_get_inode_block",
2340 "Descriptor not loaded");
2341 return 0;
2342 }
2343
2344 gdp = (struct ext3_group_desc *) bh->b_data;
2345 /*
2346 * Figure out the offset within the block group inode table
2347 */
2348 offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
2349 EXT3_INODE_SIZE(sb);
2350 block = le32_to_cpu(gdp[desc].bg_inode_table) +
2351 (offset >> EXT3_BLOCK_SIZE_BITS(sb));
2352
2353 iloc->block_group = block_group;
2354 iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
2355 return block;
2356}
2357
2358/*
2359 * ext3_get_inode_loc returns with an extra refcount against the inode's
2360 * underlying buffer_head on success. If 'in_mem' is true, we have all
2361 * data in memory that is needed to recreate the on-disk version of this
2362 * inode.
2363 */
2364static int __ext3_get_inode_loc(struct inode *inode,
2365 struct ext3_iloc *iloc, int in_mem)
2366{
2367 unsigned long block;
2368 struct buffer_head *bh;
2369
2370 block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
2371 if (!block)
2372 return -EIO;
2373
2374 bh = sb_getblk(inode->i_sb, block);
2375 if (!bh) {
2376 ext3_error (inode->i_sb, "ext3_get_inode_loc",
2377 "unable to read inode block - "
2378 "inode=%lu, block=%lu", inode->i_ino, block);
2379 return -EIO;
2380 }
2381 if (!buffer_uptodate(bh)) {
2382 lock_buffer(bh);
2383 if (buffer_uptodate(bh)) {
2384 /* someone brought it uptodate while we waited */
2385 unlock_buffer(bh);
2386 goto has_buffer;
2387 }
2388
2389 /*
2390 * If we have all information of the inode in memory and this
2391 * is the only valid inode in the block, we need not read the
2392 * block.
2393 */
2394 if (in_mem) {
2395 struct buffer_head *bitmap_bh;
2396 struct ext3_group_desc *desc;
2397 int inodes_per_buffer;
2398 int inode_offset, i;
2399 int block_group;
2400 int start;
2401
2402 block_group = (inode->i_ino - 1) /
2403 EXT3_INODES_PER_GROUP(inode->i_sb);
2404 inodes_per_buffer = bh->b_size /
2405 EXT3_INODE_SIZE(inode->i_sb);
2406 inode_offset = ((inode->i_ino - 1) %
2407 EXT3_INODES_PER_GROUP(inode->i_sb));
2408 start = inode_offset & ~(inodes_per_buffer - 1);
2409
2410 /* Is the inode bitmap in cache? */
2411 desc = ext3_get_group_desc(inode->i_sb,
2412 block_group, NULL);
2413 if (!desc)
2414 goto make_io;
2415
2416 bitmap_bh = sb_getblk(inode->i_sb,
2417 le32_to_cpu(desc->bg_inode_bitmap));
2418 if (!bitmap_bh)
2419 goto make_io;
2420
2421 /*
2422 * If the inode bitmap isn't in cache then the
2423 * optimisation may end up performing two reads instead
2424 * of one, so skip it.
2425 */
2426 if (!buffer_uptodate(bitmap_bh)) {
2427 brelse(bitmap_bh);
2428 goto make_io;
2429 }
2430 for (i = start; i < start + inodes_per_buffer; i++) {
2431 if (i == inode_offset)
2432 continue;
2433 if (ext3_test_bit(i, bitmap_bh->b_data))
2434 break;
2435 }
2436 brelse(bitmap_bh);
2437 if (i == start + inodes_per_buffer) {
2438 /* all other inodes are free, so skip I/O */
2439 memset(bh->b_data, 0, bh->b_size);
2440 set_buffer_uptodate(bh);
2441 unlock_buffer(bh);
2442 goto has_buffer;
2443 }
2444 }
2445
2446make_io:
2447 /*
2448 * There are other valid inodes in the buffer, this inode
2449 * has in-inode xattrs, or we don't have this inode in memory.
2450 * Read the block from disk.
2451 */
2452 get_bh(bh);
2453 bh->b_end_io = end_buffer_read_sync;
2454 submit_bh(READ, bh);
2455 wait_on_buffer(bh);
2456 if (!buffer_uptodate(bh)) {
2457 ext3_error(inode->i_sb, "ext3_get_inode_loc",
2458 "unable to read inode block - "
2459 "inode=%lu, block=%lu",
2460 inode->i_ino, block);
2461 brelse(bh);
2462 return -EIO;
2463 }
2464 }
2465has_buffer:
2466 iloc->bh = bh;
2467 return 0;
2468}
2469
2470int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
2471{
2472 /* We have all inode data except xattrs in memory here. */
2473 return __ext3_get_inode_loc(inode, iloc,
2474 !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR));
2475}
2476
2477void ext3_set_inode_flags(struct inode *inode)
2478{
2479 unsigned int flags = EXT3_I(inode)->i_flags;
2480
2481 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2482 if (flags & EXT3_SYNC_FL)
2483 inode->i_flags |= S_SYNC;
2484 if (flags & EXT3_APPEND_FL)
2485 inode->i_flags |= S_APPEND;
2486 if (flags & EXT3_IMMUTABLE_FL)
2487 inode->i_flags |= S_IMMUTABLE;
2488 if (flags & EXT3_NOATIME_FL)
2489 inode->i_flags |= S_NOATIME;
2490 if (flags & EXT3_DIRSYNC_FL)
2491 inode->i_flags |= S_DIRSYNC;
2492}
2493
2494void ext3_read_inode(struct inode * inode)
2495{
2496 struct ext3_iloc iloc;
2497 struct ext3_inode *raw_inode;
2498 struct ext3_inode_info *ei = EXT3_I(inode);
2499 struct buffer_head *bh;
2500 int block;
2501
2502#ifdef CONFIG_EXT3_FS_POSIX_ACL
2503 ei->i_acl = EXT3_ACL_NOT_CACHED;
2504 ei->i_default_acl = EXT3_ACL_NOT_CACHED;
2505#endif
2506 ei->i_block_alloc_info = NULL;
2507
2508 if (__ext3_get_inode_loc(inode, &iloc, 0))
2509 goto bad_inode;
2510 bh = iloc.bh;
2511 raw_inode = ext3_raw_inode(&iloc);
2512 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2513 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2514 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2515 if(!(test_opt (inode->i_sb, NO_UID32))) {
2516 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2517 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2518 }
2519 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2520 inode->i_size = le32_to_cpu(raw_inode->i_size);
2521 inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
2522 inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
2523 inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
2524 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2525
2526 ei->i_state = 0;
2527 ei->i_dir_start_lookup = 0;
2528 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2529 /* We now have enough fields to check if the inode was active or not.
2530 * This is needed because nfsd might try to access dead inodes
2531 * the test is that same one that e2fsck uses
2532 * NeilBrown 1999oct15
2533 */
2534 if (inode->i_nlink == 0) {
2535 if (inode->i_mode == 0 ||
2536 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
2537 /* this inode is deleted */
2538 brelse (bh);
2539 goto bad_inode;
2540 }
2541 /* The only unlinked inodes we let through here have
2542 * valid i_mode and are being read by the orphan
2543 * recovery code: that's fine, we're about to complete
2544 * the process of deleting those. */
2545 }
2546 inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size
2547 * (for stat), not the fs block
2548 * size */
2549 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2550 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2551#ifdef EXT3_FRAGMENTS
2552 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2553 ei->i_frag_no = raw_inode->i_frag;
2554 ei->i_frag_size = raw_inode->i_fsize;
2555#endif
2556 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2557 if (!S_ISREG(inode->i_mode)) {
2558 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2559 } else {
2560 inode->i_size |=
2561 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2562 }
2563 ei->i_disksize = inode->i_size;
2564 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2565 ei->i_block_group = iloc.block_group;
2566 /*
2567 * NOTE! The in-memory inode i_data array is in little-endian order
2568 * even on big-endian machines: we do NOT byteswap the block numbers!
2569 */
2570 for (block = 0; block < EXT3_N_BLOCKS; block++)
2571 ei->i_data[block] = raw_inode->i_block[block];
2572 INIT_LIST_HEAD(&ei->i_orphan);
2573
2574 if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
2575 EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
2576 /*
2577 * When mke2fs creates big inodes it does not zero out
2578 * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
2579 * so ignore those first few inodes.
2580 */
2581 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2582 if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2583 EXT3_INODE_SIZE(inode->i_sb))
2584 goto bad_inode;
2585 if (ei->i_extra_isize == 0) {
2586 /* The extra space is currently unused. Use it. */
2587 ei->i_extra_isize = sizeof(struct ext3_inode) -
2588 EXT3_GOOD_OLD_INODE_SIZE;
2589 } else {
2590 __le32 *magic = (void *)raw_inode +
2591 EXT3_GOOD_OLD_INODE_SIZE +
2592 ei->i_extra_isize;
2593 if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
2594 ei->i_state |= EXT3_STATE_XATTR;
2595 }
2596 } else
2597 ei->i_extra_isize = 0;
2598
2599 if (S_ISREG(inode->i_mode)) {
2600 inode->i_op = &ext3_file_inode_operations;
2601 inode->i_fop = &ext3_file_operations;
2602 ext3_set_aops(inode);
2603 } else if (S_ISDIR(inode->i_mode)) {
2604 inode->i_op = &ext3_dir_inode_operations;
2605 inode->i_fop = &ext3_dir_operations;
2606 } else if (S_ISLNK(inode->i_mode)) {
2607 if (ext3_inode_is_fast_symlink(inode))
2608 inode->i_op = &ext3_fast_symlink_inode_operations;
2609 else {
2610 inode->i_op = &ext3_symlink_inode_operations;
2611 ext3_set_aops(inode);
2612 }
2613 } else {
2614 inode->i_op = &ext3_special_inode_operations;
2615 if (raw_inode->i_block[0])
2616 init_special_inode(inode, inode->i_mode,
2617 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
2618 else
2619 init_special_inode(inode, inode->i_mode,
2620 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2621 }
2622 brelse (iloc.bh);
2623 ext3_set_inode_flags(inode);
2624 return;
2625
2626bad_inode:
2627 make_bad_inode(inode);
2628 return;
2629}
2630
2631/*
2632 * Post the struct inode info into an on-disk inode location in the
2633 * buffer-cache. This gobbles the caller's reference to the
2634 * buffer_head in the inode location struct.
2635 *
2636 * The caller must have write access to iloc->bh.
2637 */
2638static int ext3_do_update_inode(handle_t *handle,
2639 struct inode *inode,
2640 struct ext3_iloc *iloc)
2641{
2642 struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
2643 struct ext3_inode_info *ei = EXT3_I(inode);
2644 struct buffer_head *bh = iloc->bh;
2645 int err = 0, rc, block;
2646
2647 /* For fields not not tracking in the in-memory inode,
2648 * initialise them to zero for new inodes. */
2649 if (ei->i_state & EXT3_STATE_NEW)
2650 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
2651
2652 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
2653 if(!(test_opt(inode->i_sb, NO_UID32))) {
2654 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
2655 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
2656/*
2657 * Fix up interoperability with old kernels. Otherwise, old inodes get
2658 * re-used with the upper 16 bits of the uid/gid intact
2659 */
2660 if(!ei->i_dtime) {
2661 raw_inode->i_uid_high =
2662 cpu_to_le16(high_16_bits(inode->i_uid));
2663 raw_inode->i_gid_high =
2664 cpu_to_le16(high_16_bits(inode->i_gid));
2665 } else {
2666 raw_inode->i_uid_high = 0;
2667 raw_inode->i_gid_high = 0;
2668 }
2669 } else {
2670 raw_inode->i_uid_low =
2671 cpu_to_le16(fs_high2lowuid(inode->i_uid));
2672 raw_inode->i_gid_low =
2673 cpu_to_le16(fs_high2lowgid(inode->i_gid));
2674 raw_inode->i_uid_high = 0;
2675 raw_inode->i_gid_high = 0;
2676 }
2677 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
2678 raw_inode->i_size = cpu_to_le32(ei->i_disksize);
2679 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
2680 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
2681 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
2682 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
2683 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2684 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2685#ifdef EXT3_FRAGMENTS
2686 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
2687 raw_inode->i_frag = ei->i_frag_no;
2688 raw_inode->i_fsize = ei->i_frag_size;
2689#endif
2690 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
2691 if (!S_ISREG(inode->i_mode)) {
2692 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
2693 } else {
2694 raw_inode->i_size_high =
2695 cpu_to_le32(ei->i_disksize >> 32);
2696 if (ei->i_disksize > 0x7fffffffULL) {
2697 struct super_block *sb = inode->i_sb;
2698 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
2699 EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
2700 EXT3_SB(sb)->s_es->s_rev_level ==
2701 cpu_to_le32(EXT3_GOOD_OLD_REV)) {
2702 /* If this is the first large file
2703 * created, add a flag to the superblock.
2704 */
2705 err = ext3_journal_get_write_access(handle,
2706 EXT3_SB(sb)->s_sbh);
2707 if (err)
2708 goto out_brelse;
2709 ext3_update_dynamic_rev(sb);
2710 EXT3_SET_RO_COMPAT_FEATURE(sb,
2711 EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
2712 sb->s_dirt = 1;
2713 handle->h_sync = 1;
2714 err = ext3_journal_dirty_metadata(handle,
2715 EXT3_SB(sb)->s_sbh);
2716 }
2717 }
2718 }
2719 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
2720 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
2721 if (old_valid_dev(inode->i_rdev)) {
2722 raw_inode->i_block[0] =
2723 cpu_to_le32(old_encode_dev(inode->i_rdev));
2724 raw_inode->i_block[1] = 0;
2725 } else {
2726 raw_inode->i_block[0] = 0;
2727 raw_inode->i_block[1] =
2728 cpu_to_le32(new_encode_dev(inode->i_rdev));
2729 raw_inode->i_block[2] = 0;
2730 }
2731 } else for (block = 0; block < EXT3_N_BLOCKS; block++)
2732 raw_inode->i_block[block] = ei->i_data[block];
2733
2734 if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE)
2735 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
2736
2737 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2738 rc = ext3_journal_dirty_metadata(handle, bh);
2739 if (!err)
2740 err = rc;
2741 ei->i_state &= ~EXT3_STATE_NEW;
2742
2743out_brelse:
2744 brelse (bh);
2745 ext3_std_error(inode->i_sb, err);
2746 return err;
2747}
2748
2749/*
2750 * ext3_write_inode()
2751 *
2752 * We are called from a few places:
2753 *
2754 * - Within generic_file_write() for O_SYNC files.
2755 * Here, there will be no transaction running. We wait for any running
2756 * transaction to commit.
2757 *
2758 * - Within sys_sync(), kupdate and such.
2759 * We wait on commit, if told to.
2760 *
2761 * - Within prune_icache() (PF_MEMALLOC == true)
2762 * Here we simply return. We can't afford to block kswapd on the
2763 * journal commit.
2764 *
2765 * In all cases it is actually safe for us to return without doing anything,
2766 * because the inode has been copied into a raw inode buffer in
2767 * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
2768 * knfsd.
2769 *
2770 * Note that we are absolutely dependent upon all inode dirtiers doing the
2771 * right thing: they *must* call mark_inode_dirty() after dirtying info in
2772 * which we are interested.
2773 *
2774 * It would be a bug for them to not do this. The code:
2775 *
2776 * mark_inode_dirty(inode)
2777 * stuff();
2778 * inode->i_size = expr;
2779 *
2780 * is in error because a kswapd-driven write_inode() could occur while
2781 * `stuff()' is running, and the new i_size will be lost. Plus the inode
2782 * will no longer be on the superblock's dirty inode list.
2783 */
2784int ext3_write_inode(struct inode *inode, int wait)
2785{
2786 if (current->flags & PF_MEMALLOC)
2787 return 0;
2788
2789 if (ext3_journal_current_handle()) {
2790 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2791 dump_stack();
2792 return -EIO;
2793 }
2794
2795 if (!wait)
2796 return 0;
2797
2798 return ext3_force_commit(inode->i_sb);
2799}
2800
2801/*
2802 * ext3_setattr()
2803 *
2804 * Called from notify_change.
2805 *
2806 * We want to trap VFS attempts to truncate the file as soon as
2807 * possible. In particular, we want to make sure that when the VFS
2808 * shrinks i_size, we put the inode on the orphan list and modify
2809 * i_disksize immediately, so that during the subsequent flushing of
2810 * dirty pages and freeing of disk blocks, we can guarantee that any
2811 * commit will leave the blocks being flushed in an unused state on
2812 * disk. (On recovery, the inode will get truncated and the blocks will
2813 * be freed, so we have a strong guarantee that no future commit will
2814 * leave these blocks visible to the user.)
2815 *
2816 * Called with inode->i_sem down.
2817 */
2818int ext3_setattr(struct dentry *dentry, struct iattr *attr)
2819{
2820 struct inode *inode = dentry->d_inode;
2821 int error, rc = 0;
2822 const unsigned int ia_valid = attr->ia_valid;
2823
2824 error = inode_change_ok(inode, attr);
2825 if (error)
2826 return error;
2827
2828 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2829 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2830 handle_t *handle;
2831
2832 /* (user+group)*(old+new) structure, inode write (sb,
2833 * inode block, ? - but truncate inode update has it) */
2834 handle = ext3_journal_start(inode, 4*EXT3_QUOTA_INIT_BLOCKS+3);
2835 if (IS_ERR(handle)) {
2836 error = PTR_ERR(handle);
2837 goto err_out;
2838 }
2839 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2840 if (error) {
2841 ext3_journal_stop(handle);
2842 return error;
2843 }
2844 /* Update corresponding info in inode so that everything is in
2845 * one transaction */
2846 if (attr->ia_valid & ATTR_UID)
2847 inode->i_uid = attr->ia_uid;
2848 if (attr->ia_valid & ATTR_GID)
2849 inode->i_gid = attr->ia_gid;
2850 error = ext3_mark_inode_dirty(handle, inode);
2851 ext3_journal_stop(handle);
2852 }
2853
2854 if (S_ISREG(inode->i_mode) &&
2855 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
2856 handle_t *handle;
2857
2858 handle = ext3_journal_start(inode, 3);
2859 if (IS_ERR(handle)) {
2860 error = PTR_ERR(handle);
2861 goto err_out;
2862 }
2863
2864 error = ext3_orphan_add(handle, inode);
2865 EXT3_I(inode)->i_disksize = attr->ia_size;
2866 rc = ext3_mark_inode_dirty(handle, inode);
2867 if (!error)
2868 error = rc;
2869 ext3_journal_stop(handle);
2870 }
2871
2872 rc = inode_setattr(inode, attr);
2873
2874 /* If inode_setattr's call to ext3_truncate failed to get a
2875 * transaction handle at all, we need to clean up the in-core
2876 * orphan list manually. */
2877 if (inode->i_nlink)
2878 ext3_orphan_del(NULL, inode);
2879
2880 if (!rc && (ia_valid & ATTR_MODE))
2881 rc = ext3_acl_chmod(inode);
2882
2883err_out:
2884 ext3_std_error(inode->i_sb, error);
2885 if (!error)
2886 error = rc;
2887 return error;
2888}
2889
2890
2891/*
2892 * akpm: how many blocks doth make a writepage()?
2893 *
2894 * With N blocks per page, it may be:
2895 * N data blocks
2896 * 2 indirect block
2897 * 2 dindirect
2898 * 1 tindirect
2899 * N+5 bitmap blocks (from the above)
2900 * N+5 group descriptor summary blocks
2901 * 1 inode block
2902 * 1 superblock.
2903 * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
2904 *
2905 * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
2906 *
2907 * With ordered or writeback data it's the same, less the N data blocks.
2908 *
2909 * If the inode's direct blocks can hold an integral number of pages then a
2910 * page cannot straddle two indirect blocks, and we can only touch one indirect
2911 * and dindirect block, and the "5" above becomes "3".
2912 *
2913 * This still overestimates under most circumstances. If we were to pass the
2914 * start and end offsets in here as well we could do block_to_path() on each
2915 * block and work out the exact number of indirects which are touched. Pah.
2916 */
2917
2918static int ext3_writepage_trans_blocks(struct inode *inode)
2919{
2920 int bpp = ext3_journal_blocks_per_page(inode);
2921 int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
2922 int ret;
2923
2924 if (ext3_should_journal_data(inode))
2925 ret = 3 * (bpp + indirects) + 2;
2926 else
2927 ret = 2 * (bpp + indirects) + 2;
2928
2929#ifdef CONFIG_QUOTA
2930 /* We know that structure was already allocated during DQUOT_INIT so
2931 * we will be updating only the data blocks + inodes */
2932 ret += 2*EXT3_QUOTA_TRANS_BLOCKS;
2933#endif
2934
2935 return ret;
2936}
2937
2938/*
2939 * The caller must have previously called ext3_reserve_inode_write().
2940 * Give this, we know that the caller already has write access to iloc->bh.
2941 */
2942int ext3_mark_iloc_dirty(handle_t *handle,
2943 struct inode *inode, struct ext3_iloc *iloc)
2944{
2945 int err = 0;
2946
2947 /* the do_update_inode consumes one bh->b_count */
2948 get_bh(iloc->bh);
2949
2950 /* ext3_do_update_inode() does journal_dirty_metadata */
2951 err = ext3_do_update_inode(handle, inode, iloc);
2952 put_bh(iloc->bh);
2953 return err;
2954}
2955
2956/*
2957 * On success, We end up with an outstanding reference count against
2958 * iloc->bh. This _must_ be cleaned up later.
2959 */
2960
2961int
2962ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
2963 struct ext3_iloc *iloc)
2964{
2965 int err = 0;
2966 if (handle) {
2967 err = ext3_get_inode_loc(inode, iloc);
2968 if (!err) {
2969 BUFFER_TRACE(iloc->bh, "get_write_access");
2970 err = ext3_journal_get_write_access(handle, iloc->bh);
2971 if (err) {
2972 brelse(iloc->bh);
2973 iloc->bh = NULL;
2974 }
2975 }
2976 }
2977 ext3_std_error(inode->i_sb, err);
2978 return err;
2979}
2980
2981/*
2982 * akpm: What we do here is to mark the in-core inode as clean
2983 * with respect to inode dirtiness (it may still be data-dirty).
2984 * This means that the in-core inode may be reaped by prune_icache
2985 * without having to perform any I/O. This is a very good thing,
2986 * because *any* task may call prune_icache - even ones which
2987 * have a transaction open against a different journal.
2988 *
2989 * Is this cheating? Not really. Sure, we haven't written the
2990 * inode out, but prune_icache isn't a user-visible syncing function.
2991 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
2992 * we start and wait on commits.
2993 *
2994 * Is this efficient/effective? Well, we're being nice to the system
2995 * by cleaning up our inodes proactively so they can be reaped
2996 * without I/O. But we are potentially leaving up to five seconds'
2997 * worth of inodes floating about which prune_icache wants us to
2998 * write out. One way to fix that would be to get prune_icache()
2999 * to do a write_super() to free up some memory. It has the desired
3000 * effect.
3001 */
3002int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3003{
3004 struct ext3_iloc iloc;
3005 int err;
3006
3007 might_sleep();
3008 err = ext3_reserve_inode_write(handle, inode, &iloc);
3009 if (!err)
3010 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
3011 return err;
3012}
3013
3014/*
3015 * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
3016 *
3017 * We're really interested in the case where a file is being extended.
3018 * i_size has been changed by generic_commit_write() and we thus need
3019 * to include the updated inode in the current transaction.
3020 *
3021 * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
3022 * are allocated to the file.
3023 *
3024 * If the inode is marked synchronous, we don't honour that here - doing
3025 * so would cause a commit on atime updates, which we don't bother doing.
3026 * We handle synchronous inodes at the highest possible level.
3027 */
3028void ext3_dirty_inode(struct inode *inode)
3029{
3030 handle_t *current_handle = ext3_journal_current_handle();
3031 handle_t *handle;
3032
3033 handle = ext3_journal_start(inode, 2);
3034 if (IS_ERR(handle))
3035 goto out;
3036 if (current_handle &&
3037 current_handle->h_transaction != handle->h_transaction) {
3038 /* This task has a transaction open against a different fs */
3039 printk(KERN_EMERG "%s: transactions do not match!\n",
3040 __FUNCTION__);
3041 } else {
3042 jbd_debug(5, "marking dirty. outer handle=%p\n",
3043 current_handle);
3044 ext3_mark_inode_dirty(handle, inode);
3045 }
3046 ext3_journal_stop(handle);
3047out:
3048 return;
3049}
3050
#ifdef AKPM
/*
 * Attach the buffer_head backing an inode to this transaction so that it
 * is not flushed to disk early.  In contrast to
 * ext3_reserve_inode_write(), no bh reference is left behind and no iloc
 * is handed back, so a caller that later wants to mark the inode dirty
 * has to redo the iloc lookup.
 */
static inline int
ext3_pin_inode(handle_t *handle, struct inode *inode)
{
	struct ext3_iloc iloc;
	int err = 0;

	if (handle == NULL)
		goto out;

	err = ext3_get_inode_loc(inode, &iloc);
	if (err)
		goto out;

	BUFFER_TRACE(iloc.bh, "get_write_access");
	err = journal_get_write_access(handle, iloc.bh);
	if (err == 0)
		err = ext3_journal_dirty_metadata(handle, iloc.bh);
	brelse(iloc.bh);

out:
	ext3_std_error(inode->i_sb, err);
	return err;
}
#endif
3080
3081int ext3_change_inode_journal_flag(struct inode *inode, int val)
3082{
3083 journal_t *journal;
3084 handle_t *handle;
3085 int err;
3086
3087 /*
3088 * We have to be very careful here: changing a data block's
3089 * journaling status dynamically is dangerous. If we write a
3090 * data block to the journal, change the status and then delete
3091 * that block, we risk forgetting to revoke the old log record
3092 * from the journal and so a subsequent replay can corrupt data.
3093 * So, first we make sure that the journal is empty and that
3094 * nobody is changing anything.
3095 */
3096
3097 journal = EXT3_JOURNAL(inode);
3098 if (is_journal_aborted(journal) || IS_RDONLY(inode))
3099 return -EROFS;
3100
3101 journal_lock_updates(journal);
3102 journal_flush(journal);
3103
3104 /*
3105 * OK, there are no updates running now, and all cached data is
3106 * synced to disk. We are now in a completely consistent state
3107 * which doesn't have anything in the journal, and we know that
3108 * no filesystem updates are running, so it is safe to modify
3109 * the inode's in-core data-journaling state flag now.
3110 */
3111
3112 if (val)
3113 EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
3114 else
3115 EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
3116 ext3_set_aops(inode);
3117
3118 journal_unlock_updates(journal);
3119
3120 /* Finally we can mark the inode as dirty. */
3121
3122 handle = ext3_journal_start(inode, 1);
3123 if (IS_ERR(handle))
3124 return PTR_ERR(handle);
3125
3126 err = ext3_mark_inode_dirty(handle, inode);
3127 handle->h_sync = 1;
3128 ext3_journal_stop(handle);
3129 ext3_std_error(inode->i_sb, err);
3130
3131 return err;
3132}
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
new file mode 100644
index 000000000000..706d68608381
--- /dev/null
+++ b/fs/ext3/ioctl.c
@@ -0,0 +1,243 @@
1/*
2 * linux/fs/ext3/ioctl.c
3 *
4 * Copyright (C) 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 */
9
10#include <linux/fs.h>
11#include <linux/jbd.h>
12#include <linux/ext3_fs.h>
13#include <linux/ext3_jbd.h>
14#include <linux/time.h>
15#include <asm/uaccess.h>
16
17
18int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
19 unsigned long arg)
20{
21 struct ext3_inode_info *ei = EXT3_I(inode);
22 unsigned int flags;
23 unsigned short rsv_window_size;
24
25 ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
26
27 switch (cmd) {
28 case EXT3_IOC_GETFLAGS:
29 flags = ei->i_flags & EXT3_FL_USER_VISIBLE;
30 return put_user(flags, (int __user *) arg);
31 case EXT3_IOC_SETFLAGS: {
32 handle_t *handle = NULL;
33 int err;
34 struct ext3_iloc iloc;
35 unsigned int oldflags;
36 unsigned int jflag;
37
38 if (IS_RDONLY(inode))
39 return -EROFS;
40
41 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
42 return -EACCES;
43
44 if (get_user(flags, (int __user *) arg))
45 return -EFAULT;
46
47 if (!S_ISDIR(inode->i_mode))
48 flags &= ~EXT3_DIRSYNC_FL;
49
50 oldflags = ei->i_flags;
51
52 /* The JOURNAL_DATA flag is modifiable only by root */
53 jflag = flags & EXT3_JOURNAL_DATA_FL;
54
55 /*
56 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
57 * the relevant capability.
58 *
59 * This test looks nicer. Thanks to Pauline Middelink
60 */
61 if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
62 if (!capable(CAP_LINUX_IMMUTABLE))
63 return -EPERM;
64 }
65
66 /*
67 * The JOURNAL_DATA flag can only be changed by
68 * the relevant capability.
69 */
70 if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
71 if (!capable(CAP_SYS_RESOURCE))
72 return -EPERM;
73 }
74
75
76 handle = ext3_journal_start(inode, 1);
77 if (IS_ERR(handle))
78 return PTR_ERR(handle);
79 if (IS_SYNC(inode))
80 handle->h_sync = 1;
81 err = ext3_reserve_inode_write(handle, inode, &iloc);
82 if (err)
83 goto flags_err;
84
85 flags = flags & EXT3_FL_USER_MODIFIABLE;
86 flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE;
87 ei->i_flags = flags;
88
89 ext3_set_inode_flags(inode);
90 inode->i_ctime = CURRENT_TIME_SEC;
91
92 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
93flags_err:
94 ext3_journal_stop(handle);
95 if (err)
96 return err;
97
98 if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
99 err = ext3_change_inode_journal_flag(inode, jflag);
100 return err;
101 }
102 case EXT3_IOC_GETVERSION:
103 case EXT3_IOC_GETVERSION_OLD:
104 return put_user(inode->i_generation, (int __user *) arg);
105 case EXT3_IOC_SETVERSION:
106 case EXT3_IOC_SETVERSION_OLD: {
107 handle_t *handle;
108 struct ext3_iloc iloc;
109 __u32 generation;
110 int err;
111
112 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
113 return -EPERM;
114 if (IS_RDONLY(inode))
115 return -EROFS;
116 if (get_user(generation, (int __user *) arg))
117 return -EFAULT;
118
119 handle = ext3_journal_start(inode, 1);
120 if (IS_ERR(handle))
121 return PTR_ERR(handle);
122 err = ext3_reserve_inode_write(handle, inode, &iloc);
123 if (err == 0) {
124 inode->i_ctime = CURRENT_TIME_SEC;
125 inode->i_generation = generation;
126 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
127 }
128 ext3_journal_stop(handle);
129 return err;
130 }
131#ifdef CONFIG_JBD_DEBUG
132 case EXT3_IOC_WAIT_FOR_READONLY:
133 /*
134 * This is racy - by the time we're woken up and running,
135 * the superblock could be released. And the module could
136 * have been unloaded. So sue me.
137 *
138 * Returns 1 if it slept, else zero.
139 */
140 {
141 struct super_block *sb = inode->i_sb;
142 DECLARE_WAITQUEUE(wait, current);
143 int ret = 0;
144
145 set_current_state(TASK_INTERRUPTIBLE);
146 add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait);
147 if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) {
148 schedule();
149 ret = 1;
150 }
151 remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait);
152 return ret;
153 }
154#endif
155 case EXT3_IOC_GETRSVSZ:
156 if (test_opt(inode->i_sb, RESERVATION)
157 && S_ISREG(inode->i_mode)
158 && ei->i_block_alloc_info) {
159 rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
160 return put_user(rsv_window_size, (int __user *)arg);
161 }
162 return -ENOTTY;
163 case EXT3_IOC_SETRSVSZ: {
164
165 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
166 return -ENOTTY;
167
168 if (IS_RDONLY(inode))
169 return -EROFS;
170
171 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
172 return -EACCES;
173
174 if (get_user(rsv_window_size, (int __user *)arg))
175 return -EFAULT;
176
177 if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS)
178 rsv_window_size = EXT3_MAX_RESERVE_BLOCKS;
179
180 /*
181 * need to allocate reservation structure for this inode
182 * before set the window size
183 */
184 down(&ei->truncate_sem);
185 if (!ei->i_block_alloc_info)
186 ext3_init_block_alloc_info(inode);
187
188 if (ei->i_block_alloc_info){
189 struct ext3_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
190 rsv->rsv_goal_size = rsv_window_size;
191 }
192 up(&ei->truncate_sem);
193 return 0;
194 }
195 case EXT3_IOC_GROUP_EXTEND: {
196 unsigned long n_blocks_count;
197 struct super_block *sb = inode->i_sb;
198 int err;
199
200 if (!capable(CAP_SYS_RESOURCE))
201 return -EPERM;
202
203 if (IS_RDONLY(inode))
204 return -EROFS;
205
206 if (get_user(n_blocks_count, (__u32 __user *)arg))
207 return -EFAULT;
208
209 err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count);
210 journal_lock_updates(EXT3_SB(sb)->s_journal);
211 journal_flush(EXT3_SB(sb)->s_journal);
212 journal_unlock_updates(EXT3_SB(sb)->s_journal);
213
214 return err;
215 }
216 case EXT3_IOC_GROUP_ADD: {
217 struct ext3_new_group_data input;
218 struct super_block *sb = inode->i_sb;
219 int err;
220
221 if (!capable(CAP_SYS_RESOURCE))
222 return -EPERM;
223
224 if (IS_RDONLY(inode))
225 return -EROFS;
226
227 if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg,
228 sizeof(input)))
229 return -EFAULT;
230
231 err = ext3_group_add(sb, &input);
232 journal_lock_updates(EXT3_SB(sb)->s_journal);
233 journal_flush(EXT3_SB(sb)->s_journal);
234 journal_unlock_updates(EXT3_SB(sb)->s_journal);
235
236 return err;
237 }
238
239
240 default:
241 return -ENOTTY;
242 }
243}
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
new file mode 100644
index 000000000000..79742d824a0a
--- /dev/null
+++ b/fs/ext3/namei.c
@@ -0,0 +1,2378 @@
1/*
2 * linux/fs/ext3/namei.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/namei.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Big-endian to little-endian byte-swapping/bitmaps by
16 * David S. Miller (davem@caip.rutgers.edu), 1995
17 * Directory entry file type support and forward compatibility hooks
18 * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
19 * Hash Tree Directory indexing (c)
20 * Daniel Phillips, 2001
21 * Hash Tree Directory indexing porting
22 * Christopher Li, 2002
23 * Hash Tree Directory indexing cleanup
24 * Theodore Ts'o, 2002
25 */
26
27#include <linux/fs.h>
28#include <linux/pagemap.h>
29#include <linux/jbd.h>
30#include <linux/time.h>
31#include <linux/ext3_fs.h>
32#include <linux/ext3_jbd.h>
33#include <linux/fcntl.h>
34#include <linux/stat.h>
35#include <linux/string.h>
36#include <linux/quotaops.h>
37#include <linux/buffer_head.h>
38#include <linux/smp_lock.h>
39#include "xattr.h"
40#include "acl.h"
41
/*
 * Directory read-ahead geometry used while searching a directory:
 * NAMEI_RA_CHUNKS chunks of NAMEI_RA_BLOCKS blocks each.
 */
#define NAMEI_RA_CHUNKS		2
#define NAMEI_RA_BLOCKS		4
/* Total number of buffers in one read-ahead window. */
#define NAMEI_RA_SIZE		(NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
/* Flat index of block b within chunk c. */
#define NAMEI_RA_INDEX(c,b)	(((c) * NAMEI_RA_BLOCKS) + (b))
49
50static struct buffer_head *ext3_append(handle_t *handle,
51 struct inode *inode,
52 u32 *block, int *err)
53{
54 struct buffer_head *bh;
55
56 *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
57
58 if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
59 inode->i_size += inode->i_sb->s_blocksize;
60 EXT3_I(inode)->i_disksize = inode->i_size;
61 ext3_journal_get_write_access(handle,bh);
62 }
63 return bh;
64}
65
/* Map assert() onto the journal layer's assertion unless already defined. */
#ifndef assert
#define assert(test) J_ASSERT(test)
#endif

/*
 * Exchange two lvalues of the same type.  Arguments are parenthesized and
 * the temporary has a name that will not collide with a caller's variable
 * (the old temporary was called "z", which broke swap(z, ...)).
 */
#ifndef swap
#define swap(x, y)					\
	do {						\
		__typeof__(x) swap_tmp_ = (x);		\
		(x) = (y);				\
		(y) = swap_tmp_;			\
	} while (0)
#endif

/* dxtrace(): compile the wrapped statement in only when DX_DEBUG is set. */
#ifdef DX_DEBUG
#define dxtrace(command) command
#else
#define dxtrace(command)
#endif
79
80struct fake_dirent
81{
82 __le32 inode;
83 __le16 rec_len;
84 u8 name_len;
85 u8 file_type;
86};
87
88struct dx_countlimit
89{
90 __le16 limit;
91 __le16 count;
92};
93
94struct dx_entry
95{
96 __le32 hash;
97 __le32 block;
98};
99
100/*
101 * dx_root_info is laid out so that if it should somehow get overlaid by a
102 * dirent the two low bits of the hash version will be zero. Therefore, the
103 * hash version mod 4 should never be 0. Sincerely, the paranoia department.
104 */
105
106struct dx_root
107{
108 struct fake_dirent dot;
109 char dot_name[4];
110 struct fake_dirent dotdot;
111 char dotdot_name[4];
112 struct dx_root_info
113 {
114 __le32 reserved_zero;
115 u8 hash_version;
116 u8 info_length; /* 8 */
117 u8 indirect_levels;
118 u8 unused_flags;
119 }
120 info;
121 struct dx_entry entries[0];
122};
123
124struct dx_node
125{
126 struct fake_dirent fake;
127 struct dx_entry entries[0];
128};
129
130
/* Position within one level of the index tree during a lookup. */
struct dx_frame {
	struct buffer_head *bh;		/* buffer holding this index block */
	struct dx_entry *entries;	/* first entry of the block */
	struct dx_entry *at;		/* entry currently selected */
};
137
138struct dx_map_entry
139{
140 u32 hash;
141 u32 offs;
142};
143
144#ifdef CONFIG_EXT3_INDEX
145static inline unsigned dx_get_block (struct dx_entry *entry);
146static void dx_set_block (struct dx_entry *entry, unsigned value);
147static inline unsigned dx_get_hash (struct dx_entry *entry);
148static void dx_set_hash (struct dx_entry *entry, unsigned value);
149static unsigned dx_get_count (struct dx_entry *entries);
150static unsigned dx_get_limit (struct dx_entry *entries);
151static void dx_set_count (struct dx_entry *entries, unsigned value);
152static void dx_set_limit (struct dx_entry *entries, unsigned value);
153static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
154static unsigned dx_node_limit (struct inode *dir);
155static struct dx_frame *dx_probe(struct dentry *dentry,
156 struct inode *dir,
157 struct dx_hash_info *hinfo,
158 struct dx_frame *frame,
159 int *err);
160static void dx_release (struct dx_frame *frames);
161static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
162 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
163static void dx_sort_map(struct dx_map_entry *map, unsigned count);
164static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
165 struct dx_map_entry *offsets, int count);
166static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
167static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
168static int ext3_htree_next_block(struct inode *dir, __u32 hash,
169 struct dx_frame *frame,
170 struct dx_frame *frames,
171 __u32 *start_hash);
172static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
173 struct ext3_dir_entry_2 **res_dir, int *err);
174static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
175 struct inode *inode);
176
177/*
178 * Future: use high four bits of block for coalesce-on-delete flags
179 * Mask them off for now.
180 */
181
182static inline unsigned dx_get_block (struct dx_entry *entry)
183{
184 return le32_to_cpu(entry->block) & 0x00ffffff;
185}
186
187static inline void dx_set_block (struct dx_entry *entry, unsigned value)
188{
189 entry->block = cpu_to_le32(value);
190}
191
192static inline unsigned dx_get_hash (struct dx_entry *entry)
193{
194 return le32_to_cpu(entry->hash);
195}
196
197static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
198{
199 entry->hash = cpu_to_le32(value);
200}
201
202static inline unsigned dx_get_count (struct dx_entry *entries)
203{
204 return le16_to_cpu(((struct dx_countlimit *) entries)->count);
205}
206
207static inline unsigned dx_get_limit (struct dx_entry *entries)
208{
209 return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
210}
211
212static inline void dx_set_count (struct dx_entry *entries, unsigned value)
213{
214 ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
215}
216
217static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
218{
219 ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
220}
221
222static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
223{
224 unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
225 EXT3_DIR_REC_LEN(2) - infosize;
226 return 0? 20: entry_space / sizeof(struct dx_entry);
227}
228
229static inline unsigned dx_node_limit (struct inode *dir)
230{
231 unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
232 return 0? 22: entry_space / sizeof(struct dx_entry);
233}
234
235/*
236 * Debug
237 */
238#ifdef DX_DEBUG
239static void dx_show_index (char * label, struct dx_entry *entries)
240{
241 int i, n = dx_get_count (entries);
242 printk("%s index ", label);
243 for (i = 0; i < n; i++)
244 {
245 printk("%x->%u ", i? dx_get_hash(entries + i): 0, dx_get_block(entries + i));
246 }
247 printk("\n");
248}
249
/* Debug totals accumulated while dumping an index tree. */
struct stats {
	unsigned names;		/* live directory entries seen */
	unsigned space;		/* bytes those entries need */
	unsigned bcount;	/* leaf blocks visited */
};
256
257static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de,
258 int size, int show_names)
259{
260 unsigned names = 0, space = 0;
261 char *base = (char *) de;
262 struct dx_hash_info h = *hinfo;
263
264 printk("names: ");
265 while ((char *) de < base + size)
266 {
267 if (de->inode)
268 {
269 if (show_names)
270 {
271 int len = de->name_len;
272 char *name = de->name;
273 while (len--) printk("%c", *name++);
274 ext3fs_dirhash(de->name, de->name_len, &h);
275 printk(":%x.%u ", h.hash,
276 ((char *) de - base));
277 }
278 space += EXT3_DIR_REC_LEN(de->name_len);
279 names++;
280 }
281 de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
282 }
283 printk("(%i)\n", names);
284 return (struct stats) { names, space, 1 };
285}
286
287struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
288 struct dx_entry *entries, int levels)
289{
290 unsigned blocksize = dir->i_sb->s_blocksize;
291 unsigned count = dx_get_count (entries), names = 0, space = 0, i;
292 unsigned bcount = 0;
293 struct buffer_head *bh;
294 int err;
295 printk("%i indexed blocks...\n", count);
296 for (i = 0; i < count; i++, entries++)
297 {
298 u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
299 u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
300 struct stats stats;
301 printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
302 if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue;
303 stats = levels?
304 dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
305 dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0);
306 names += stats.names;
307 space += stats.space;
308 bcount += stats.bcount;
309 brelse (bh);
310 }
311 if (bcount)
312 printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
313 names, space/bcount,(space/bcount)*100/blocksize);
314 return (struct stats) { names, space, bcount};
315}
316#endif /* DX_DEBUG */
317
318/*
319 * Probe for a directory leaf block to search.
320 *
321 * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
322 * error in the directory index, and the caller should fall back to
323 * searching the directory normally. The callers of dx_probe **MUST**
324 * check for this error code, and make sure it never gets reflected
325 * back to userspace.
326 */
327static struct dx_frame *
328dx_probe(struct dentry *dentry, struct inode *dir,
329 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
330{
331 unsigned count, indirect;
332 struct dx_entry *at, *entries, *p, *q, *m;
333 struct dx_root *root;
334 struct buffer_head *bh;
335 struct dx_frame *frame = frame_in;
336 u32 hash;
337
338 frame->bh = NULL;
339 if (dentry)
340 dir = dentry->d_parent->d_inode;
341 if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
342 goto fail;
343 root = (struct dx_root *) bh->b_data;
344 if (root->info.hash_version != DX_HASH_TEA &&
345 root->info.hash_version != DX_HASH_HALF_MD4 &&
346 root->info.hash_version != DX_HASH_LEGACY) {
347 ext3_warning(dir->i_sb, __FUNCTION__,
348 "Unrecognised inode hash code %d",
349 root->info.hash_version);
350 brelse(bh);
351 *err = ERR_BAD_DX_DIR;
352 goto fail;
353 }
354 hinfo->hash_version = root->info.hash_version;
355 hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
356 if (dentry)
357 ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
358 hash = hinfo->hash;
359
360 if (root->info.unused_flags & 1) {
361 ext3_warning(dir->i_sb, __FUNCTION__,
362 "Unimplemented inode hash flags: %#06x",
363 root->info.unused_flags);
364 brelse(bh);
365 *err = ERR_BAD_DX_DIR;
366 goto fail;
367 }
368
369 if ((indirect = root->info.indirect_levels) > 1) {
370 ext3_warning(dir->i_sb, __FUNCTION__,
371 "Unimplemented inode hash depth: %#06x",
372 root->info.indirect_levels);
373 brelse(bh);
374 *err = ERR_BAD_DX_DIR;
375 goto fail;
376 }
377
378 entries = (struct dx_entry *) (((char *)&root->info) +
379 root->info.info_length);
380 assert(dx_get_limit(entries) == dx_root_limit(dir,
381 root->info.info_length));
382 dxtrace (printk("Look up %x", hash));
383 while (1)
384 {
385 count = dx_get_count(entries);
386 assert (count && count <= dx_get_limit(entries));
387 p = entries + 1;
388 q = entries + count - 1;
389 while (p <= q)
390 {
391 m = p + (q - p)/2;
392 dxtrace(printk("."));
393 if (dx_get_hash(m) > hash)
394 q = m - 1;
395 else
396 p = m + 1;
397 }
398
399 if (0) // linear search cross check
400 {
401 unsigned n = count - 1;
402 at = entries;
403 while (n--)
404 {
405 dxtrace(printk(","));
406 if (dx_get_hash(++at) > hash)
407 {
408 at--;
409 break;
410 }
411 }
412 assert (at == p - 1);
413 }
414
415 at = p - 1;
416 dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
417 frame->bh = bh;
418 frame->entries = entries;
419 frame->at = at;
420 if (!indirect--) return frame;
421 if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
422 goto fail2;
423 at = entries = ((struct dx_node *) bh->b_data)->entries;
424 assert (dx_get_limit(entries) == dx_node_limit (dir));
425 frame++;
426 }
427fail2:
428 while (frame >= frame_in) {
429 brelse(frame->bh);
430 frame--;
431 }
432fail:
433 return NULL;
434}
435
436static void dx_release (struct dx_frame *frames)
437{
438 if (frames[0].bh == NULL)
439 return;
440
441 if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
442 brelse(frames[1].bh);
443 brelse(frames[0].bh);
444}
445
446/*
447 * This function increments the frame pointer to search the next leaf
448 * block, and reads in the necessary intervening nodes if the search
449 * should be necessary. Whether or not the search is necessary is
450 * controlled by the hash parameter. If the hash value is even, then
451 * the search is only continued if the next block starts with that
452 * hash value. This is used if we are searching for a specific file.
453 *
454 * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
455 *
456 * This function returns 1 if the caller should continue to search,
457 * or 0 if it should not. If there is an error reading one of the
458 * index blocks, it will a negative error code.
459 *
460 * If start_hash is non-null, it will be filled in with the starting
461 * hash of the next page.
462 */
463static int ext3_htree_next_block(struct inode *dir, __u32 hash,
464 struct dx_frame *frame,
465 struct dx_frame *frames,
466 __u32 *start_hash)
467{
468 struct dx_frame *p;
469 struct buffer_head *bh;
470 int err, num_frames = 0;
471 __u32 bhash;
472
473 p = frame;
474 /*
475 * Find the next leaf page by incrementing the frame pointer.
476 * If we run out of entries in the interior node, loop around and
477 * increment pointer in the parent node. When we break out of
478 * this loop, num_frames indicates the number of interior
479 * nodes need to be read.
480 */
481 while (1) {
482 if (++(p->at) < p->entries + dx_get_count(p->entries))
483 break;
484 if (p == frames)
485 return 0;
486 num_frames++;
487 p--;
488 }
489
490 /*
491 * If the hash is 1, then continue only if the next page has a
492 * continuation hash of any value. This is used for readdir
493 * handling. Otherwise, check to see if the hash matches the
494 * desired contiuation hash. If it doesn't, return since
495 * there's no point to read in the successive index pages.
496 */
497 bhash = dx_get_hash(p->at);
498 if (start_hash)
499 *start_hash = bhash;
500 if ((hash & 1) == 0) {
501 if ((bhash & ~1) != hash)
502 return 0;
503 }
504 /*
505 * If the hash is HASH_NB_ALWAYS, we always go to the next
506 * block so no check is necessary
507 */
508 while (num_frames--) {
509 if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
510 0, &err)))
511 return err; /* Failure */
512 p++;
513 brelse (p->bh);
514 p->bh = bh;
515 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
516 }
517 return 1;
518}
519
520
521/*
522 * p is at least 6 bytes before the end of page
523 */
524static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p)
525{
526 return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
527}
528
529/*
530 * This function fills a red-black tree with information from a
531 * directory block. It returns the number directory entries loaded
532 * into the tree. If there is an error it is returned in err.
533 */
534static int htree_dirblock_to_tree(struct file *dir_file,
535 struct inode *dir, int block,
536 struct dx_hash_info *hinfo,
537 __u32 start_hash, __u32 start_minor_hash)
538{
539 struct buffer_head *bh;
540 struct ext3_dir_entry_2 *de, *top;
541 int err, count = 0;
542
543 dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
544 if (!(bh = ext3_bread (NULL, dir, block, 0, &err)))
545 return err;
546
547 de = (struct ext3_dir_entry_2 *) bh->b_data;
548 top = (struct ext3_dir_entry_2 *) ((char *) de +
549 dir->i_sb->s_blocksize -
550 EXT3_DIR_REC_LEN(0));
551 for (; de < top; de = ext3_next_entry(de)) {
552 ext3fs_dirhash(de->name, de->name_len, hinfo);
553 if ((hinfo->hash < start_hash) ||
554 ((hinfo->hash == start_hash) &&
555 (hinfo->minor_hash < start_minor_hash)))
556 continue;
557 if (de->inode == 0)
558 continue;
559 if ((err = ext3_htree_store_dirent(dir_file,
560 hinfo->hash, hinfo->minor_hash, de)) != 0) {
561 brelse(bh);
562 return err;
563 }
564 count++;
565 }
566 brelse(bh);
567 return count;
568}
569
570
571/*
572 * This function fills a red-black tree with information from a
573 * directory. We start scanning the directory in hash order, starting
574 * at start_hash and start_minor_hash.
575 *
576 * This function returns the number of entries inserted into the tree,
577 * or a negative error code.
578 */
579int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
580 __u32 start_minor_hash, __u32 *next_hash)
581{
582 struct dx_hash_info hinfo;
583 struct ext3_dir_entry_2 *de;
584 struct dx_frame frames[2], *frame;
585 struct inode *dir;
586 int block, err;
587 int count = 0;
588 int ret;
589 __u32 hashval;
590
591 dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
592 start_minor_hash));
593 dir = dir_file->f_dentry->d_inode;
594 if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
595 hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
596 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
597 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
598 start_hash, start_minor_hash);
599 *next_hash = ~0;
600 return count;
601 }
602 hinfo.hash = start_hash;
603 hinfo.minor_hash = 0;
604 frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
605 if (!frame)
606 return err;
607
608 /* Add '.' and '..' from the htree header */
609 if (!start_hash && !start_minor_hash) {
610 de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
611 if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
612 goto errout;
613 count++;
614 }
615 if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
616 de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
617 de = ext3_next_entry(de);
618 if ((err = ext3_htree_store_dirent(dir_file, 2, 0, de)) != 0)
619 goto errout;
620 count++;
621 }
622
623 while (1) {
624 block = dx_get_block(frame->at);
625 ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
626 start_hash, start_minor_hash);
627 if (ret < 0) {
628 err = ret;
629 goto errout;
630 }
631 count += ret;
632 hashval = ~0;
633 ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
634 frame, frames, &hashval);
635 *next_hash = hashval;
636 if (ret < 0) {
637 err = ret;
638 goto errout;
639 }
640 /*
641 * Stop if: (a) there are no more entries, or
642 * (b) we have inserted at least one entry and the
643 * next hash value is not a continuation
644 */
645 if ((ret == 0) ||
646 (count && ((hashval & 1) == 0)))
647 break;
648 }
649 dx_release(frames);
650 dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
651 count, *next_hash));
652 return count;
653errout:
654 dx_release(frames);
655 return (err);
656}
657
658
659/*
660 * Directory block splitting, compacting
661 */
662
663static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
664 struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
665{
666 int count = 0;
667 char *base = (char *) de;
668 struct dx_hash_info h = *hinfo;
669
670 while ((char *) de < base + size)
671 {
672 if (de->name_len && de->inode) {
673 ext3fs_dirhash(de->name, de->name_len, &h);
674 map_tail--;
675 map_tail->hash = h.hash;
676 map_tail->offs = (u32) ((char *) de - base);
677 count++;
678 cond_resched();
679 }
680 /* XXX: do we need to check rec_len == 0 case? -Chris */
681 de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
682 }
683 return count;
684}
685
686static void dx_sort_map (struct dx_map_entry *map, unsigned count)
687{
688 struct dx_map_entry *p, *q, *top = map + count - 1;
689 int more;
690 /* Combsort until bubble sort doesn't suck */
691 while (count > 2)
692 {
693 count = count*10/13;
694 if (count - 9 < 2) /* 9, 10 -> 11 */
695 count = 11;
696 for (p = top, q = p - count; q >= map; p--, q--)
697 if (p->hash < q->hash)
698 swap(*p, *q);
699 }
700 /* Garden variety bubble sort */
701 do {
702 more = 0;
703 q = top;
704 while (q-- > map)
705 {
706 if (q[1].hash >= q[0].hash)
707 continue;
708 swap(*(q+1), *q);
709 more = 1;
710 }
711 } while(more);
712}
713
714static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
715{
716 struct dx_entry *entries = frame->entries;
717 struct dx_entry *old = frame->at, *new = old + 1;
718 int count = dx_get_count(entries);
719
720 assert(count < dx_get_limit(entries));
721 assert(old < entries + count);
722 memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
723 dx_set_hash(new, hash);
724 dx_set_block(new, block);
725 dx_set_count(entries, count + 1);
726}
727#endif
728
729
730static void ext3_update_dx_flag(struct inode *inode)
731{
732 if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
733 EXT3_FEATURE_COMPAT_DIR_INDEX))
734 EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
735}
736
737/*
738 * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
739 *
740 * `len <= EXT3_NAME_LEN' is guaranteed by caller.
741 * `de != NULL' is guaranteed by caller.
742 */
743static inline int ext3_match (int len, const char * const name,
744 struct ext3_dir_entry_2 * de)
745{
746 if (len != de->name_len)
747 return 0;
748 if (!de->inode)
749 return 0;
750 return !memcmp(name, de->name, len);
751}
752
753/*
754 * Returns 0 if not found, -1 on failure, and 1 on success
755 */
756static inline int search_dirblock(struct buffer_head * bh,
757 struct inode *dir,
758 struct dentry *dentry,
759 unsigned long offset,
760 struct ext3_dir_entry_2 ** res_dir)
761{
762 struct ext3_dir_entry_2 * de;
763 char * dlimit;
764 int de_len;
765 const char *name = dentry->d_name.name;
766 int namelen = dentry->d_name.len;
767
768 de = (struct ext3_dir_entry_2 *) bh->b_data;
769 dlimit = bh->b_data + dir->i_sb->s_blocksize;
770 while ((char *) de < dlimit) {
771 /* this code is executed quadratically often */
772 /* do minimal checking `by hand' */
773
774 if ((char *) de + namelen <= dlimit &&
775 ext3_match (namelen, name, de)) {
776 /* found a match - just to be sure, do a full check */
777 if (!ext3_check_dir_entry("ext3_find_entry",
778 dir, de, bh, offset))
779 return -1;
780 *res_dir = de;
781 return 1;
782 }
783 /* prevent looping on a bad block */
784 de_len = le16_to_cpu(de->rec_len);
785 if (de_len <= 0)
786 return -1;
787 offset += de_len;
788 de = (struct ext3_dir_entry_2 *) ((char *) de + de_len);
789 }
790 return 0;
791}
792
793
794/*
795 * ext3_find_entry()
796 *
797 * finds an entry in the specified directory with the wanted name. It
798 * returns the cache buffer in which the entry was found, and the entry
799 * itself (as a parameter - res_dir). It does NOT read the inode of the
800 * entry - you'll have to do that yourself if you want to.
801 *
802 * The returned buffer_head has ->b_count elevated. The caller is expected
803 * to brelse() it when appropriate.
804 */
805static struct buffer_head * ext3_find_entry (struct dentry *dentry,
806 struct ext3_dir_entry_2 ** res_dir)
807{
808 struct super_block * sb;
809 struct buffer_head * bh_use[NAMEI_RA_SIZE];
810 struct buffer_head * bh, *ret = NULL;
811 unsigned long start, block, b;
812 int ra_max = 0; /* Number of bh's in the readahead
813 buffer, bh_use[] */
814 int ra_ptr = 0; /* Current index into readahead
815 buffer */
816 int num = 0;
817 int nblocks, i, err;
818 struct inode *dir = dentry->d_parent->d_inode;
819 int namelen;
820 const u8 *name;
821 unsigned blocksize;
822
823 *res_dir = NULL;
824 sb = dir->i_sb;
825 blocksize = sb->s_blocksize;
826 namelen = dentry->d_name.len;
827 name = dentry->d_name.name;
828 if (namelen > EXT3_NAME_LEN)
829 return NULL;
830#ifdef CONFIG_EXT3_INDEX
831 if (is_dx(dir)) {
832 bh = ext3_dx_find_entry(dentry, res_dir, &err);
833 /*
834 * On success, or if the error was file not found,
835 * return. Otherwise, fall back to doing a search the
836 * old fashioned way.
837 */
838 if (bh || (err != ERR_BAD_DX_DIR))
839 return bh;
840 dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
841 }
842#endif
843 nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
844 start = EXT3_I(dir)->i_dir_start_lookup;
845 if (start >= nblocks)
846 start = 0;
847 block = start;
848restart:
849 do {
850 /*
851 * We deal with the read-ahead logic here.
852 */
853 if (ra_ptr >= ra_max) {
854 /* Refill the readahead buffer */
855 ra_ptr = 0;
856 b = block;
857 for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
858 /*
859 * Terminate if we reach the end of the
860 * directory and must wrap, or if our
861 * search has finished at this block.
862 */
863 if (b >= nblocks || (num && block == start)) {
864 bh_use[ra_max] = NULL;
865 break;
866 }
867 num++;
868 bh = ext3_getblk(NULL, dir, b++, 0, &err);
869 bh_use[ra_max] = bh;
870 if (bh)
871 ll_rw_block(READ, 1, &bh);
872 }
873 }
874 if ((bh = bh_use[ra_ptr++]) == NULL)
875 goto next;
876 wait_on_buffer(bh);
877 if (!buffer_uptodate(bh)) {
878 /* read error, skip block & hope for the best */
879 ext3_error(sb, __FUNCTION__, "reading directory #%lu "
880 "offset %lu", dir->i_ino, block);
881 brelse(bh);
882 goto next;
883 }
884 i = search_dirblock(bh, dir, dentry,
885 block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
886 if (i == 1) {
887 EXT3_I(dir)->i_dir_start_lookup = block;
888 ret = bh;
889 goto cleanup_and_exit;
890 } else {
891 brelse(bh);
892 if (i < 0)
893 goto cleanup_and_exit;
894 }
895 next:
896 if (++block >= nblocks)
897 block = 0;
898 } while (block != start);
899
900 /*
901 * If the directory has grown while we were searching, then
902 * search the last part of the directory before giving up.
903 */
904 block = nblocks;
905 nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
906 if (block < nblocks) {
907 start = 0;
908 goto restart;
909 }
910
911cleanup_and_exit:
912 /* Clean up the read-ahead blocks */
913 for (; ra_ptr < ra_max; ra_ptr++)
914 brelse (bh_use[ra_ptr]);
915 return ret;
916}
917
918#ifdef CONFIG_EXT3_INDEX
919static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
920 struct ext3_dir_entry_2 **res_dir, int *err)
921{
922 struct super_block * sb;
923 struct dx_hash_info hinfo;
924 u32 hash;
925 struct dx_frame frames[2], *frame;
926 struct ext3_dir_entry_2 *de, *top;
927 struct buffer_head *bh;
928 unsigned long block;
929 int retval;
930 int namelen = dentry->d_name.len;
931 const u8 *name = dentry->d_name.name;
932 struct inode *dir = dentry->d_parent->d_inode;
933
934 sb = dir->i_sb;
935 if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
936 return NULL;
937 hash = hinfo.hash;
938 do {
939 block = dx_get_block(frame->at);
940 if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
941 goto errout;
942 de = (struct ext3_dir_entry_2 *) bh->b_data;
943 top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
944 EXT3_DIR_REC_LEN(0));
945 for (; de < top; de = ext3_next_entry(de))
946 if (ext3_match (namelen, name, de)) {
947 if (!ext3_check_dir_entry("ext3_find_entry",
948 dir, de, bh,
949 (block<<EXT3_BLOCK_SIZE_BITS(sb))
950 +((char *)de - bh->b_data))) {
951 brelse (bh);
952 goto errout;
953 }
954 *res_dir = de;
955 dx_release (frames);
956 return bh;
957 }
958 brelse (bh);
959 /* Check to see if we should continue to search */
960 retval = ext3_htree_next_block(dir, hash, frame,
961 frames, NULL);
962 if (retval < 0) {
963 ext3_warning(sb, __FUNCTION__,
964 "error reading index page in directory #%lu",
965 dir->i_ino);
966 *err = retval;
967 goto errout;
968 }
969 } while (retval == 1);
970
971 *err = -ENOENT;
972errout:
973 dxtrace(printk("%s not found\n", name));
974 dx_release (frames);
975 return NULL;
976}
977#endif
978
979static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
980{
981 struct inode * inode;
982 struct ext3_dir_entry_2 * de;
983 struct buffer_head * bh;
984
985 if (dentry->d_name.len > EXT3_NAME_LEN)
986 return ERR_PTR(-ENAMETOOLONG);
987
988 bh = ext3_find_entry(dentry, &de);
989 inode = NULL;
990 if (bh) {
991 unsigned long ino = le32_to_cpu(de->inode);
992 brelse (bh);
993 inode = iget(dir->i_sb, ino);
994
995 if (!inode)
996 return ERR_PTR(-EACCES);
997 }
998 if (inode)
999 return d_splice_alias(inode, dentry);
1000 d_add(dentry, inode);
1001 return NULL;
1002}
1003
1004
1005struct dentry *ext3_get_parent(struct dentry *child)
1006{
1007 unsigned long ino;
1008 struct dentry *parent;
1009 struct inode *inode;
1010 struct dentry dotdot;
1011 struct ext3_dir_entry_2 * de;
1012 struct buffer_head *bh;
1013
1014 dotdot.d_name.name = "..";
1015 dotdot.d_name.len = 2;
1016 dotdot.d_parent = child; /* confusing, isn't it! */
1017
1018 bh = ext3_find_entry(&dotdot, &de);
1019 inode = NULL;
1020 if (!bh)
1021 return ERR_PTR(-ENOENT);
1022 ino = le32_to_cpu(de->inode);
1023 brelse(bh);
1024 inode = iget(child->d_inode->i_sb, ino);
1025
1026 if (!inode)
1027 return ERR_PTR(-EACCES);
1028
1029 parent = d_alloc_anon(inode);
1030 if (!parent) {
1031 iput(inode);
1032 parent = ERR_PTR(-ENOMEM);
1033 }
1034 return parent;
1035}
1036
1037#define S_SHIFT 12
1038static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = {
1039 [S_IFREG >> S_SHIFT] = EXT3_FT_REG_FILE,
1040 [S_IFDIR >> S_SHIFT] = EXT3_FT_DIR,
1041 [S_IFCHR >> S_SHIFT] = EXT3_FT_CHRDEV,
1042 [S_IFBLK >> S_SHIFT] = EXT3_FT_BLKDEV,
1043 [S_IFIFO >> S_SHIFT] = EXT3_FT_FIFO,
1044 [S_IFSOCK >> S_SHIFT] = EXT3_FT_SOCK,
1045 [S_IFLNK >> S_SHIFT] = EXT3_FT_SYMLINK,
1046};
1047
1048static inline void ext3_set_de_type(struct super_block *sb,
1049 struct ext3_dir_entry_2 *de,
1050 umode_t mode) {
1051 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE))
1052 de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1053}
1054
1055#ifdef CONFIG_EXT3_INDEX
1056static struct ext3_dir_entry_2 *
1057dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1058{
1059 unsigned rec_len = 0;
1060
1061 while (count--) {
1062 struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
1063 rec_len = EXT3_DIR_REC_LEN(de->name_len);
1064 memcpy (to, de, rec_len);
1065 ((struct ext3_dir_entry_2 *) to)->rec_len =
1066 cpu_to_le16(rec_len);
1067 de->inode = 0;
1068 map++;
1069 to += rec_len;
1070 }
1071 return (struct ext3_dir_entry_2 *) (to - rec_len);
1072}
1073
1074static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
1075{
1076 struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
1077 unsigned rec_len = 0;
1078
1079 prev = to = de;
1080 while ((char*)de < base + size) {
1081 next = (struct ext3_dir_entry_2 *) ((char *) de +
1082 le16_to_cpu(de->rec_len));
1083 if (de->inode && de->name_len) {
1084 rec_len = EXT3_DIR_REC_LEN(de->name_len);
1085 if (de > to)
1086 memmove(to, de, rec_len);
1087 to->rec_len = cpu_to_le16(rec_len);
1088 prev = to;
1089 to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
1090 }
1091 de = next;
1092 }
1093 return prev;
1094}
1095
1096static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1097 struct buffer_head **bh,struct dx_frame *frame,
1098 struct dx_hash_info *hinfo, int *error)
1099{
1100 unsigned blocksize = dir->i_sb->s_blocksize;
1101 unsigned count, continued;
1102 struct buffer_head *bh2;
1103 u32 newblock;
1104 u32 hash2;
1105 struct dx_map_entry *map;
1106 char *data1 = (*bh)->b_data, *data2;
1107 unsigned split;
1108 struct ext3_dir_entry_2 *de = NULL, *de2;
1109 int err;
1110
1111 bh2 = ext3_append (handle, dir, &newblock, error);
1112 if (!(bh2)) {
1113 brelse(*bh);
1114 *bh = NULL;
1115 goto errout;
1116 }
1117
1118 BUFFER_TRACE(*bh, "get_write_access");
1119 err = ext3_journal_get_write_access(handle, *bh);
1120 if (err) {
1121 journal_error:
1122 brelse(*bh);
1123 brelse(bh2);
1124 *bh = NULL;
1125 ext3_std_error(dir->i_sb, err);
1126 goto errout;
1127 }
1128 BUFFER_TRACE(frame->bh, "get_write_access");
1129 err = ext3_journal_get_write_access(handle, frame->bh);
1130 if (err)
1131 goto journal_error;
1132
1133 data2 = bh2->b_data;
1134
1135 /* create map in the end of data2 block */
1136 map = (struct dx_map_entry *) (data2 + blocksize);
1137 count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
1138 blocksize, hinfo, map);
1139 map -= count;
1140 split = count/2; // need to adjust to actual middle
1141 dx_sort_map (map, count);
1142 hash2 = map[split].hash;
1143 continued = hash2 == map[split - 1].hash;
1144 dxtrace(printk("Split block %i at %x, %i/%i\n",
1145 dx_get_block(frame->at), hash2, split, count-split));
1146
1147 /* Fancy dance to stay within two buffers */
1148 de2 = dx_move_dirents(data1, data2, map + split, count - split);
1149 de = dx_pack_dirents(data1,blocksize);
1150 de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
1151 de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
1152 dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
1153 dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
1154
1155 /* Which block gets the new entry? */
1156 if (hinfo->hash >= hash2)
1157 {
1158 swap(*bh, bh2);
1159 de = de2;
1160 }
1161 dx_insert_block (frame, hash2 + continued, newblock);
1162 err = ext3_journal_dirty_metadata (handle, bh2);
1163 if (err)
1164 goto journal_error;
1165 err = ext3_journal_dirty_metadata (handle, frame->bh);
1166 if (err)
1167 goto journal_error;
1168 brelse (bh2);
1169 dxtrace(dx_show_index ("frame", frame->entries));
1170errout:
1171 return de;
1172}
1173#endif
1174
1175
1176/*
1177 * Add a new entry into a directory (leaf) block. If de is non-NULL,
1178 * it points to a directory entry which is guaranteed to be large
1179 * enough for new directory entry. If de is NULL, then
1180 * add_dirent_to_buf will attempt search the directory block for
1181 * space. It will return -ENOSPC if no space is available, and -EIO
1182 * and -EEXIST if directory entry already exists.
1183 *
1184 * NOTE! bh is NOT released in the case where ENOSPC is returned. In
1185 * all other cases bh is released.
1186 */
1187static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1188 struct inode *inode, struct ext3_dir_entry_2 *de,
1189 struct buffer_head * bh)
1190{
1191 struct inode *dir = dentry->d_parent->d_inode;
1192 const char *name = dentry->d_name.name;
1193 int namelen = dentry->d_name.len;
1194 unsigned long offset = 0;
1195 unsigned short reclen;
1196 int nlen, rlen, err;
1197 char *top;
1198
1199 reclen = EXT3_DIR_REC_LEN(namelen);
1200 if (!de) {
1201 de = (struct ext3_dir_entry_2 *)bh->b_data;
1202 top = bh->b_data + dir->i_sb->s_blocksize - reclen;
1203 while ((char *) de <= top) {
1204 if (!ext3_check_dir_entry("ext3_add_entry", dir, de,
1205 bh, offset)) {
1206 brelse (bh);
1207 return -EIO;
1208 }
1209 if (ext3_match (namelen, name, de)) {
1210 brelse (bh);
1211 return -EEXIST;
1212 }
1213 nlen = EXT3_DIR_REC_LEN(de->name_len);
1214 rlen = le16_to_cpu(de->rec_len);
1215 if ((de->inode? rlen - nlen: rlen) >= reclen)
1216 break;
1217 de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
1218 offset += rlen;
1219 }
1220 if ((char *) de > top)
1221 return -ENOSPC;
1222 }
1223 BUFFER_TRACE(bh, "get_write_access");
1224 err = ext3_journal_get_write_access(handle, bh);
1225 if (err) {
1226 ext3_std_error(dir->i_sb, err);
1227 brelse(bh);
1228 return err;
1229 }
1230
1231 /* By now the buffer is marked for journaling */
1232 nlen = EXT3_DIR_REC_LEN(de->name_len);
1233 rlen = le16_to_cpu(de->rec_len);
1234 if (de->inode) {
1235 struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
1236 de1->rec_len = cpu_to_le16(rlen - nlen);
1237 de->rec_len = cpu_to_le16(nlen);
1238 de = de1;
1239 }
1240 de->file_type = EXT3_FT_UNKNOWN;
1241 if (inode) {
1242 de->inode = cpu_to_le32(inode->i_ino);
1243 ext3_set_de_type(dir->i_sb, de, inode->i_mode);
1244 } else
1245 de->inode = 0;
1246 de->name_len = namelen;
1247 memcpy (de->name, name, namelen);
1248 /*
1249 * XXX shouldn't update any times until successful
1250 * completion of syscall, but too many callers depend
1251 * on this.
1252 *
1253 * XXX similarly, too many callers depend on
1254 * ext3_new_inode() setting the times, but error
1255 * recovery deletes the inode, so the worst that can
1256 * happen is that the times are slightly out of date
1257 * and/or different from the directory change time.
1258 */
1259 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
1260 ext3_update_dx_flag(dir);
1261 dir->i_version++;
1262 ext3_mark_inode_dirty(handle, dir);
1263 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1264 err = ext3_journal_dirty_metadata(handle, bh);
1265 if (err)
1266 ext3_std_error(dir->i_sb, err);
1267 brelse(bh);
1268 return 0;
1269}
1270
1271#ifdef CONFIG_EXT3_INDEX
1272/*
1273 * This converts a one block unindexed directory to a 3 block indexed
1274 * directory, and adds the dentry to the indexed directory.
1275 */
1276static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1277 struct inode *inode, struct buffer_head *bh)
1278{
1279 struct inode *dir = dentry->d_parent->d_inode;
1280 const char *name = dentry->d_name.name;
1281 int namelen = dentry->d_name.len;
1282 struct buffer_head *bh2;
1283 struct dx_root *root;
1284 struct dx_frame frames[2], *frame;
1285 struct dx_entry *entries;
1286 struct ext3_dir_entry_2 *de, *de2;
1287 char *data1, *top;
1288 unsigned len;
1289 int retval;
1290 unsigned blocksize;
1291 struct dx_hash_info hinfo;
1292 u32 block;
1293 struct fake_dirent *fde;
1294
1295 blocksize = dir->i_sb->s_blocksize;
1296 dxtrace(printk("Creating index\n"));
1297 retval = ext3_journal_get_write_access(handle, bh);
1298 if (retval) {
1299 ext3_std_error(dir->i_sb, retval);
1300 brelse(bh);
1301 return retval;
1302 }
1303 root = (struct dx_root *) bh->b_data;
1304
1305 bh2 = ext3_append (handle, dir, &block, &retval);
1306 if (!(bh2)) {
1307 brelse(bh);
1308 return retval;
1309 }
1310 EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
1311 data1 = bh2->b_data;
1312
1313 /* The 0th block becomes the root, move the dirents out */
1314 fde = &root->dotdot;
1315 de = (struct ext3_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len));
1316 len = ((char *) root) + blocksize - (char *) de;
1317 memcpy (data1, de, len);
1318 de = (struct ext3_dir_entry_2 *) data1;
1319 top = data1 + len;
1320 while ((char *)(de2=(void*)de+le16_to_cpu(de->rec_len)) < top)
1321 de = de2;
1322 de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
1323 /* Initialize the root; the dot dirents already exist */
1324 de = (struct ext3_dir_entry_2 *) (&root->dotdot);
1325 de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2));
1326 memset (&root->info, 0, sizeof(root->info));
1327 root->info.info_length = sizeof(root->info);
1328 root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
1329 entries = root->entries;
1330 dx_set_block (entries, 1);
1331 dx_set_count (entries, 1);
1332 dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
1333
1334 /* Initialize as for dx_probe */
1335 hinfo.hash_version = root->info.hash_version;
1336 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
1337 ext3fs_dirhash(name, namelen, &hinfo);
1338 frame = frames;
1339 frame->entries = entries;
1340 frame->at = entries;
1341 frame->bh = bh;
1342 bh = bh2;
1343 de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1344 dx_release (frames);
1345 if (!(de))
1346 return retval;
1347
1348 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1349}
1350#endif
1351
1352/*
1353 * ext3_add_entry()
1354 *
1355 * adds a file entry to the specified directory, using the same
1356 * semantics as ext3_find_entry(). It returns NULL if it failed.
1357 *
1358 * NOTE!! The inode part of 'de' is left at 0 - which means you
1359 * may not sleep between calling this and putting something into
1360 * the entry, as someone else might have used it while you slept.
1361 */
1362static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
1363 struct inode *inode)
1364{
1365 struct inode *dir = dentry->d_parent->d_inode;
1366 unsigned long offset;
1367 struct buffer_head * bh;
1368 struct ext3_dir_entry_2 *de;
1369 struct super_block * sb;
1370 int retval;
1371#ifdef CONFIG_EXT3_INDEX
1372 int dx_fallback=0;
1373#endif
1374 unsigned blocksize;
1375 unsigned nlen, rlen;
1376 u32 block, blocks;
1377
1378 sb = dir->i_sb;
1379 blocksize = sb->s_blocksize;
1380 if (!dentry->d_name.len)
1381 return -EINVAL;
1382#ifdef CONFIG_EXT3_INDEX
1383 if (is_dx(dir)) {
1384 retval = ext3_dx_add_entry(handle, dentry, inode);
1385 if (!retval || (retval != ERR_BAD_DX_DIR))
1386 return retval;
1387 EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL;
1388 dx_fallback++;
1389 ext3_mark_inode_dirty(handle, dir);
1390 }
1391#endif
1392 blocks = dir->i_size >> sb->s_blocksize_bits;
1393 for (block = 0, offset = 0; block < blocks; block++) {
1394 bh = ext3_bread(handle, dir, block, 0, &retval);
1395 if(!bh)
1396 return retval;
1397 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1398 if (retval != -ENOSPC)
1399 return retval;
1400
1401#ifdef CONFIG_EXT3_INDEX
1402 if (blocks == 1 && !dx_fallback &&
1403 EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
1404 return make_indexed_dir(handle, dentry, inode, bh);
1405#endif
1406 brelse(bh);
1407 }
1408 bh = ext3_append(handle, dir, &block, &retval);
1409 if (!bh)
1410 return retval;
1411 de = (struct ext3_dir_entry_2 *) bh->b_data;
1412 de->inode = 0;
1413 de->rec_len = cpu_to_le16(rlen = blocksize);
1414 nlen = 0;
1415 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1416}
1417
1418#ifdef CONFIG_EXT3_INDEX
1419/*
1420 * Returns 0 for success, or a negative error value
1421 */
1422static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
1423 struct inode *inode)
1424{
1425 struct dx_frame frames[2], *frame;
1426 struct dx_entry *entries, *at;
1427 struct dx_hash_info hinfo;
1428 struct buffer_head * bh;
1429 struct inode *dir = dentry->d_parent->d_inode;
1430 struct super_block * sb = dir->i_sb;
1431 struct ext3_dir_entry_2 *de;
1432 int err;
1433
1434 frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
1435 if (!frame)
1436 return err;
1437 entries = frame->entries;
1438 at = frame->at;
1439
1440 if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
1441 goto cleanup;
1442
1443 BUFFER_TRACE(bh, "get_write_access");
1444 err = ext3_journal_get_write_access(handle, bh);
1445 if (err)
1446 goto journal_error;
1447
1448 err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1449 if (err != -ENOSPC) {
1450 bh = NULL;
1451 goto cleanup;
1452 }
1453
1454 /* Block full, should compress but for now just split */
1455 dxtrace(printk("using %u of %u node entries\n",
1456 dx_get_count(entries), dx_get_limit(entries)));
1457 /* Need to split index? */
1458 if (dx_get_count(entries) == dx_get_limit(entries)) {
1459 u32 newblock;
1460 unsigned icount = dx_get_count(entries);
1461 int levels = frame - frames;
1462 struct dx_entry *entries2;
1463 struct dx_node *node2;
1464 struct buffer_head *bh2;
1465
1466 if (levels && (dx_get_count(frames->entries) ==
1467 dx_get_limit(frames->entries))) {
1468 ext3_warning(sb, __FUNCTION__,
1469 "Directory index full!\n");
1470 err = -ENOSPC;
1471 goto cleanup;
1472 }
1473 bh2 = ext3_append (handle, dir, &newblock, &err);
1474 if (!(bh2))
1475 goto cleanup;
1476 node2 = (struct dx_node *)(bh2->b_data);
1477 entries2 = node2->entries;
1478 node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
1479 node2->fake.inode = 0;
1480 BUFFER_TRACE(frame->bh, "get_write_access");
1481 err = ext3_journal_get_write_access(handle, frame->bh);
1482 if (err)
1483 goto journal_error;
1484 if (levels) {
1485 unsigned icount1 = icount/2, icount2 = icount - icount1;
1486 unsigned hash2 = dx_get_hash(entries + icount1);
1487 dxtrace(printk("Split index %i/%i\n", icount1, icount2));
1488
1489 BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
1490 err = ext3_journal_get_write_access(handle,
1491 frames[0].bh);
1492 if (err)
1493 goto journal_error;
1494
1495 memcpy ((char *) entries2, (char *) (entries + icount1),
1496 icount2 * sizeof(struct dx_entry));
1497 dx_set_count (entries, icount1);
1498 dx_set_count (entries2, icount2);
1499 dx_set_limit (entries2, dx_node_limit(dir));
1500
1501 /* Which index block gets the new entry? */
1502 if (at - entries >= icount1) {
1503 frame->at = at = at - entries - icount1 + entries2;
1504 frame->entries = entries = entries2;
1505 swap(frame->bh, bh2);
1506 }
1507 dx_insert_block (frames + 0, hash2, newblock);
1508 dxtrace(dx_show_index ("node", frames[1].entries));
1509 dxtrace(dx_show_index ("node",
1510 ((struct dx_node *) bh2->b_data)->entries));
1511 err = ext3_journal_dirty_metadata(handle, bh2);
1512 if (err)
1513 goto journal_error;
1514 brelse (bh2);
1515 } else {
1516 dxtrace(printk("Creating second level index...\n"));
1517 memcpy((char *) entries2, (char *) entries,
1518 icount * sizeof(struct dx_entry));
1519 dx_set_limit(entries2, dx_node_limit(dir));
1520
1521 /* Set up root */
1522 dx_set_count(entries, 1);
1523 dx_set_block(entries + 0, newblock);
1524 ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
1525
1526 /* Add new access path frame */
1527 frame = frames + 1;
1528 frame->at = at = at - entries + entries2;
1529 frame->entries = entries = entries2;
1530 frame->bh = bh2;
1531 err = ext3_journal_get_write_access(handle,
1532 frame->bh);
1533 if (err)
1534 goto journal_error;
1535 }
1536 ext3_journal_dirty_metadata(handle, frames[0].bh);
1537 }
1538 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1539 if (!de)
1540 goto cleanup;
1541 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1542 bh = NULL;
1543 goto cleanup;
1544
1545journal_error:
1546 ext3_std_error(dir->i_sb, err);
1547cleanup:
1548 if (bh)
1549 brelse(bh);
1550 dx_release(frames);
1551 return err;
1552}
1553#endif
1554
1555/*
1556 * ext3_delete_entry deletes a directory entry by merging it with the
1557 * previous entry
1558 */
1559static int ext3_delete_entry (handle_t *handle,
1560 struct inode * dir,
1561 struct ext3_dir_entry_2 * de_del,
1562 struct buffer_head * bh)
1563{
1564 struct ext3_dir_entry_2 * de, * pde;
1565 int i;
1566
1567 i = 0;
1568 pde = NULL;
1569 de = (struct ext3_dir_entry_2 *) bh->b_data;
1570 while (i < bh->b_size) {
1571 if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
1572 return -EIO;
1573 if (de == de_del) {
1574 BUFFER_TRACE(bh, "get_write_access");
1575 ext3_journal_get_write_access(handle, bh);
1576 if (pde)
1577 pde->rec_len =
1578 cpu_to_le16(le16_to_cpu(pde->rec_len) +
1579 le16_to_cpu(de->rec_len));
1580 else
1581 de->inode = 0;
1582 dir->i_version++;
1583 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1584 ext3_journal_dirty_metadata(handle, bh);
1585 return 0;
1586 }
1587 i += le16_to_cpu(de->rec_len);
1588 pde = de;
1589 de = (struct ext3_dir_entry_2 *)
1590 ((char *) de + le16_to_cpu(de->rec_len));
1591 }
1592 return -ENOENT;
1593}
1594
1595/*
1596 * ext3_mark_inode_dirty is somewhat expensive, so unlike ext2 we
1597 * do not perform it in these functions. We perform it at the call site,
1598 * if it is needed.
1599 */
1600static inline void ext3_inc_count(handle_t *handle, struct inode *inode)
1601{
1602 inode->i_nlink++;
1603}
1604
1605static inline void ext3_dec_count(handle_t *handle, struct inode *inode)
1606{
1607 inode->i_nlink--;
1608}
1609
1610static int ext3_add_nondir(handle_t *handle,
1611 struct dentry *dentry, struct inode *inode)
1612{
1613 int err = ext3_add_entry(handle, dentry, inode);
1614 if (!err) {
1615 ext3_mark_inode_dirty(handle, inode);
1616 d_instantiate(dentry, inode);
1617 return 0;
1618 }
1619 ext3_dec_count(handle, inode);
1620 iput(inode);
1621 return err;
1622}
1623
1624/*
1625 * By the time this is called, we already have created
1626 * the directory cache entry for the new file, but it
1627 * is so far negative - it has no inode.
1628 *
1629 * If the create succeeds, we fill in the inode information
1630 * with d_instantiate().
1631 */
1632static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
1633 struct nameidata *nd)
1634{
1635 handle_t *handle;
1636 struct inode * inode;
1637 int err, retries = 0;
1638
1639retry:
1640 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
1641 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1642 2*EXT3_QUOTA_INIT_BLOCKS);
1643 if (IS_ERR(handle))
1644 return PTR_ERR(handle);
1645
1646 if (IS_DIRSYNC(dir))
1647 handle->h_sync = 1;
1648
1649 inode = ext3_new_inode (handle, dir, mode);
1650 err = PTR_ERR(inode);
1651 if (!IS_ERR(inode)) {
1652 inode->i_op = &ext3_file_inode_operations;
1653 inode->i_fop = &ext3_file_operations;
1654 ext3_set_aops(inode);
1655 err = ext3_add_nondir(handle, dentry, inode);
1656 }
1657 ext3_journal_stop(handle);
1658 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1659 goto retry;
1660 return err;
1661}
1662
1663static int ext3_mknod (struct inode * dir, struct dentry *dentry,
1664 int mode, dev_t rdev)
1665{
1666 handle_t *handle;
1667 struct inode *inode;
1668 int err, retries = 0;
1669
1670 if (!new_valid_dev(rdev))
1671 return -EINVAL;
1672
1673retry:
1674 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
1675 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1676 2*EXT3_QUOTA_INIT_BLOCKS);
1677 if (IS_ERR(handle))
1678 return PTR_ERR(handle);
1679
1680 if (IS_DIRSYNC(dir))
1681 handle->h_sync = 1;
1682
1683 inode = ext3_new_inode (handle, dir, mode);
1684 err = PTR_ERR(inode);
1685 if (!IS_ERR(inode)) {
1686 init_special_inode(inode, inode->i_mode, rdev);
1687#ifdef CONFIG_EXT3_FS_XATTR
1688 inode->i_op = &ext3_special_inode_operations;
1689#endif
1690 err = ext3_add_nondir(handle, dentry, inode);
1691 }
1692 ext3_journal_stop(handle);
1693 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1694 goto retry;
1695 return err;
1696}
1697
1698static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
1699{
1700 handle_t *handle;
1701 struct inode * inode;
1702 struct buffer_head * dir_block;
1703 struct ext3_dir_entry_2 * de;
1704 int err, retries = 0;
1705
1706 if (dir->i_nlink >= EXT3_LINK_MAX)
1707 return -EMLINK;
1708
1709retry:
1710 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
1711 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1712 2*EXT3_QUOTA_INIT_BLOCKS);
1713 if (IS_ERR(handle))
1714 return PTR_ERR(handle);
1715
1716 if (IS_DIRSYNC(dir))
1717 handle->h_sync = 1;
1718
1719 inode = ext3_new_inode (handle, dir, S_IFDIR | mode);
1720 err = PTR_ERR(inode);
1721 if (IS_ERR(inode))
1722 goto out_stop;
1723
1724 inode->i_op = &ext3_dir_inode_operations;
1725 inode->i_fop = &ext3_dir_operations;
1726 inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1727 dir_block = ext3_bread (handle, inode, 0, 1, &err);
1728 if (!dir_block) {
1729 inode->i_nlink--; /* is this nlink == 0? */
1730 ext3_mark_inode_dirty(handle, inode);
1731 iput (inode);
1732 goto out_stop;
1733 }
1734 BUFFER_TRACE(dir_block, "get_write_access");
1735 ext3_journal_get_write_access(handle, dir_block);
1736 de = (struct ext3_dir_entry_2 *) dir_block->b_data;
1737 de->inode = cpu_to_le32(inode->i_ino);
1738 de->name_len = 1;
1739 de->rec_len = cpu_to_le16(EXT3_DIR_REC_LEN(de->name_len));
1740 strcpy (de->name, ".");
1741 ext3_set_de_type(dir->i_sb, de, S_IFDIR);
1742 de = (struct ext3_dir_entry_2 *)
1743 ((char *) de + le16_to_cpu(de->rec_len));
1744 de->inode = cpu_to_le32(dir->i_ino);
1745 de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT3_DIR_REC_LEN(1));
1746 de->name_len = 2;
1747 strcpy (de->name, "..");
1748 ext3_set_de_type(dir->i_sb, de, S_IFDIR);
1749 inode->i_nlink = 2;
1750 BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
1751 ext3_journal_dirty_metadata(handle, dir_block);
1752 brelse (dir_block);
1753 ext3_mark_inode_dirty(handle, inode);
1754 err = ext3_add_entry (handle, dentry, inode);
1755 if (err) {
1756 inode->i_nlink = 0;
1757 ext3_mark_inode_dirty(handle, inode);
1758 iput (inode);
1759 goto out_stop;
1760 }
1761 dir->i_nlink++;
1762 ext3_update_dx_flag(dir);
1763 ext3_mark_inode_dirty(handle, dir);
1764 d_instantiate(dentry, inode);
1765out_stop:
1766 ext3_journal_stop(handle);
1767 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1768 goto retry;
1769 return err;
1770}
1771
1772/*
1773 * routine to check that the specified directory is empty (for rmdir)
1774 */
1775static int empty_dir (struct inode * inode)
1776{
1777 unsigned long offset;
1778 struct buffer_head * bh;
1779 struct ext3_dir_entry_2 * de, * de1;
1780 struct super_block * sb;
1781 int err = 0;
1782
1783 sb = inode->i_sb;
1784 if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) ||
1785 !(bh = ext3_bread (NULL, inode, 0, 0, &err))) {
1786 if (err)
1787 ext3_error(inode->i_sb, __FUNCTION__,
1788 "error %d reading directory #%lu offset 0",
1789 err, inode->i_ino);
1790 else
1791 ext3_warning(inode->i_sb, __FUNCTION__,
1792 "bad directory (dir #%lu) - no data block",
1793 inode->i_ino);
1794 return 1;
1795 }
1796 de = (struct ext3_dir_entry_2 *) bh->b_data;
1797 de1 = (struct ext3_dir_entry_2 *)
1798 ((char *) de + le16_to_cpu(de->rec_len));
1799 if (le32_to_cpu(de->inode) != inode->i_ino ||
1800 !le32_to_cpu(de1->inode) ||
1801 strcmp (".", de->name) ||
1802 strcmp ("..", de1->name)) {
1803 ext3_warning (inode->i_sb, "empty_dir",
1804 "bad directory (dir #%lu) - no `.' or `..'",
1805 inode->i_ino);
1806 brelse (bh);
1807 return 1;
1808 }
1809 offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
1810 de = (struct ext3_dir_entry_2 *)
1811 ((char *) de1 + le16_to_cpu(de1->rec_len));
1812 while (offset < inode->i_size ) {
1813 if (!bh ||
1814 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1815 err = 0;
1816 brelse (bh);
1817 bh = ext3_bread (NULL, inode,
1818 offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err);
1819 if (!bh) {
1820 if (err)
1821 ext3_error(sb, __FUNCTION__,
1822 "error %d reading directory"
1823 " #%lu offset %lu",
1824 err, inode->i_ino, offset);
1825 offset += sb->s_blocksize;
1826 continue;
1827 }
1828 de = (struct ext3_dir_entry_2 *) bh->b_data;
1829 }
1830 if (!ext3_check_dir_entry("empty_dir", inode, de, bh, offset)) {
1831 de = (struct ext3_dir_entry_2 *)(bh->b_data +
1832 sb->s_blocksize);
1833 offset = (offset | (sb->s_blocksize - 1)) + 1;
1834 continue;
1835 }
1836 if (le32_to_cpu(de->inode)) {
1837 brelse (bh);
1838 return 0;
1839 }
1840 offset += le16_to_cpu(de->rec_len);
1841 de = (struct ext3_dir_entry_2 *)
1842 ((char *) de + le16_to_cpu(de->rec_len));
1843 }
1844 brelse (bh);
1845 return 1;
1846}
1847
1848/* ext3_orphan_add() links an unlinked or truncated inode into a list of
1849 * such inodes, starting at the superblock, in case we crash before the
1850 * file is closed/deleted, or in case the inode truncate spans multiple
1851 * transactions and the last transaction is not recovered after a crash.
1852 *
1853 * At filesystem recovery time, we walk this list deleting unlinked
1854 * inodes and truncating linked inodes in ext3_orphan_cleanup().
1855 */
1856int ext3_orphan_add(handle_t *handle, struct inode *inode)
1857{
1858 struct super_block *sb = inode->i_sb;
1859 struct ext3_iloc iloc;
1860 int err = 0, rc;
1861
1862 lock_super(sb);
1863 if (!list_empty(&EXT3_I(inode)->i_orphan))
1864 goto out_unlock;
1865
1866 /* Orphan handling is only valid for files with data blocks
1867 * being truncated, or files being unlinked. */
1868
1869 /* @@@ FIXME: Observation from aviro:
1870 * I think I can trigger J_ASSERT in ext3_orphan_add(). We block
1871 * here (on lock_super()), so race with ext3_link() which might bump
1872 * ->i_nlink. For, say it, character device. Not a regular file,
1873 * not a directory, not a symlink and ->i_nlink > 0.
1874 */
1875 J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1876 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
1877
1878 BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
1879 err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
1880 if (err)
1881 goto out_unlock;
1882
1883 err = ext3_reserve_inode_write(handle, inode, &iloc);
1884 if (err)
1885 goto out_unlock;
1886
1887 /* Insert this inode at the head of the on-disk orphan list... */
1888 NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan);
1889 EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
1890 err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
1891 rc = ext3_mark_iloc_dirty(handle, inode, &iloc);
1892 if (!err)
1893 err = rc;
1894
1895 /* Only add to the head of the in-memory list if all the
1896 * previous operations succeeded. If the orphan_add is going to
1897 * fail (possibly taking the journal offline), we can't risk
1898 * leaving the inode on the orphan list: stray orphan-list
1899 * entries can cause panics at unmount time.
1900 *
1901 * This is safe: on error we're going to ignore the orphan list
1902 * anyway on the next recovery. */
1903 if (!err)
1904 list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
1905
1906 jbd_debug(4, "superblock will point to %ld\n", inode->i_ino);
1907 jbd_debug(4, "orphan inode %ld will point to %d\n",
1908 inode->i_ino, NEXT_ORPHAN(inode));
1909out_unlock:
1910 unlock_super(sb);
1911 ext3_std_error(inode->i_sb, err);
1912 return err;
1913}
1914
1915/*
1916 * ext3_orphan_del() removes an unlinked or truncated inode from the list
1917 * of such inodes stored on disk, because it is finally being cleaned up.
1918 */
1919int ext3_orphan_del(handle_t *handle, struct inode *inode)
1920{
1921 struct list_head *prev;
1922 struct ext3_inode_info *ei = EXT3_I(inode);
1923 struct ext3_sb_info *sbi;
1924 unsigned long ino_next;
1925 struct ext3_iloc iloc;
1926 int err = 0;
1927
1928 lock_super(inode->i_sb);
1929 if (list_empty(&ei->i_orphan)) {
1930 unlock_super(inode->i_sb);
1931 return 0;
1932 }
1933
1934 ino_next = NEXT_ORPHAN(inode);
1935 prev = ei->i_orphan.prev;
1936 sbi = EXT3_SB(inode->i_sb);
1937
1938 jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
1939
1940 list_del_init(&ei->i_orphan);
1941
1942 /* If we're on an error path, we may not have a valid
1943 * transaction handle with which to update the orphan list on
1944 * disk, but we still need to remove the inode from the linked
1945 * list in memory. */
1946 if (!handle)
1947 goto out;
1948
1949 err = ext3_reserve_inode_write(handle, inode, &iloc);
1950 if (err)
1951 goto out_err;
1952
1953 if (prev == &sbi->s_orphan) {
1954 jbd_debug(4, "superblock will point to %lu\n", ino_next);
1955 BUFFER_TRACE(sbi->s_sbh, "get_write_access");
1956 err = ext3_journal_get_write_access(handle, sbi->s_sbh);
1957 if (err)
1958 goto out_brelse;
1959 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
1960 err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
1961 } else {
1962 struct ext3_iloc iloc2;
1963 struct inode *i_prev =
1964 &list_entry(prev, struct ext3_inode_info, i_orphan)->vfs_inode;
1965
1966 jbd_debug(4, "orphan inode %lu will point to %lu\n",
1967 i_prev->i_ino, ino_next);
1968 err = ext3_reserve_inode_write(handle, i_prev, &iloc2);
1969 if (err)
1970 goto out_brelse;
1971 NEXT_ORPHAN(i_prev) = ino_next;
1972 err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2);
1973 }
1974 if (err)
1975 goto out_brelse;
1976 NEXT_ORPHAN(inode) = 0;
1977 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
1978
1979out_err:
1980 ext3_std_error(inode->i_sb, err);
1981out:
1982 unlock_super(inode->i_sb);
1983 return err;
1984
1985out_brelse:
1986 brelse(iloc.bh);
1987 goto out_err;
1988}
1989
1990static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
1991{
1992 int retval;
1993 struct inode * inode;
1994 struct buffer_head * bh;
1995 struct ext3_dir_entry_2 * de;
1996 handle_t *handle;
1997
1998 /* Initialize quotas before so that eventual writes go in
1999 * separate transaction */
2000 DQUOT_INIT(dentry->d_inode);
2001 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
2002 if (IS_ERR(handle))
2003 return PTR_ERR(handle);
2004
2005 retval = -ENOENT;
2006 bh = ext3_find_entry (dentry, &de);
2007 if (!bh)
2008 goto end_rmdir;
2009
2010 if (IS_DIRSYNC(dir))
2011 handle->h_sync = 1;
2012
2013 inode = dentry->d_inode;
2014
2015 retval = -EIO;
2016 if (le32_to_cpu(de->inode) != inode->i_ino)
2017 goto end_rmdir;
2018
2019 retval = -ENOTEMPTY;
2020 if (!empty_dir (inode))
2021 goto end_rmdir;
2022
2023 retval = ext3_delete_entry(handle, dir, de, bh);
2024 if (retval)
2025 goto end_rmdir;
2026 if (inode->i_nlink != 2)
2027 ext3_warning (inode->i_sb, "ext3_rmdir",
2028 "empty directory has nlink!=2 (%d)",
2029 inode->i_nlink);
2030 inode->i_version++;
2031 inode->i_nlink = 0;
2032 /* There's no need to set i_disksize: the fact that i_nlink is
2033 * zero will ensure that the right thing happens during any
2034 * recovery. */
2035 inode->i_size = 0;
2036 ext3_orphan_add(handle, inode);
2037 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
2038 ext3_mark_inode_dirty(handle, inode);
2039 dir->i_nlink--;
2040 ext3_update_dx_flag(dir);
2041 ext3_mark_inode_dirty(handle, dir);
2042
2043end_rmdir:
2044 ext3_journal_stop(handle);
2045 brelse (bh);
2046 return retval;
2047}
2048
2049static int ext3_unlink(struct inode * dir, struct dentry *dentry)
2050{
2051 int retval;
2052 struct inode * inode;
2053 struct buffer_head * bh;
2054 struct ext3_dir_entry_2 * de;
2055 handle_t *handle;
2056
2057 /* Initialize quotas before so that eventual writes go
2058 * in separate transaction */
2059 DQUOT_INIT(dentry->d_inode);
2060 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
2061 if (IS_ERR(handle))
2062 return PTR_ERR(handle);
2063
2064 if (IS_DIRSYNC(dir))
2065 handle->h_sync = 1;
2066
2067 retval = -ENOENT;
2068 bh = ext3_find_entry (dentry, &de);
2069 if (!bh)
2070 goto end_unlink;
2071
2072 inode = dentry->d_inode;
2073
2074 retval = -EIO;
2075 if (le32_to_cpu(de->inode) != inode->i_ino)
2076 goto end_unlink;
2077
2078 if (!inode->i_nlink) {
2079 ext3_warning (inode->i_sb, "ext3_unlink",
2080 "Deleting nonexistent file (%lu), %d",
2081 inode->i_ino, inode->i_nlink);
2082 inode->i_nlink = 1;
2083 }
2084 retval = ext3_delete_entry(handle, dir, de, bh);
2085 if (retval)
2086 goto end_unlink;
2087 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
2088 ext3_update_dx_flag(dir);
2089 ext3_mark_inode_dirty(handle, dir);
2090 inode->i_nlink--;
2091 if (!inode->i_nlink)
2092 ext3_orphan_add(handle, inode);
2093 inode->i_ctime = dir->i_ctime;
2094 ext3_mark_inode_dirty(handle, inode);
2095 retval = 0;
2096
2097end_unlink:
2098 ext3_journal_stop(handle);
2099 brelse (bh);
2100 return retval;
2101}
2102
2103static int ext3_symlink (struct inode * dir,
2104 struct dentry *dentry, const char * symname)
2105{
2106 handle_t *handle;
2107 struct inode * inode;
2108 int l, err, retries = 0;
2109
2110 l = strlen(symname)+1;
2111 if (l > dir->i_sb->s_blocksize)
2112 return -ENAMETOOLONG;
2113
2114retry:
2115 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
2116 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2117 2*EXT3_QUOTA_INIT_BLOCKS);
2118 if (IS_ERR(handle))
2119 return PTR_ERR(handle);
2120
2121 if (IS_DIRSYNC(dir))
2122 handle->h_sync = 1;
2123
2124 inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
2125 err = PTR_ERR(inode);
2126 if (IS_ERR(inode))
2127 goto out_stop;
2128
2129 if (l > sizeof (EXT3_I(inode)->i_data)) {
2130 inode->i_op = &ext3_symlink_inode_operations;
2131 ext3_set_aops(inode);
2132 /*
2133 * page_symlink() calls into ext3_prepare/commit_write.
2134 * We have a transaction open. All is sweetness. It also sets
2135 * i_size in generic_commit_write().
2136 */
2137 err = page_symlink(inode, symname, l);
2138 if (err) {
2139 ext3_dec_count(handle, inode);
2140 ext3_mark_inode_dirty(handle, inode);
2141 iput (inode);
2142 goto out_stop;
2143 }
2144 } else {
2145 inode->i_op = &ext3_fast_symlink_inode_operations;
2146 memcpy((char*)&EXT3_I(inode)->i_data,symname,l);
2147 inode->i_size = l-1;
2148 }
2149 EXT3_I(inode)->i_disksize = inode->i_size;
2150 err = ext3_add_nondir(handle, dentry, inode);
2151out_stop:
2152 ext3_journal_stop(handle);
2153 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
2154 goto retry;
2155 return err;
2156}
2157
2158static int ext3_link (struct dentry * old_dentry,
2159 struct inode * dir, struct dentry *dentry)
2160{
2161 handle_t *handle;
2162 struct inode *inode = old_dentry->d_inode;
2163 int err, retries = 0;
2164
2165 if (inode->i_nlink >= EXT3_LINK_MAX)
2166 return -EMLINK;
2167
2168retry:
2169 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
2170 EXT3_INDEX_EXTRA_TRANS_BLOCKS);
2171 if (IS_ERR(handle))
2172 return PTR_ERR(handle);
2173
2174 if (IS_DIRSYNC(dir))
2175 handle->h_sync = 1;
2176
2177 inode->i_ctime = CURRENT_TIME_SEC;
2178 ext3_inc_count(handle, inode);
2179 atomic_inc(&inode->i_count);
2180
2181 err = ext3_add_nondir(handle, dentry, inode);
2182 ext3_journal_stop(handle);
2183 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
2184 goto retry;
2185 return err;
2186}
2187
/*
 * PARENT_INO() yields the inode field of the second directory entry in a
 * directory block: it skips rec_len bytes past the first entry at @buffer.
 * The result is an lvalue, so it can be read or assigned.  @buffer is
 * evaluated twice.
 */
#define PARENT_INO(buffer) \
	((struct ext3_dir_entry_2 *) ((char *) (buffer) + \
	le16_to_cpu(((struct ext3_dir_entry_2 *) (buffer))->rec_len)))->inode
2191
2192/*
2193 * Anybody can rename anything with this: the permission checks are left to the
2194 * higher-level routines.
2195 */
2196static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2197 struct inode * new_dir,struct dentry *new_dentry)
2198{
2199 handle_t *handle;
2200 struct inode * old_inode, * new_inode;
2201 struct buffer_head * old_bh, * new_bh, * dir_bh;
2202 struct ext3_dir_entry_2 * old_de, * new_de;
2203 int retval;
2204
2205 old_bh = new_bh = dir_bh = NULL;
2206
2207 /* Initialize quotas before so that eventual writes go
2208 * in separate transaction */
2209 if (new_dentry->d_inode)
2210 DQUOT_INIT(new_dentry->d_inode);
2211 handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS +
2212 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
2213 if (IS_ERR(handle))
2214 return PTR_ERR(handle);
2215
2216 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2217 handle->h_sync = 1;
2218
2219 old_bh = ext3_find_entry (old_dentry, &old_de);
2220 /*
2221 * Check for inode number is _not_ due to possible IO errors.
2222 * We might rmdir the source, keep it as pwd of some process
2223 * and merrily kill the link to whatever was created under the
2224 * same name. Goodbye sticky bit ;-<
2225 */
2226 old_inode = old_dentry->d_inode;
2227 retval = -ENOENT;
2228 if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
2229 goto end_rename;
2230
2231 new_inode = new_dentry->d_inode;
2232 new_bh = ext3_find_entry (new_dentry, &new_de);
2233 if (new_bh) {
2234 if (!new_inode) {
2235 brelse (new_bh);
2236 new_bh = NULL;
2237 }
2238 }
2239 if (S_ISDIR(old_inode->i_mode)) {
2240 if (new_inode) {
2241 retval = -ENOTEMPTY;
2242 if (!empty_dir (new_inode))
2243 goto end_rename;
2244 }
2245 retval = -EIO;
2246 dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval);
2247 if (!dir_bh)
2248 goto end_rename;
2249 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
2250 goto end_rename;
2251 retval = -EMLINK;
2252 if (!new_inode && new_dir!=old_dir &&
2253 new_dir->i_nlink >= EXT3_LINK_MAX)
2254 goto end_rename;
2255 }
2256 if (!new_bh) {
2257 retval = ext3_add_entry (handle, new_dentry, old_inode);
2258 if (retval)
2259 goto end_rename;
2260 } else {
2261 BUFFER_TRACE(new_bh, "get write access");
2262 ext3_journal_get_write_access(handle, new_bh);
2263 new_de->inode = cpu_to_le32(old_inode->i_ino);
2264 if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2265 EXT3_FEATURE_INCOMPAT_FILETYPE))
2266 new_de->file_type = old_de->file_type;
2267 new_dir->i_version++;
2268 BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
2269 ext3_journal_dirty_metadata(handle, new_bh);
2270 brelse(new_bh);
2271 new_bh = NULL;
2272 }
2273
2274 /*
2275 * Like most other Unix systems, set the ctime for inodes on a
2276 * rename.
2277 */
2278 old_inode->i_ctime = CURRENT_TIME_SEC;
2279 ext3_mark_inode_dirty(handle, old_inode);
2280
2281 /*
2282 * ok, that's it
2283 */
2284 if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
2285 old_de->name_len != old_dentry->d_name.len ||
2286 strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
2287 (retval = ext3_delete_entry(handle, old_dir,
2288 old_de, old_bh)) == -ENOENT) {
2289 /* old_de could have moved from under us during htree split, so
2290 * make sure that we are deleting the right entry. We might
2291 * also be pointing to a stale entry in the unused part of
2292 * old_bh so just checking inum and the name isn't enough. */
2293 struct buffer_head *old_bh2;
2294 struct ext3_dir_entry_2 *old_de2;
2295
2296 old_bh2 = ext3_find_entry(old_dentry, &old_de2);
2297 if (old_bh2) {
2298 retval = ext3_delete_entry(handle, old_dir,
2299 old_de2, old_bh2);
2300 brelse(old_bh2);
2301 }
2302 }
2303 if (retval) {
2304 ext3_warning(old_dir->i_sb, "ext3_rename",
2305 "Deleting old file (%lu), %d, error=%d",
2306 old_dir->i_ino, old_dir->i_nlink, retval);
2307 }
2308
2309 if (new_inode) {
2310 new_inode->i_nlink--;
2311 new_inode->i_ctime = CURRENT_TIME_SEC;
2312 }
2313 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
2314 ext3_update_dx_flag(old_dir);
2315 if (dir_bh) {
2316 BUFFER_TRACE(dir_bh, "get_write_access");
2317 ext3_journal_get_write_access(handle, dir_bh);
2318 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2319 BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
2320 ext3_journal_dirty_metadata(handle, dir_bh);
2321 old_dir->i_nlink--;
2322 if (new_inode) {
2323 new_inode->i_nlink--;
2324 } else {
2325 new_dir->i_nlink++;
2326 ext3_update_dx_flag(new_dir);
2327 ext3_mark_inode_dirty(handle, new_dir);
2328 }
2329 }
2330 ext3_mark_inode_dirty(handle, old_dir);
2331 if (new_inode) {
2332 ext3_mark_inode_dirty(handle, new_inode);
2333 if (!new_inode->i_nlink)
2334 ext3_orphan_add(handle, new_inode);
2335 }
2336 retval = 0;
2337
2338end_rename:
2339 brelse (dir_bh);
2340 brelse (old_bh);
2341 brelse (new_bh);
2342 ext3_journal_stop(handle);
2343 return retval;
2344}
2345
2346/*
2347 * directories can handle most operations...
2348 */
2349struct inode_operations ext3_dir_inode_operations = {
2350 .create = ext3_create,
2351 .lookup = ext3_lookup,
2352 .link = ext3_link,
2353 .unlink = ext3_unlink,
2354 .symlink = ext3_symlink,
2355 .mkdir = ext3_mkdir,
2356 .rmdir = ext3_rmdir,
2357 .mknod = ext3_mknod,
2358 .rename = ext3_rename,
2359 .setattr = ext3_setattr,
2360#ifdef CONFIG_EXT3_FS_XATTR
2361 .setxattr = generic_setxattr,
2362 .getxattr = generic_getxattr,
2363 .listxattr = ext3_listxattr,
2364 .removexattr = generic_removexattr,
2365#endif
2366 .permission = ext3_permission,
2367};
2368
2369struct inode_operations ext3_special_inode_operations = {
2370 .setattr = ext3_setattr,
2371#ifdef CONFIG_EXT3_FS_XATTR
2372 .setxattr = generic_setxattr,
2373 .getxattr = generic_getxattr,
2374 .listxattr = ext3_listxattr,
2375 .removexattr = generic_removexattr,
2376#endif
2377 .permission = ext3_permission,
2378};
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
new file mode 100644
index 000000000000..2c9f81278d5d
--- /dev/null
+++ b/fs/ext3/resize.c
@@ -0,0 +1,996 @@
1/*
2 * linux/fs/ext3/resize.c
3 *
4 * Support for resizing an ext3 filesystem while it is mounted.
5 *
6 * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com>
7 *
8 * This could probably be made into a module, because it is not often in use.
9 */
10
11#include <linux/config.h>
12
13#define EXT3FS_DEBUG
14
15#include <linux/sched.h>
16#include <linux/smp_lock.h>
17#include <linux/ext3_jbd.h>
18
19#include <linux/errno.h>
20#include <linux/slab.h>
21
22
/*
 * Half-open range tests on [first, last): inside() is true when
 * first <= b < last, outside() is its complement.  Arguments may be
 * evaluated more than once.
 */
#define outside(b, first, last)	((first) > (b) || (last) <= (b))
#define inside(b, first, last)	((first) <= (b) && (last) > (b))
25
26static int verify_group_input(struct super_block *sb,
27 struct ext3_new_group_data *input)
28{
29 struct ext3_sb_info *sbi = EXT3_SB(sb);
30 struct ext3_super_block *es = sbi->s_es;
31 unsigned start = le32_to_cpu(es->s_blocks_count);
32 unsigned end = start + input->blocks_count;
33 unsigned group = input->group;
34 unsigned itend = input->inode_table + EXT3_SB(sb)->s_itb_per_group;
35 unsigned overhead = ext3_bg_has_super(sb, group) ?
36 (1 + ext3_bg_num_gdb(sb, group) +
37 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
38 unsigned metaend = start + overhead;
39 struct buffer_head *bh = NULL;
40 int free_blocks_count;
41 int err = -EINVAL;
42
43 input->free_blocks_count = free_blocks_count =
44 input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
45
46 if (test_opt(sb, DEBUG))
47 printk(KERN_DEBUG "EXT3-fs: adding %s group %u: %u blocks "
48 "(%d free, %u reserved)\n",
49 ext3_bg_has_super(sb, input->group) ? "normal" :
50 "no-super", input->group, input->blocks_count,
51 free_blocks_count, input->reserved_blocks);
52
53 if (group != sbi->s_groups_count)
54 ext3_warning(sb, __FUNCTION__,
55 "Cannot add at group %u (only %lu groups)",
56 input->group, sbi->s_groups_count);
57 else if ((start - le32_to_cpu(es->s_first_data_block)) %
58 EXT3_BLOCKS_PER_GROUP(sb))
59 ext3_warning(sb, __FUNCTION__, "Last group not full");
60 else if (input->reserved_blocks > input->blocks_count / 5)
61 ext3_warning(sb, __FUNCTION__, "Reserved blocks too high (%u)",
62 input->reserved_blocks);
63 else if (free_blocks_count < 0)
64 ext3_warning(sb, __FUNCTION__, "Bad blocks count %u",
65 input->blocks_count);
66 else if (!(bh = sb_bread(sb, end - 1)))
67 ext3_warning(sb, __FUNCTION__, "Cannot read last block (%u)",
68 end - 1);
69 else if (outside(input->block_bitmap, start, end))
70 ext3_warning(sb, __FUNCTION__,
71 "Block bitmap not in group (block %u)",
72 input->block_bitmap);
73 else if (outside(input->inode_bitmap, start, end))
74 ext3_warning(sb, __FUNCTION__,
75 "Inode bitmap not in group (block %u)",
76 input->inode_bitmap);
77 else if (outside(input->inode_table, start, end) ||
78 outside(itend - 1, start, end))
79 ext3_warning(sb, __FUNCTION__,
80 "Inode table not in group (blocks %u-%u)",
81 input->inode_table, itend - 1);
82 else if (input->inode_bitmap == input->block_bitmap)
83 ext3_warning(sb, __FUNCTION__,
84 "Block bitmap same as inode bitmap (%u)",
85 input->block_bitmap);
86 else if (inside(input->block_bitmap, input->inode_table, itend))
87 ext3_warning(sb, __FUNCTION__,
88 "Block bitmap (%u) in inode table (%u-%u)",
89 input->block_bitmap, input->inode_table, itend-1);
90 else if (inside(input->inode_bitmap, input->inode_table, itend))
91 ext3_warning(sb, __FUNCTION__,
92 "Inode bitmap (%u) in inode table (%u-%u)",
93 input->inode_bitmap, input->inode_table, itend-1);
94 else if (inside(input->block_bitmap, start, metaend))
95 ext3_warning(sb, __FUNCTION__,
96 "Block bitmap (%u) in GDT table (%u-%u)",
97 input->block_bitmap, start, metaend - 1);
98 else if (inside(input->inode_bitmap, start, metaend))
99 ext3_warning(sb, __FUNCTION__,
100 "Inode bitmap (%u) in GDT table (%u-%u)",
101 input->inode_bitmap, start, metaend - 1);
102 else if (inside(input->inode_table, start, metaend) ||
103 inside(itend - 1, start, metaend))
104 ext3_warning(sb, __FUNCTION__,
105 "Inode table (%u-%u) overlaps GDT table (%u-%u)",
106 input->inode_table, itend - 1, start, metaend - 1);
107 else
108 err = 0;
109 brelse(bh);
110
111 return err;
112}
113
114static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
115 unsigned long blk)
116{
117 struct buffer_head *bh;
118 int err;
119
120 bh = sb_getblk(sb, blk);
121 if ((err = ext3_journal_get_write_access(handle, bh))) {
122 brelse(bh);
123 bh = ERR_PTR(err);
124 } else {
125 lock_buffer(bh);
126 memset(bh->b_data, 0, sb->s_blocksize);
127 set_buffer_uptodate(bh);
128 unlock_buffer(bh);
129 }
130
131 return bh;
132}
133
/*
 * mark_bitmap_end() marks bits [start_bit, end_bit) of @bitmap as used.
 *
 * To avoid calling the atomic setbit hundreds or thousands of times, we only
 * need to use it within a single byte (to ensure we get endianness right).
 * We can use memset for the rest of the bitmap as there are no other users.
 *
 * The range is handled in three parts: single bits up to the next byte
 * boundary, whole bytes via memset, and any bits left over when end_bit is
 * not a multiple of 8.  Nothing is done when start_bit >= end_bit.
 */
static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
{
	int whole_bytes;
	int i;

	if (start_bit >= end_bit)
		return;

	ext3_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);

	/* Head: bit by bit until byte-aligned, never past end_bit. */
	for (i = start_bit; i < end_bit && (i & 7); i++)
		ext3_set_bit(i, bitmap);

	/* Body: every complete byte that still lies inside the range. */
	whole_bytes = (end_bit - i) >> 3;
	if (whole_bytes > 0) {
		memset(bitmap + (i >> 3), 0xff, whole_bytes);
		i += whole_bytes << 3;
	}

	/* Tail: fewer than eight bits left when end_bit is unaligned. */
	for (; i < end_bit; i++)
		ext3_set_bit(i, bitmap);
}
152
153/*
154 * Set up the block and inode bitmaps, and the inode table for the new group.
155 * This doesn't need to be part of the main transaction, since we are only
156 * changing blocks outside the actual filesystem. We still do journaling to
157 * ensure the recovery is correct in case of a failure just after resize.
158 * If any part of this fails, we simply abort the resize.
159 */
160static int setup_new_group_blocks(struct super_block *sb,
161 struct ext3_new_group_data *input)
162{
163 struct ext3_sb_info *sbi = EXT3_SB(sb);
164 unsigned long start = input->group * sbi->s_blocks_per_group +
165 le32_to_cpu(sbi->s_es->s_first_data_block);
166 int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
167 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
168 unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group);
169 struct buffer_head *bh;
170 handle_t *handle;
171 unsigned long block;
172 int bit;
173 int i;
174 int err = 0, err2;
175
176 handle = ext3_journal_start_sb(sb, reserved_gdb + gdblocks +
177 2 + sbi->s_itb_per_group);
178 if (IS_ERR(handle))
179 return PTR_ERR(handle);
180
181 lock_super(sb);
182 if (input->group != sbi->s_groups_count) {
183 err = -EBUSY;
184 goto exit_journal;
185 }
186
187 if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) {
188 err = PTR_ERR(bh);
189 goto exit_journal;
190 }
191
192 if (ext3_bg_has_super(sb, input->group)) {
193 ext3_debug("mark backup superblock %#04lx (+0)\n", start);
194 ext3_set_bit(0, bh->b_data);
195 }
196
197 /* Copy all of the GDT blocks into the backup in this group */
198 for (i = 0, bit = 1, block = start + 1;
199 i < gdblocks; i++, block++, bit++) {
200 struct buffer_head *gdb;
201
202 ext3_debug("update backup group %#04lx (+%d)\n", block, bit);
203
204 gdb = sb_getblk(sb, block);
205 if ((err = ext3_journal_get_write_access(handle, gdb))) {
206 brelse(gdb);
207 goto exit_bh;
208 }
209 lock_buffer(bh);
210 memcpy(gdb->b_data, sbi->s_group_desc[i], bh->b_size);
211 set_buffer_uptodate(gdb);
212 unlock_buffer(bh);
213 ext3_journal_dirty_metadata(handle, gdb);
214 ext3_set_bit(bit, bh->b_data);
215 brelse(gdb);
216 }
217
218 /* Zero out all of the reserved backup group descriptor table blocks */
219 for (i = 0, bit = gdblocks + 1, block = start + bit;
220 i < reserved_gdb; i++, block++, bit++) {
221 struct buffer_head *gdb;
222
223 ext3_debug("clear reserved block %#04lx (+%d)\n", block, bit);
224
225 if (IS_ERR(gdb = bclean(handle, sb, block))) {
226 err = PTR_ERR(bh);
227 goto exit_bh;
228 }
229 ext3_journal_dirty_metadata(handle, gdb);
230 ext3_set_bit(bit, bh->b_data);
231 brelse(gdb);
232 }
233 ext3_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap,
234 input->block_bitmap - start);
235 ext3_set_bit(input->block_bitmap - start, bh->b_data);
236 ext3_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap,
237 input->inode_bitmap - start);
238 ext3_set_bit(input->inode_bitmap - start, bh->b_data);
239
240 /* Zero out all of the inode table blocks */
241 for (i = 0, block = input->inode_table, bit = block - start;
242 i < sbi->s_itb_per_group; i++, bit++, block++) {
243 struct buffer_head *it;
244
245 ext3_debug("clear inode block %#04x (+%ld)\n", block, bit);
246 if (IS_ERR(it = bclean(handle, sb, block))) {
247 err = PTR_ERR(it);
248 goto exit_bh;
249 }
250 ext3_journal_dirty_metadata(handle, it);
251 brelse(it);
252 ext3_set_bit(bit, bh->b_data);
253 }
254 mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb),
255 bh->b_data);
256 ext3_journal_dirty_metadata(handle, bh);
257 brelse(bh);
258
259 /* Mark unused entries in inode bitmap used */
260 ext3_debug("clear inode bitmap %#04x (+%ld)\n",
261 input->inode_bitmap, input->inode_bitmap - start);
262 if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
263 err = PTR_ERR(bh);
264 goto exit_journal;
265 }
266
267 mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb),
268 bh->b_data);
269 ext3_journal_dirty_metadata(handle, bh);
270exit_bh:
271 brelse(bh);
272
273exit_journal:
274 unlock_super(sb);
275 if ((err2 = ext3_journal_stop(handle)) && !err)
276 err = err2;
277
278 return err;
279}
280
281/*
282 * Iterate through the groups which hold BACKUP superblock/GDT copies in an
283 * ext3 filesystem. The counters should be initialized to 1, 5, and 7 before
284 * calling this for the first time. In a sparse filesystem it will be the
285 * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ...
286 * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ...
287 */
288static unsigned ext3_list_backups(struct super_block *sb, unsigned *three,
289 unsigned *five, unsigned *seven)
290{
291 unsigned *min = three;
292 int mult = 3;
293 unsigned ret;
294
295 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
296 EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
297 ret = *min;
298 *min += 1;
299 return ret;
300 }
301
302 if (*five < *min) {
303 min = five;
304 mult = 5;
305 }
306 if (*seven < *min) {
307 min = seven;
308 mult = 7;
309 }
310
311 ret = *min;
312 *min *= mult;
313
314 return ret;
315}
316
317/*
318 * Check that all of the backup GDT blocks are held in the primary GDT block.
319 * It is assumed that they are stored in group order. Returns the number of
320 * groups in current filesystem that have BACKUPS, or -ve error code.
321 */
322static int verify_reserved_gdb(struct super_block *sb,
323 struct buffer_head *primary)
324{
325 const unsigned long blk = primary->b_blocknr;
326 const unsigned long end = EXT3_SB(sb)->s_groups_count;
327 unsigned three = 1;
328 unsigned five = 5;
329 unsigned seven = 7;
330 unsigned grp;
331 __u32 *p = (__u32 *)primary->b_data;
332 int gdbackups = 0;
333
334 while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
335 if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){
336 ext3_warning(sb, __FUNCTION__,
337 "reserved GDT %ld missing grp %d (%ld)\n",
338 blk, grp,
339 grp * EXT3_BLOCKS_PER_GROUP(sb) + blk);
340 return -EINVAL;
341 }
342 if (++gdbackups > EXT3_ADDR_PER_BLOCK(sb))
343 return -EFBIG;
344 }
345
346 return gdbackups;
347}
348
349/*
350 * Called when we need to bring a reserved group descriptor table block into
351 * use from the resize inode. The primary copy of the new GDT block currently
352 * is an indirect block (under the double indirect block in the resize inode).
353 * The new backup GDT blocks will be stored as leaf blocks in this indirect
354 * block, in group order. Even though we know all the block numbers we need,
355 * we check to ensure that the resize inode has actually reserved these blocks.
356 *
357 * Don't need to update the block bitmaps because the blocks are still in use.
358 *
359 * We get all of the error cases out of the way, so that we are sure to not
360 * fail once we start modifying the data on disk, because JBD has no rollback.
361 */
362static int add_new_gdb(handle_t *handle, struct inode *inode,
363 struct ext3_new_group_data *input,
364 struct buffer_head **primary)
365{
366 struct super_block *sb = inode->i_sb;
367 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
368 unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
369 unsigned long gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
370 struct buffer_head **o_group_desc, **n_group_desc;
371 struct buffer_head *dind;
372 int gdbackups;
373 struct ext3_iloc iloc;
374 __u32 *data;
375 int err;
376
377 if (test_opt(sb, DEBUG))
378 printk(KERN_DEBUG
379 "EXT3-fs: ext3_add_new_gdb: adding group block %lu\n",
380 gdb_num);
381
382 /*
383 * If we are not using the primary superblock/GDT copy don't resize,
384 * because the user tools have no way of handling this. Probably a
385 * bad time to do it anyways.
386 */
387 if (EXT3_SB(sb)->s_sbh->b_blocknr !=
388 le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) {
389 ext3_warning(sb, __FUNCTION__,
390 "won't resize using backup superblock at %llu\n",
391 (unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr);
392 return -EPERM;
393 }
394
395 *primary = sb_bread(sb, gdblock);
396 if (!*primary)
397 return -EIO;
398
399 if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) {
400 err = gdbackups;
401 goto exit_bh;
402 }
403
404 data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK;
405 dind = sb_bread(sb, le32_to_cpu(*data));
406 if (!dind) {
407 err = -EIO;
408 goto exit_bh;
409 }
410
411 data = (__u32 *)dind->b_data;
412 if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) {
413 ext3_warning(sb, __FUNCTION__,
414 "new group %u GDT block %lu not reserved\n",
415 input->group, gdblock);
416 err = -EINVAL;
417 goto exit_dind;
418 }
419
420 if ((err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh)))
421 goto exit_dind;
422
423 if ((err = ext3_journal_get_write_access(handle, *primary)))
424 goto exit_sbh;
425
426 if ((err = ext3_journal_get_write_access(handle, dind)))
427 goto exit_primary;
428
429 /* ext3_reserve_inode_write() gets a reference on the iloc */
430 if ((err = ext3_reserve_inode_write(handle, inode, &iloc)))
431 goto exit_dindj;
432
433 n_group_desc = (struct buffer_head **)kmalloc((gdb_num + 1) *
434 sizeof(struct buffer_head *), GFP_KERNEL);
435 if (!n_group_desc) {
436 err = -ENOMEM;
437 ext3_warning (sb, __FUNCTION__,
438 "not enough memory for %lu groups", gdb_num + 1);
439 goto exit_inode;
440 }
441
442 /*
443 * Finally, we have all of the possible failures behind us...
444 *
445 * Remove new GDT block from inode double-indirect block and clear out
446 * the new GDT block for use (which also "frees" the backup GDT blocks
447 * from the reserved inode). We don't need to change the bitmaps for
448 * these blocks, because they are marked as in-use from being in the
449 * reserved inode, and will become GDT blocks (primary and backup).
450 */
451 data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0;
452 ext3_journal_dirty_metadata(handle, dind);
453 brelse(dind);
454 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
455 ext3_mark_iloc_dirty(handle, inode, &iloc);
456 memset((*primary)->b_data, 0, sb->s_blocksize);
457 ext3_journal_dirty_metadata(handle, *primary);
458
459 o_group_desc = EXT3_SB(sb)->s_group_desc;
460 memcpy(n_group_desc, o_group_desc,
461 EXT3_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
462 n_group_desc[gdb_num] = *primary;
463 EXT3_SB(sb)->s_group_desc = n_group_desc;
464 EXT3_SB(sb)->s_gdb_count++;
465 kfree(o_group_desc);
466
467 es->s_reserved_gdt_blocks =
468 cpu_to_le16(le16_to_cpu(es->s_reserved_gdt_blocks) - 1);
469 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
470
471 return 0;
472
473exit_inode:
474 //ext3_journal_release_buffer(handle, iloc.bh);
475 brelse(iloc.bh);
476exit_dindj:
477 //ext3_journal_release_buffer(handle, dind);
478exit_primary:
479 //ext3_journal_release_buffer(handle, *primary);
480exit_sbh:
481 //ext3_journal_release_buffer(handle, *primary);
482exit_dind:
483 brelse(dind);
484exit_bh:
485 brelse(*primary);
486
487 ext3_debug("leaving with error %d\n", err);
488 return err;
489}
490
/*
 * Called when we are adding a new group which has a backup copy of each of
 * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
 * We need to add these reserved backup GDT blocks to the resize inode, so
 * that they are kept for future resizing and not allocated to files.
 *
 * Each reserved backup GDT block will go into a different indirect block.
 * The indirect blocks are actually the primary reserved GDT blocks,
 * so we know in advance what their block numbers are.  We only get the
 * double-indirect block to verify it is pointing to the primary reserved
 * GDT blocks so we don't overwrite a data block by accident.  The reserved
 * backup GDT blocks are stored in their reserved primary GDT block.
 *
 * @handle: journal handle under which the primary blocks and the inode
 *          are modified
 * @inode:  the resize inode whose double-indirect block is checked and
 *          whose i_blocks is increased
 * @input:  description of the group being added; only input->group is
 *          read here
 *
 * Returns 0 on success, -ENOMEM if the pointer array cannot be allocated,
 * -EIO if a block cannot be read, -EINVAL if the double-indirect block does
 * not point at the expected reserved block, or whatever error
 * verify_reserved_gdb(), ext3_journal_get_write_access(),
 * ext3_reserve_inode_write() or ext3_journal_dirty_metadata() reported.
 */
static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
			      struct ext3_new_group_data *input)
{
	struct super_block *sb = inode->i_sb;
	int reserved_gdb =le16_to_cpu(EXT3_SB(sb)->s_es->s_reserved_gdt_blocks);
	struct buffer_head **primary;
	struct buffer_head *dind;
	struct ext3_iloc iloc;
	unsigned long blk;
	__u32 *data, *end;
	int gdbackups = 0;
	int res, i;
	int err;

	/* One buffer_head pointer per reserved primary GDT block */
	primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_KERNEL);
	if (!primary)
		return -ENOMEM;

	/* Read the resize inode's double-indirect block */
	data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK;
	dind = sb_bread(sb, le32_to_cpu(*data));
	if (!dind) {
		err = -EIO;
		goto exit_free;
	}

	/*
	 * blk is the first block after the superblock and the in-use GDT
	 * blocks; data/end bound the walk through the dind block's entries,
	 * starting at the slot with index s_gdb_count.
	 */
	blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count;
	data = (__u32 *)dind->b_data + EXT3_SB(sb)->s_gdb_count;
	end = (__u32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb);

	/* Get each reserved primary GDT block and verify it holds backups */
	for (res = 0; res < reserved_gdb; res++, blk++) {
		if (le32_to_cpu(*data) != blk) {
			ext3_warning(sb, __FUNCTION__,
				     "reserved block %lu not at offset %ld\n",
				     blk, (long)(data - (__u32 *)dind->b_data));
			err = -EINVAL;
			goto exit_bh;
		}
		primary[res] = sb_bread(sb, blk);
		if (!primary[res]) {
			err = -EIO;
			goto exit_bh;
		}
		if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) {
			/*
			 * exit_bh only releases primary[0..res-1], so the
			 * buffer just read has to be dropped here.
			 */
			brelse(primary[res]);
			err = gdbackups;
			goto exit_bh;
		}
		/* Wrap around to the start of the dind block */
		if (++data >= end)
			data = (__u32 *)dind->b_data;
	}

	/*
	 * Get journal write access to every primary block before any of
	 * them is modified.  At this point res == reserved_gdb, so a jump
	 * to exit_bh releases all of the buffers read above.
	 */
	for (i = 0; i < reserved_gdb; i++) {
		if ((err = ext3_journal_get_write_access(handle, primary[i]))) {
			/*
			int j;
			for (j = 0; j < i; j++)
				ext3_journal_release_buffer(handle, primary[j]);
			 */
			goto exit_bh;
		}
	}

	if ((err = ext3_reserve_inode_write(handle, inode, &iloc)))
		goto exit_bh;

	/*
	 * Finally we can add each of the reserved backup GDT blocks from
	 * the new group to its reserved primary GDT block.
	 */
	blk = input->group * EXT3_BLOCKS_PER_GROUP(sb);
	for (i = 0; i < reserved_gdb; i++) {
		int err2;
		data = (__u32 *)primary[i]->b_data;
		/* printk("reserving backup %lu[%u] = %lu\n",
		       primary[i]->b_blocknr, gdbackups,
		       blk + primary[i]->b_blocknr); */
		/*
		 * gdbackups is the value returned by the last
		 * verify_reserved_gdb() call above; it is used as the slot
		 * index in every primary block.
		 */
		data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
		err2 = ext3_journal_dirty_metadata(handle, primary[i]);
		/* Keep going, but remember the first failure */
		if (!err)
			err = err2;
	}
	/* i_blocks is counted in 512-byte units, hence the >> 9 */
	inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9;
	/* NOTE(review): the return value of ext3_mark_iloc_dirty() is not checked */
	ext3_mark_iloc_dirty(handle, inode, &iloc);

exit_bh:
	/* Release primary[res-1] down to primary[0] */
	while (--res >= 0)
		brelse(primary[res]);
	brelse(dind);

exit_free:
	kfree(primary);

	return err;
}
599
600/*
601 * Update the backup copies of the ext3 metadata. These don't need to be part
602 * of the main resize transaction, because e2fsck will re-write them if there
603 * is a problem (basically only OOM will cause a problem). However, we
604 * _should_ update the backups if possible, in case the primary gets trashed
605 * for some reason and we need to run e2fsck from a backup superblock. The
606 * important part is that the new block and inode counts are in the backup
607 * superblocks, and the location of the new group metadata in the GDT backups.
608 *
609 * We do not need lock_super() for this, because these blocks are not
610 * otherwise touched by the filesystem code when it is mounted. We don't
611 * need to worry about last changing from sbi->s_groups_count, because the
612 * worst that can happen is that we do not copy the full number of backups
613 * at this time. The resize which changed s_groups_count will backup again.
614 */
615static void update_backups(struct super_block *sb,
616 int blk_off, char *data, int size)
617{
618 struct ext3_sb_info *sbi = EXT3_SB(sb);
619 const unsigned long last = sbi->s_groups_count;
620 const int bpg = EXT3_BLOCKS_PER_GROUP(sb);
621 unsigned three = 1;
622 unsigned five = 5;
623 unsigned seven = 7;
624 unsigned group;
625 int rest = sb->s_blocksize - size;
626 handle_t *handle;
627 int err = 0, err2;
628
629 handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA);
630 if (IS_ERR(handle)) {
631 group = 1;
632 err = PTR_ERR(handle);
633 goto exit_err;
634 }
635
636 while ((group = ext3_list_backups(sb, &three, &five, &seven)) < last) {
637 struct buffer_head *bh;
638
639 /* Out of journal space, and can't get more - abort - so sad */
640 if (handle->h_buffer_credits == 0 &&
641 ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA) &&
642 (err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA)))
643 break;
644
645 bh = sb_getblk(sb, group * bpg + blk_off);
646 ext3_debug(sb, __FUNCTION__, "update metadata backup %#04lx\n",
647 bh->b_blocknr);
648 if ((err = ext3_journal_get_write_access(handle, bh)))
649 break;
650 lock_buffer(bh);
651 memcpy(bh->b_data, data, size);
652 if (rest)
653 memset(bh->b_data + size, 0, rest);
654 set_buffer_uptodate(bh);
655 unlock_buffer(bh);
656 ext3_journal_dirty_metadata(handle, bh);
657 brelse(bh);
658 }
659 if ((err2 = ext3_journal_stop(handle)) && !err)
660 err = err2;
661
662 /*
663 * Ugh! Need to have e2fsck write the backup copies. It is too
664 * late to revert the resize, we shouldn't fail just because of
665 * the backup copies (they are only needed in case of corruption).
666 *
667 * However, if we got here we have a journal problem too, so we
668 * can't really start a transaction to mark the superblock.
669 * Chicken out and just set the flag on the hope it will be written
670 * to disk, and if not - we will simply wait until next fsck.
671 */
672exit_err:
673 if (err) {
674 ext3_warning(sb, __FUNCTION__,
675 "can't update backup for group %d (err %d), "
676 "forcing fsck on next reboot\n", group, err);
677 sbi->s_mount_state &= ~EXT3_VALID_FS;
678 sbi->s_es->s_state &= ~cpu_to_le16(EXT3_VALID_FS);
679 mark_buffer_dirty(sbi->s_sbh);
680 }
681}
682
683/* Add group descriptor data to an existing or new group descriptor block.
684 * Ensure we handle all possible error conditions _before_ we start modifying
685 * the filesystem, because we cannot abort the transaction and not have it
686 * write the data to disk.
687 *
688 * If we are on a GDT block boundary, we need to get the reserved GDT block.
689 * Otherwise, we may need to add backup GDT blocks for a sparse group.
690 *
691 * We only need to hold the superblock lock while we are actually adding
692 * in the new group's counts to the superblock. Prior to that we have
693 * not really "added" the group at all. We re-check that we are still
694 * adding in the last group in case things have changed since verifying.
695 */
696int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
697{
698 struct ext3_sb_info *sbi = EXT3_SB(sb);
699 struct ext3_super_block *es = sbi->s_es;
700 int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
701 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
702 struct buffer_head *primary = NULL;
703 struct ext3_group_desc *gdp;
704 struct inode *inode = NULL;
705 handle_t *handle;
706 int gdb_off, gdb_num;
707 int err, err2;
708
709 gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
710 gdb_off = input->group % EXT3_DESC_PER_BLOCK(sb);
711
712 if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb,
713 EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
714 ext3_warning(sb, __FUNCTION__,
715 "Can't resize non-sparse filesystem further\n");
716 return -EPERM;
717 }
718
719 if (reserved_gdb || gdb_off == 0) {
720 if (!EXT3_HAS_COMPAT_FEATURE(sb,
721 EXT3_FEATURE_COMPAT_RESIZE_INODE)){
722 ext3_warning(sb, __FUNCTION__,
723 "No reserved GDT blocks, can't resize\n");
724 return -EPERM;
725 }
726 inode = iget(sb, EXT3_RESIZE_INO);
727 if (!inode || is_bad_inode(inode)) {
728 ext3_warning(sb, __FUNCTION__,
729 "Error opening resize inode\n");
730 iput(inode);
731 return -ENOENT;
732 }
733 }
734
735 if ((err = verify_group_input(sb, input)))
736 goto exit_put;
737
738 if ((err = setup_new_group_blocks(sb, input)))
739 goto exit_put;
740
741 /*
742 * We will always be modifying at least the superblock and a GDT
743 * block. If we are adding a group past the last current GDT block,
744 * we will also modify the inode and the dindirect block. If we
745 * are adding a group with superblock/GDT backups we will also
746 * modify each of the reserved GDT dindirect blocks.
747 */
748 handle = ext3_journal_start_sb(sb,
749 ext3_bg_has_super(sb, input->group) ?
750 3 + reserved_gdb : 4);
751 if (IS_ERR(handle)) {
752 err = PTR_ERR(handle);
753 goto exit_put;
754 }
755
756 lock_super(sb);
757 if (input->group != EXT3_SB(sb)->s_groups_count) {
758 ext3_warning(sb, __FUNCTION__,
759 "multiple resizers run on filesystem!\n");
760 goto exit_journal;
761 }
762
763 if ((err = ext3_journal_get_write_access(handle, sbi->s_sbh)))
764 goto exit_journal;
765
766 /*
767 * We will only either add reserved group blocks to a backup group
768 * or remove reserved blocks for the first group in a new group block.
769 * Doing both would be mean more complex code, and sane people don't
770 * use non-sparse filesystems anymore. This is already checked above.
771 */
772 if (gdb_off) {
773 primary = sbi->s_group_desc[gdb_num];
774 if ((err = ext3_journal_get_write_access(handle, primary)))
775 goto exit_journal;
776
777 if (reserved_gdb && ext3_bg_num_gdb(sb, input->group) &&
778 (err = reserve_backup_gdb(handle, inode, input)))
779 goto exit_journal;
780 } else if ((err = add_new_gdb(handle, inode, input, &primary)))
781 goto exit_journal;
782
783 /*
784 * OK, now we've set up the new group. Time to make it active.
785 *
786 * Current kernels don't lock all allocations via lock_super(),
787 * so we have to be safe wrt. concurrent accesses the group
788 * data. So we need to be careful to set all of the relevant
789 * group descriptor data etc. *before* we enable the group.
790 *
791 * The key field here is EXT3_SB(sb)->s_groups_count: as long as
792 * that retains its old value, nobody is going to access the new
793 * group.
794 *
795 * So first we update all the descriptor metadata for the new
796 * group; then we update the total disk blocks count; then we
797 * update the groups count to enable the group; then finally we
798 * update the free space counts so that the system can start
799 * using the new disk blocks.
800 */
801
802 /* Update group descriptor block for new group */
803 gdp = (struct ext3_group_desc *)primary->b_data + gdb_off;
804
805 gdp->bg_block_bitmap = cpu_to_le32(input->block_bitmap);
806 gdp->bg_inode_bitmap = cpu_to_le32(input->inode_bitmap);
807 gdp->bg_inode_table = cpu_to_le32(input->inode_table);
808 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
809 gdp->bg_free_inodes_count = cpu_to_le16(EXT3_INODES_PER_GROUP(sb));
810
811 /*
812 * Make the new blocks and inodes valid next. We do this before
813 * increasing the group count so that once the group is enabled,
814 * all of its blocks and inodes are already valid.
815 *
816 * We always allocate group-by-group, then block-by-block or
817 * inode-by-inode within a group, so enabling these
818 * blocks/inodes before the group is live won't actually let us
819 * allocate the new space yet.
820 */
821 es->s_blocks_count = cpu_to_le32(le32_to_cpu(es->s_blocks_count) +
822 input->blocks_count);
823 es->s_inodes_count = cpu_to_le32(le32_to_cpu(es->s_inodes_count) +
824 EXT3_INODES_PER_GROUP(sb));
825
826 /*
827 * We need to protect s_groups_count against other CPUs seeing
828 * inconsistent state in the superblock.
829 *
830 * The precise rules we use are:
831 *
832 * * Writers of s_groups_count *must* hold lock_super
833 * AND
834 * * Writers must perform a smp_wmb() after updating all dependent
835 * data and before modifying the groups count
836 *
837 * * Readers must hold lock_super() over the access
838 * OR
839 * * Readers must perform an smp_rmb() after reading the groups count
840 * and before reading any dependent data.
841 *
842 * NB. These rules can be relaxed when checking the group count
843 * while freeing data, as we can only allocate from a block
844 * group after serialising against the group count, and we can
845 * only then free after serialising in turn against that
846 * allocation.
847 */
848 smp_wmb();
849
850 /* Update the global fs size fields */
851 EXT3_SB(sb)->s_groups_count++;
852
853 ext3_journal_dirty_metadata(handle, primary);
854
855 /* Update the reserved block counts only once the new group is
856 * active. */
857 es->s_r_blocks_count = cpu_to_le32(le32_to_cpu(es->s_r_blocks_count) +
858 input->reserved_blocks);
859
860 /* Update the free space counts */
861 percpu_counter_mod(&sbi->s_freeblocks_counter,
862 input->free_blocks_count);
863 percpu_counter_mod(&sbi->s_freeinodes_counter,
864 EXT3_INODES_PER_GROUP(sb));
865
866 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
867 sb->s_dirt = 1;
868
869exit_journal:
870 unlock_super(sb);
871 if ((err2 = ext3_journal_stop(handle)) && !err)
872 err = err2;
873 if (!err) {
874 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
875 sizeof(struct ext3_super_block));
876 update_backups(sb, primary->b_blocknr, primary->b_data,
877 primary->b_size);
878 }
879exit_put:
880 iput(inode);
881 return err;
882} /* ext3_group_add */
883
884/* Extend the filesystem to the new number of blocks specified. This entry
885 * point is only used to extend the current filesystem to the end of the last
886 * existing group. It can be accessed via ioctl, or by "remount,resize=<size>"
887 * for emergencies (because it has no dependencies on reserved blocks).
888 *
889 * If we _really_ wanted, we could use default values to call ext3_group_add()
890 * allow the "remount" trick to work for arbitrary resizing, assuming enough
891 * GDT blocks are reserved to grow to the desired size.
892 */
893int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
894 unsigned long n_blocks_count)
895{
896 unsigned long o_blocks_count;
897 unsigned long o_groups_count;
898 unsigned long last;
899 int add;
900 struct buffer_head * bh;
901 handle_t *handle;
902 int err, freed_blocks;
903
904 /* We don't need to worry about locking wrt other resizers just
905 * yet: we're going to revalidate es->s_blocks_count after
906 * taking lock_super() below. */
907 o_blocks_count = le32_to_cpu(es->s_blocks_count);
908 o_groups_count = EXT3_SB(sb)->s_groups_count;
909
910 if (test_opt(sb, DEBUG))
911 printk(KERN_DEBUG "EXT3-fs: extending last group from %lu to %lu blocks\n",
912 o_blocks_count, n_blocks_count);
913
914 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
915 return 0;
916
917 if (n_blocks_count < o_blocks_count) {
918 ext3_warning(sb, __FUNCTION__,
919 "can't shrink FS - resize aborted");
920 return -EBUSY;
921 }
922
923 /* Handle the remaining blocks in the last group only. */
924 last = (o_blocks_count - le32_to_cpu(es->s_first_data_block)) %
925 EXT3_BLOCKS_PER_GROUP(sb);
926
927 if (last == 0) {
928 ext3_warning(sb, __FUNCTION__,
929 "need to use ext2online to resize further\n");
930 return -EPERM;
931 }
932
933 add = EXT3_BLOCKS_PER_GROUP(sb) - last;
934
935 if (o_blocks_count + add > n_blocks_count)
936 add = n_blocks_count - o_blocks_count;
937
938 if (o_blocks_count + add < n_blocks_count)
939 ext3_warning(sb, __FUNCTION__,
940 "will only finish group (%lu blocks, %u new)",
941 o_blocks_count + add, add);
942
943 /* See if the device is actually as big as what was requested */
944 bh = sb_bread(sb, o_blocks_count + add -1);
945 if (!bh) {
946 ext3_warning(sb, __FUNCTION__,
947 "can't read last block, resize aborted");
948 return -ENOSPC;
949 }
950 brelse(bh);
951
952 /* We will update the superblock, one block bitmap, and
953 * one group descriptor via ext3_free_blocks().
954 */
955 handle = ext3_journal_start_sb(sb, 3);
956 if (IS_ERR(handle)) {
957 err = PTR_ERR(handle);
958 ext3_warning(sb, __FUNCTION__, "error %d on journal start",err);
959 goto exit_put;
960 }
961
962 lock_super(sb);
963 if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) {
964 ext3_warning(sb, __FUNCTION__,
965 "multiple resizers run on filesystem!\n");
966 err = -EBUSY;
967 goto exit_put;
968 }
969
970 if ((err = ext3_journal_get_write_access(handle,
971 EXT3_SB(sb)->s_sbh))) {
972 ext3_warning(sb, __FUNCTION__,
973 "error %d on journal write access", err);
974 unlock_super(sb);
975 ext3_journal_stop(handle);
976 goto exit_put;
977 }
978 es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
979 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
980 sb->s_dirt = 1;
981 unlock_super(sb);
982 ext3_debug("freeing blocks %ld through %ld\n", o_blocks_count,
983 o_blocks_count + add);
984 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
985 ext3_debug("freed blocks %ld through %ld\n", o_blocks_count,
986 o_blocks_count + add);
987 if ((err = ext3_journal_stop(handle)))
988 goto exit_put;
989 if (test_opt(sb, DEBUG))
990 printk(KERN_DEBUG "EXT3-fs: extended group to %u blocks\n",
991 le32_to_cpu(es->s_blocks_count));
992 update_backups(sb, EXT3_SB(sb)->s_sbh->b_blocknr, (char *)es,
993 sizeof(struct ext3_super_block));
994exit_put:
995 return err;
996} /* ext3_group_extend */
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
new file mode 100644
index 000000000000..545b440a2d2f
--- /dev/null
+++ b/fs/ext3/super.c
@@ -0,0 +1,2539 @@
1/*
2 * linux/fs/ext3/super.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/inode.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Big-endian to little-endian byte-swapping/bitmaps by
16 * David S. Miller (davem@caip.rutgers.edu), 1995
17 */
18
19#include <linux/config.h>
20#include <linux/module.h>
21#include <linux/string.h>
22#include <linux/fs.h>
23#include <linux/time.h>
24#include <linux/jbd.h>
25#include <linux/ext3_fs.h>
26#include <linux/ext3_jbd.h>
27#include <linux/slab.h>
28#include <linux/init.h>
29#include <linux/blkdev.h>
30#include <linux/parser.h>
31#include <linux/smp_lock.h>
32#include <linux/buffer_head.h>
33#include <linux/vfs.h>
34#include <linux/random.h>
35#include <linux/mount.h>
36#include <linux/namei.h>
37#include <linux/quotaops.h>
38#include <asm/uaccess.h>
39#include "xattr.h"
40#include "acl.h"
41
42static int ext3_load_journal(struct super_block *, struct ext3_super_block *);
43static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
44 int);
45static void ext3_commit_super (struct super_block * sb,
46 struct ext3_super_block * es,
47 int sync);
48static void ext3_mark_recovery_complete(struct super_block * sb,
49 struct ext3_super_block * es);
50static void ext3_clear_journal_err(struct super_block * sb,
51 struct ext3_super_block * es);
52static int ext3_sync_fs(struct super_block *sb, int wait);
53static const char *ext3_decode_error(struct super_block * sb, int errno,
54 char nbuf[16]);
55static int ext3_remount (struct super_block * sb, int * flags, char * data);
56static int ext3_statfs (struct super_block * sb, struct kstatfs * buf);
57static void ext3_unlockfs(struct super_block *sb);
58static void ext3_write_super (struct super_block * sb);
59static void ext3_write_super_lockfs(struct super_block *sb);
60
61/*
62 * Wrappers for journal_start/end.
63 *
64 * The only special thing we need to do here is to make sure that all
65 * journal_end calls result in the superblock being marked dirty, so
66 * that sync() will call the filesystem's write_super callback if
67 * appropriate.
68 */
69handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
70{
71 journal_t *journal;
72
73 if (sb->s_flags & MS_RDONLY)
74 return ERR_PTR(-EROFS);
75
76 /* Special case here: if the journal has aborted behind our
77 * backs (eg. EIO in the commit thread), then we still need to
78 * take the FS itself readonly cleanly. */
79 journal = EXT3_SB(sb)->s_journal;
80 if (is_journal_aborted(journal)) {
81 ext3_abort(sb, __FUNCTION__,
82 "Detected aborted journal");
83 return ERR_PTR(-EROFS);
84 }
85
86 return journal_start(journal, nblocks);
87}
88
89/*
90 * The only special thing we need to do here is to make sure that all
91 * journal_stop calls result in the superblock being marked dirty, so
92 * that sync() will call the filesystem's write_super callback if
93 * appropriate.
94 */
95int __ext3_journal_stop(const char *where, handle_t *handle)
96{
97 struct super_block *sb;
98 int err;
99 int rc;
100
101 sb = handle->h_transaction->t_journal->j_private;
102 err = handle->h_err;
103 rc = journal_stop(handle);
104
105 if (!err)
106 err = rc;
107 if (err)
108 __ext3_std_error(sb, where, err);
109 return err;
110}
111
112void ext3_journal_abort_handle(const char *caller, const char *err_fn,
113 struct buffer_head *bh, handle_t *handle, int err)
114{
115 char nbuf[16];
116 const char *errstr = ext3_decode_error(NULL, err, nbuf);
117
118 if (bh)
119 BUFFER_TRACE(bh, "abort");
120
121 if (!handle->h_err)
122 handle->h_err = err;
123
124 if (is_handle_aborted(handle))
125 return;
126
127 printk(KERN_ERR "%s: aborting transaction: %s in %s\n",
128 caller, errstr, err_fn);
129
130 journal_abort_handle(handle);
131}
132
133/* Deal with the reporting of failure conditions on a filesystem such as
134 * inconsistencies detected or read IO failures.
135 *
136 * On ext2, we can store the error state of the filesystem in the
137 * superblock. That is not possible on ext3, because we may have other
138 * write ordering constraints on the superblock which prevent us from
139 * writing it out straight away; and given that the journal is about to
140 * be aborted, we can't rely on the current, or future, transactions to
141 * write out the superblock safely.
142 *
143 * We'll just use the journal_abort() error code to record an error in
144 * the journal instead. On recovery, the journal will compain about
145 * that error until we've noted it down and cleared it.
146 */
147
148static void ext3_handle_error(struct super_block *sb)
149{
150 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
151
152 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
153 es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
154
155 if (sb->s_flags & MS_RDONLY)
156 return;
157
158 if (test_opt (sb, ERRORS_RO)) {
159 printk (KERN_CRIT "Remounting filesystem read-only\n");
160 sb->s_flags |= MS_RDONLY;
161 } else {
162 journal_t *journal = EXT3_SB(sb)->s_journal;
163
164 EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
165 if (journal)
166 journal_abort(journal, -EIO);
167 }
168 if (test_opt(sb, ERRORS_PANIC))
169 panic("EXT3-fs (device %s): panic forced after error\n",
170 sb->s_id);
171 ext3_commit_super(sb, es, 1);
172}
173
174void ext3_error (struct super_block * sb, const char * function,
175 const char * fmt, ...)
176{
177 va_list args;
178
179 va_start(args, fmt);
180 printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function);
181 vprintk(fmt, args);
182 printk("\n");
183 va_end(args);
184
185 ext3_handle_error(sb);
186}
187
188static const char *ext3_decode_error(struct super_block * sb, int errno,
189 char nbuf[16])
190{
191 char *errstr = NULL;
192
193 switch (errno) {
194 case -EIO:
195 errstr = "IO failure";
196 break;
197 case -ENOMEM:
198 errstr = "Out of memory";
199 break;
200 case -EROFS:
201 if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT)
202 errstr = "Journal has aborted";
203 else
204 errstr = "Readonly filesystem";
205 break;
206 default:
207 /* If the caller passed in an extra buffer for unknown
208 * errors, textualise them now. Else we just return
209 * NULL. */
210 if (nbuf) {
211 /* Check for truncated error codes... */
212 if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
213 errstr = nbuf;
214 }
215 break;
216 }
217
218 return errstr;
219}
220
221/* __ext3_std_error decodes expected errors from journaling functions
222 * automatically and invokes the appropriate error response. */
223
224void __ext3_std_error (struct super_block * sb, const char * function,
225 int errno)
226{
227 char nbuf[16];
228 const char *errstr = ext3_decode_error(sb, errno, nbuf);
229
230 printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n",
231 sb->s_id, function, errstr);
232
233 ext3_handle_error(sb);
234}
235
236/*
237 * ext3_abort is a much stronger failure handler than ext3_error. The
238 * abort function may be used to deal with unrecoverable failures such
239 * as journal IO errors or ENOMEM at a critical moment in log management.
240 *
241 * We unconditionally force the filesystem into an ABORT|READONLY state,
242 * unless the error response on the fs has been set to panic in which
243 * case we take the easy way out and panic immediately.
244 */
245
246void ext3_abort (struct super_block * sb, const char * function,
247 const char * fmt, ...)
248{
249 va_list args;
250
251 printk (KERN_CRIT "ext3_abort called.\n");
252
253 va_start(args, fmt);
254 printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function);
255 vprintk(fmt, args);
256 printk("\n");
257 va_end(args);
258
259 if (test_opt(sb, ERRORS_PANIC))
260 panic("EXT3-fs panic from previous error\n");
261
262 if (sb->s_flags & MS_RDONLY)
263 return;
264
265 printk(KERN_CRIT "Remounting filesystem read-only\n");
266 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
267 sb->s_flags |= MS_RDONLY;
268 EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
269 journal_abort(EXT3_SB(sb)->s_journal, -EIO);
270}
271
272void ext3_warning (struct super_block * sb, const char * function,
273 const char * fmt, ...)
274{
275 va_list args;
276
277 va_start(args, fmt);
278 printk(KERN_WARNING "EXT3-fs warning (device %s): %s: ",
279 sb->s_id, function);
280 vprintk(fmt, args);
281 printk("\n");
282 va_end(args);
283}
284
285void ext3_update_dynamic_rev(struct super_block *sb)
286{
287 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
288
289 if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV)
290 return;
291
292 ext3_warning(sb, __FUNCTION__,
293 "updating to rev %d because of new feature flag, "
294 "running e2fsck is recommended",
295 EXT3_DYNAMIC_REV);
296
297 es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO);
298 es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE);
299 es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV);
300 /* leave es->s_feature_*compat flags alone */
301 /* es->s_uuid will be set by e2fsck if empty */
302
303 /*
304 * The rest of the superblock fields should be zero, and if not it
305 * means they are likely already in use, so leave them alone. We
306 * can leave it up to e2fsck to clean up any inconsistencies there.
307 */
308}
309
310/*
311 * Open the external journal device
312 */
313static struct block_device *ext3_blkdev_get(dev_t dev)
314{
315 struct block_device *bdev;
316 char b[BDEVNAME_SIZE];
317
318 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
319 if (IS_ERR(bdev))
320 goto fail;
321 return bdev;
322
323fail:
324 printk(KERN_ERR "EXT3: failed to open journal device %s: %ld\n",
325 __bdevname(dev, b), PTR_ERR(bdev));
326 return NULL;
327}
328
/*
 * Release the journal device: call bd_release() on it and then
 * blkdev_put(), whose result is returned.
 */
static int ext3_blkdev_put(struct block_device *bdev)
{
	int rc;

	bd_release(bdev);
	rc = blkdev_put(bdev);

	return rc;
}
337
338static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
339{
340 struct block_device *bdev;
341 int ret = -ENODEV;
342
343 bdev = sbi->journal_bdev;
344 if (bdev) {
345 ret = ext3_blkdev_put(bdev);
346 sbi->journal_bdev = NULL;
347 }
348 return ret;
349}
350
351static inline struct inode *orphan_list_entry(struct list_head *l)
352{
353 return &list_entry(l, struct ext3_inode_info, i_orphan)->vfs_inode;
354}
355
356static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
357{
358 struct list_head *l;
359
360 printk(KERN_ERR "sb orphan head is %d\n",
361 le32_to_cpu(sbi->s_es->s_last_orphan));
362
363 printk(KERN_ERR "sb_info orphan list:\n");
364 list_for_each(l, &sbi->s_orphan) {
365 struct inode *inode = orphan_list_entry(l);
366 printk(KERN_ERR " "
367 "inode %s:%ld at %p: mode %o, nlink %d, next %d\n",
368 inode->i_sb->s_id, inode->i_ino, inode,
369 inode->i_mode, inode->i_nlink,
370 NEXT_ORPHAN(inode));
371 }
372}
373
374static void ext3_put_super (struct super_block * sb)
375{
376 struct ext3_sb_info *sbi = EXT3_SB(sb);
377 struct ext3_super_block *es = sbi->s_es;
378 int i;
379
380 ext3_xattr_put_super(sb);
381 journal_destroy(sbi->s_journal);
382 if (!(sb->s_flags & MS_RDONLY)) {
383 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
384 es->s_state = cpu_to_le16(sbi->s_mount_state);
385 BUFFER_TRACE(sbi->s_sbh, "marking dirty");
386 mark_buffer_dirty(sbi->s_sbh);
387 ext3_commit_super(sb, es, 1);
388 }
389
390 for (i = 0; i < sbi->s_gdb_count; i++)
391 brelse(sbi->s_group_desc[i]);
392 kfree(sbi->s_group_desc);
393 percpu_counter_destroy(&sbi->s_freeblocks_counter);
394 percpu_counter_destroy(&sbi->s_freeinodes_counter);
395 percpu_counter_destroy(&sbi->s_dirs_counter);
396 brelse(sbi->s_sbh);
397#ifdef CONFIG_QUOTA
398 for (i = 0; i < MAXQUOTAS; i++)
399 kfree(sbi->s_qf_names[i]);
400#endif
401
402 /* Debugging code just in case the in-memory inode orphan list
403 * isn't empty. The on-disk one can be non-empty if we've
404 * detected an error and taken the fs readonly, but the
405 * in-memory list had better be clean by this point. */
406 if (!list_empty(&sbi->s_orphan))
407 dump_orphan_list(sb, sbi);
408 J_ASSERT(list_empty(&sbi->s_orphan));
409
410 invalidate_bdev(sb->s_bdev, 0);
411 if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
412 /*
413 * Invalidate the journal device's buffers. We don't want them
414 * floating about in memory - the physical journal device may
415 * hotswapped, and it breaks the `ro-after' testing code.
416 */
417 sync_blockdev(sbi->journal_bdev);
418 invalidate_bdev(sbi->journal_bdev, 0);
419 ext3_blkdev_remove(sbi);
420 }
421 sb->s_fs_info = NULL;
422 kfree(sbi);
423 return;
424}
425
/*
 * Slab cache from which struct ext3_inode_info objects are allocated.
 * Created by init_inodecache() and torn down by destroy_inodecache().
 */
static kmem_cache_t *ext3_inode_cachep;
427
428/*
429 * Called inside transaction, so use GFP_NOFS
430 */
431static struct inode *ext3_alloc_inode(struct super_block *sb)
432{
433 struct ext3_inode_info *ei;
434
435 ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS);
436 if (!ei)
437 return NULL;
438#ifdef CONFIG_EXT3_FS_POSIX_ACL
439 ei->i_acl = EXT3_ACL_NOT_CACHED;
440 ei->i_default_acl = EXT3_ACL_NOT_CACHED;
441#endif
442 ei->i_block_alloc_info = NULL;
443 ei->vfs_inode.i_version = 1;
444 return &ei->vfs_inode;
445}
446
447static void ext3_destroy_inode(struct inode *inode)
448{
449 kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
450}
451
452static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
453{
454 struct ext3_inode_info *ei = (struct ext3_inode_info *) foo;
455
456 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
457 SLAB_CTOR_CONSTRUCTOR) {
458 INIT_LIST_HEAD(&ei->i_orphan);
459#ifdef CONFIG_EXT3_FS_XATTR
460 init_rwsem(&ei->xattr_sem);
461#endif
462 init_MUTEX(&ei->truncate_sem);
463 inode_init_once(&ei->vfs_inode);
464 }
465}
466
467static int init_inodecache(void)
468{
469 ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
470 sizeof(struct ext3_inode_info),
471 0, SLAB_RECLAIM_ACCOUNT,
472 init_once, NULL);
473 if (ext3_inode_cachep == NULL)
474 return -ENOMEM;
475 return 0;
476}
477
478static void destroy_inodecache(void)
479{
480 if (kmem_cache_destroy(ext3_inode_cachep))
481 printk(KERN_INFO "ext3_inode_cache: not all structures were freed\n");
482}
483
484static void ext3_clear_inode(struct inode *inode)
485{
486 struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
487#ifdef CONFIG_EXT3_FS_POSIX_ACL
488 if (EXT3_I(inode)->i_acl &&
489 EXT3_I(inode)->i_acl != EXT3_ACL_NOT_CACHED) {
490 posix_acl_release(EXT3_I(inode)->i_acl);
491 EXT3_I(inode)->i_acl = EXT3_ACL_NOT_CACHED;
492 }
493 if (EXT3_I(inode)->i_default_acl &&
494 EXT3_I(inode)->i_default_acl != EXT3_ACL_NOT_CACHED) {
495 posix_acl_release(EXT3_I(inode)->i_default_acl);
496 EXT3_I(inode)->i_default_acl = EXT3_ACL_NOT_CACHED;
497 }
498#endif
499 ext3_discard_reservation(inode);
500 EXT3_I(inode)->i_block_alloc_info = NULL;
501 kfree(rsv);
502}
503
504#ifdef CONFIG_QUOTA
505
/* Name of a quota type: "user" for USRQUOTA, "group" for anything else. */
#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
/*
 * Build the identifier <on>USRJQUOTA or <on>GRPJQUOTA depending on the quota
 * type.  The prefix must be pasted bare: pasting a closing parenthesis onto
 * an identifier does not form a valid preprocessing token.
 */
#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?(on##USRJQUOTA):(on##GRPJQUOTA))
508
509static int ext3_dquot_initialize(struct inode *inode, int type);
510static int ext3_dquot_drop(struct inode *inode);
511static int ext3_write_dquot(struct dquot *dquot);
512static int ext3_acquire_dquot(struct dquot *dquot);
513static int ext3_release_dquot(struct dquot *dquot);
514static int ext3_mark_dquot_dirty(struct dquot *dquot);
515static int ext3_write_info(struct super_block *sb, int type);
516static int ext3_quota_on(struct super_block *sb, int type, int format_id, char *path);
517static int ext3_quota_on_mount(struct super_block *sb, int type);
518static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
519 size_t len, loff_t off);
520static ssize_t ext3_quota_write(struct super_block *sb, int type,
521 const char *data, size_t len, loff_t off);
522
/*
 * Quota callbacks for ext3.  The ext3_* entries are declared in this file;
 * the dquot_* entries are not defined here -- presumably the generic quota
 * implementations.
 */
static struct dquot_operations ext3_quota_operations = {
	.initialize	= ext3_dquot_initialize,
	.drop		= ext3_dquot_drop,
	.alloc_space	= dquot_alloc_space,
	.alloc_inode	= dquot_alloc_inode,
	.free_space	= dquot_free_space,
	.free_inode	= dquot_free_inode,
	.transfer	= dquot_transfer,
	.write_dquot	= ext3_write_dquot,
	.acquire_dquot	= ext3_acquire_dquot,
	.release_dquot	= ext3_release_dquot,
	.mark_dirty	= ext3_mark_dquot_dirty,
	.write_info	= ext3_write_info
};
537
538static struct quotactl_ops ext3_qctl_operations = {
539 .quota_on = ext3_quota_on,
540 .quota_off = vfs_quota_off,
541 .quota_sync = vfs_quota_sync,
542 .get_info = vfs_get_dqinfo,
543 .set_info = vfs_set_dqinfo,
544 .get_dqblk = vfs_get_dqblk,
545 .set_dqblk = vfs_set_dqblk
546};
547#endif
548
549static struct super_operations ext3_sops = {
550 .alloc_inode = ext3_alloc_inode,
551 .destroy_inode = ext3_destroy_inode,
552 .read_inode = ext3_read_inode,
553 .write_inode = ext3_write_inode,
554 .dirty_inode = ext3_dirty_inode,
555 .delete_inode = ext3_delete_inode,
556 .put_super = ext3_put_super,
557 .write_super = ext3_write_super,
558 .sync_fs = ext3_sync_fs,
559 .write_super_lockfs = ext3_write_super_lockfs,
560 .unlockfs = ext3_unlockfs,
561 .statfs = ext3_statfs,
562 .remount_fs = ext3_remount,
563 .clear_inode = ext3_clear_inode,
564#ifdef CONFIG_QUOTA
565 .quota_read = ext3_quota_read,
566 .quota_write = ext3_quota_write,
567#endif
568};
569
570struct dentry *ext3_get_parent(struct dentry *child);
571static struct export_operations ext3_export_ops = {
572 .get_parent = ext3_get_parent,
573};
574
/*
 * Token identifiers for the mount option table.  Declaration order fixes
 * the numeric values, so new entries must not be inserted in the middle
 * without checking every user.
 */
enum {
	/* df / group-id behaviour */
	Opt_bsd_df,
	Opt_minix_df,
	Opt_grpid,
	Opt_nogrpid,
	/* reserved blocks owner, alternate superblock */
	Opt_resgid,
	Opt_resuid,
	Opt_sb,
	/* error policy */
	Opt_err_cont,
	Opt_err_panic,
	Opt_err_ro,
	/* miscellaneous flags */
	Opt_nouid32,
	Opt_check,
	Opt_nocheck,
	Opt_debug,
	Opt_oldalloc,
	Opt_orlov,
	/* extended attributes and ACLs */
	Opt_user_xattr,
	Opt_nouser_xattr,
	Opt_acl,
	Opt_noacl,
	/* allocator / journal loading / buffer heads */
	Opt_reservation,
	Opt_noreservation,
	Opt_noload,
	Opt_nobh,
	/* journal configuration */
	Opt_commit,
	Opt_journal_update,
	Opt_journal_inum,
	Opt_abort,
	Opt_data_journal,
	Opt_data_ordered,
	Opt_data_writeback,
	/* journalled quota */
	Opt_usrjquota,
	Opt_grpjquota,
	Opt_offusrjquota,
	Opt_offgrpjquota,
	Opt_jqfmt_vfsold,
	Opt_jqfmt_vfsv0,
	/* accepted-but-ignored, barriers, no-match marker, online resize */
	Opt_ignore,
	Opt_barrier,
	Opt_err,
	Opt_resize,
};
587
588static match_table_t tokens = {
589 {Opt_bsd_df, "bsddf"},
590 {Opt_minix_df, "minixdf"},
591 {Opt_grpid, "grpid"},
592 {Opt_grpid, "bsdgroups"},
593 {Opt_nogrpid, "nogrpid"},
594 {Opt_nogrpid, "sysvgroups"},
595 {Opt_resgid, "resgid=%u"},
596 {Opt_resuid, "resuid=%u"},
597 {Opt_sb, "sb=%u"},
598 {Opt_err_cont, "errors=continue"},
599 {Opt_err_panic, "errors=panic"},
600 {Opt_err_ro, "errors=remount-ro"},
601 {Opt_nouid32, "nouid32"},
602 {Opt_nocheck, "nocheck"},
603 {Opt_nocheck, "check=none"},
604 {Opt_check, "check"},
605 {Opt_debug, "debug"},
606 {Opt_oldalloc, "oldalloc"},
607 {Opt_orlov, "orlov"},
608 {Opt_user_xattr, "user_xattr"},
609 {Opt_nouser_xattr, "nouser_xattr"},
610 {Opt_acl, "acl"},
611 {Opt_noacl, "noacl"},
612 {Opt_reservation, "reservation"},
613 {Opt_noreservation, "noreservation"},
614 {Opt_noload, "noload"},
615 {Opt_nobh, "nobh"},
616 {Opt_commit, "commit=%u"},
617 {Opt_journal_update, "journal=update"},
618 {Opt_journal_inum, "journal=%u"},
619 {Opt_abort, "abort"},
620 {Opt_data_journal, "data=journal"},
621 {Opt_data_ordered, "data=ordered"},
622 {Opt_data_writeback, "data=writeback"},
623 {Opt_offusrjquota, "usrjquota="},
624 {Opt_usrjquota, "usrjquota=%s"},
625 {Opt_offgrpjquota, "grpjquota="},
626 {Opt_grpjquota, "grpjquota=%s"},
627 {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
628 {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
629 {Opt_ignore, "grpquota"},
630 {Opt_ignore, "noquota"},
631 {Opt_ignore, "quota"},
632 {Opt_ignore, "usrquota"},
633 {Opt_barrier, "barrier=%u"},
634 {Opt_err, NULL},
635 {Opt_resize, "resize"},
636};
637
638static unsigned long get_sb_block(void **data)
639{
640 unsigned long sb_block;
641 char *options = (char *) *data;
642
643 if (!options || strncmp(options, "sb=", 3) != 0)
644 return 1; /* Default location */
645 options += 3;
646 sb_block = simple_strtoul(options, &options, 0);
647 if (*options && *options != ',') {
648 printk("EXT3-fs: Invalid sb specification: %s\n",
649 (char *) *data);
650 return 1;
651 }
652 if (*options == ',')
653 options++;
654 *data = (void *) options;
655 return sb_block;
656}
657
658static int parse_options (char * options, struct super_block *sb,
659 unsigned long * inum, unsigned long *n_blocks_count, int is_remount)
660{
661 struct ext3_sb_info *sbi = EXT3_SB(sb);
662 char * p;
663 substring_t args[MAX_OPT_ARGS];
664 int data_opt = 0;
665 int option;
666#ifdef CONFIG_QUOTA
667 int qtype;
668 char *qname;
669#endif
670
671 if (!options)
672 return 1;
673
674 while ((p = strsep (&options, ",")) != NULL) {
675 int token;
676 if (!*p)
677 continue;
678
679 token = match_token(p, tokens, args);
680 switch (token) {
681 case Opt_bsd_df:
682 clear_opt (sbi->s_mount_opt, MINIX_DF);
683 break;
684 case Opt_minix_df:
685 set_opt (sbi->s_mount_opt, MINIX_DF);
686 break;
687 case Opt_grpid:
688 set_opt (sbi->s_mount_opt, GRPID);
689 break;
690 case Opt_nogrpid:
691 clear_opt (sbi->s_mount_opt, GRPID);
692 break;
693 case Opt_resuid:
694 if (match_int(&args[0], &option))
695 return 0;
696 sbi->s_resuid = option;
697 break;
698 case Opt_resgid:
699 if (match_int(&args[0], &option))
700 return 0;
701 sbi->s_resgid = option;
702 break;
703 case Opt_sb:
704 /* handled by get_sb_block() instead of here */
705 /* *sb_block = match_int(&args[0]); */
706 break;
707 case Opt_err_panic:
708 clear_opt (sbi->s_mount_opt, ERRORS_CONT);
709 clear_opt (sbi->s_mount_opt, ERRORS_RO);
710 set_opt (sbi->s_mount_opt, ERRORS_PANIC);
711 break;
712 case Opt_err_ro:
713 clear_opt (sbi->s_mount_opt, ERRORS_CONT);
714 clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
715 set_opt (sbi->s_mount_opt, ERRORS_RO);
716 break;
717 case Opt_err_cont:
718 clear_opt (sbi->s_mount_opt, ERRORS_RO);
719 clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
720 set_opt (sbi->s_mount_opt, ERRORS_CONT);
721 break;
722 case Opt_nouid32:
723 set_opt (sbi->s_mount_opt, NO_UID32);
724 break;
725 case Opt_check:
726#ifdef CONFIG_EXT3_CHECK
727 set_opt (sbi->s_mount_opt, CHECK);
728#else
729 printk(KERN_ERR
730 "EXT3 Check option not supported\n");
731#endif
732 break;
733 case Opt_nocheck:
734 clear_opt (sbi->s_mount_opt, CHECK);
735 break;
736 case Opt_debug:
737 set_opt (sbi->s_mount_opt, DEBUG);
738 break;
739 case Opt_oldalloc:
740 set_opt (sbi->s_mount_opt, OLDALLOC);
741 break;
742 case Opt_orlov:
743 clear_opt (sbi->s_mount_opt, OLDALLOC);
744 break;
745#ifdef CONFIG_EXT3_FS_XATTR
746 case Opt_user_xattr:
747 set_opt (sbi->s_mount_opt, XATTR_USER);
748 break;
749 case Opt_nouser_xattr:
750 clear_opt (sbi->s_mount_opt, XATTR_USER);
751 break;
752#else
753 case Opt_user_xattr:
754 case Opt_nouser_xattr:
755 printk("EXT3 (no)user_xattr options not supported\n");
756 break;
757#endif
758#ifdef CONFIG_EXT3_FS_POSIX_ACL
759 case Opt_acl:
760 set_opt(sbi->s_mount_opt, POSIX_ACL);
761 break;
762 case Opt_noacl:
763 clear_opt(sbi->s_mount_opt, POSIX_ACL);
764 break;
765#else
766 case Opt_acl:
767 case Opt_noacl:
768 printk("EXT3 (no)acl options not supported\n");
769 break;
770#endif
771 case Opt_reservation:
772 set_opt(sbi->s_mount_opt, RESERVATION);
773 break;
774 case Opt_noreservation:
775 clear_opt(sbi->s_mount_opt, RESERVATION);
776 break;
777 case Opt_journal_update:
778 /* @@@ FIXME */
779 /* Eventually we will want to be able to create
780 a journal file here. For now, only allow the
781 user to specify an existing inode to be the
782 journal file. */
783 if (is_remount) {
784 printk(KERN_ERR "EXT3-fs: cannot specify "
785 "journal on remount\n");
786 return 0;
787 }
788 set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
789 break;
790 case Opt_journal_inum:
791 if (is_remount) {
792 printk(KERN_ERR "EXT3-fs: cannot specify "
793 "journal on remount\n");
794 return 0;
795 }
796 if (match_int(&args[0], &option))
797 return 0;
798 *inum = option;
799 break;
800 case Opt_noload:
801 set_opt (sbi->s_mount_opt, NOLOAD);
802 break;
803 case Opt_commit:
804 if (match_int(&args[0], &option))
805 return 0;
806 if (option < 0)
807 return 0;
808 if (option == 0)
809 option = JBD_DEFAULT_MAX_COMMIT_AGE;
810 sbi->s_commit_interval = HZ * option;
811 break;
812 case Opt_data_journal:
813 data_opt = EXT3_MOUNT_JOURNAL_DATA;
814 goto datacheck;
815 case Opt_data_ordered:
816 data_opt = EXT3_MOUNT_ORDERED_DATA;
817 goto datacheck;
818 case Opt_data_writeback:
819 data_opt = EXT3_MOUNT_WRITEBACK_DATA;
820 datacheck:
821 if (is_remount) {
822 if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS)
823 != data_opt) {
824 printk(KERN_ERR
825 "EXT3-fs: cannot change data "
826 "mode on remount\n");
827 return 0;
828 }
829 } else {
830 sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS;
831 sbi->s_mount_opt |= data_opt;
832 }
833 break;
834#ifdef CONFIG_QUOTA
835 case Opt_usrjquota:
836 qtype = USRQUOTA;
837 goto set_qf_name;
838 case Opt_grpjquota:
839 qtype = GRPQUOTA;
840set_qf_name:
841 if (sb_any_quota_enabled(sb)) {
842 printk(KERN_ERR
843 "EXT3-fs: Cannot change journalled "
844 "quota options when quota turned on.\n");
845 return 0;
846 }
847 qname = match_strdup(&args[0]);
848 if (!qname) {
849 printk(KERN_ERR
850 "EXT3-fs: not enough memory for "
851 "storing quotafile name.\n");
852 return 0;
853 }
854 if (sbi->s_qf_names[qtype] &&
855 strcmp(sbi->s_qf_names[qtype], qname)) {
856 printk(KERN_ERR
857 "EXT3-fs: %s quota file already "
858 "specified.\n", QTYPE2NAME(qtype));
859 kfree(qname);
860 return 0;
861 }
862 sbi->s_qf_names[qtype] = qname;
863 if (strchr(sbi->s_qf_names[qtype], '/')) {
864 printk(KERN_ERR
865 "EXT3-fs: quotafile must be on "
866 "filesystem root.\n");
867 kfree(sbi->s_qf_names[qtype]);
868 sbi->s_qf_names[qtype] = NULL;
869 return 0;
870 }
871 break;
872 case Opt_offusrjquota:
873 qtype = USRQUOTA;
874 goto clear_qf_name;
875 case Opt_offgrpjquota:
876 qtype = GRPQUOTA;
877clear_qf_name:
878 if (sb_any_quota_enabled(sb)) {
879 printk(KERN_ERR "EXT3-fs: Cannot change "
880 "journalled quota options when "
881 "quota turned on.\n");
882 return 0;
883 }
884 kfree(sbi->s_qf_names[qtype]);
885 sbi->s_qf_names[qtype] = NULL;
886 break;
887 case Opt_jqfmt_vfsold:
888 sbi->s_jquota_fmt = QFMT_VFS_OLD;
889 break;
890 case Opt_jqfmt_vfsv0:
891 sbi->s_jquota_fmt = QFMT_VFS_V0;
892 break;
893#else
894 case Opt_usrjquota:
895 case Opt_grpjquota:
896 case Opt_offusrjquota:
897 case Opt_offgrpjquota:
898 case Opt_jqfmt_vfsold:
899 case Opt_jqfmt_vfsv0:
900 printk(KERN_ERR
901 "EXT3-fs: journalled quota options not "
902 "supported.\n");
903 break;
904#endif
905 case Opt_abort:
906 set_opt(sbi->s_mount_opt, ABORT);
907 break;
908 case Opt_barrier:
909 if (match_int(&args[0], &option))
910 return 0;
911 if (option)
912 set_opt(sbi->s_mount_opt, BARRIER);
913 else
914 clear_opt(sbi->s_mount_opt, BARRIER);
915 break;
916 case Opt_ignore:
917 break;
918 case Opt_resize:
919 if (!n_blocks_count) {
920 printk("EXT3-fs: resize option only available "
921 "for remount\n");
922 return 0;
923 }
924 match_int(&args[0], &option);
925 *n_blocks_count = option;
926 break;
927 case Opt_nobh:
928 set_opt(sbi->s_mount_opt, NOBH);
929 break;
930 default:
931 printk (KERN_ERR
932 "EXT3-fs: Unrecognized mount option \"%s\" "
933 "or missing value\n", p);
934 return 0;
935 }
936 }
937#ifdef CONFIG_QUOTA
938 if (!sbi->s_jquota_fmt && (sbi->s_qf_names[USRQUOTA] ||
939 sbi->s_qf_names[GRPQUOTA])) {
940 printk(KERN_ERR
941 "EXT3-fs: journalled quota format not specified.\n");
942 return 0;
943 }
944#endif
945
946 return 1;
947}
948
949static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
950 int read_only)
951{
952 struct ext3_sb_info *sbi = EXT3_SB(sb);
953 int res = 0;
954
955 if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) {
956 printk (KERN_ERR "EXT3-fs warning: revision level too high, "
957 "forcing read-only mode\n");
958 res = MS_RDONLY;
959 }
960 if (read_only)
961 return res;
962 if (!(sbi->s_mount_state & EXT3_VALID_FS))
963 printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, "
964 "running e2fsck is recommended\n");
965 else if ((sbi->s_mount_state & EXT3_ERROR_FS))
966 printk (KERN_WARNING
967 "EXT3-fs warning: mounting fs with errors, "
968 "running e2fsck is recommended\n");
969 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
970 le16_to_cpu(es->s_mnt_count) >=
971 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
972 printk (KERN_WARNING
973 "EXT3-fs warning: maximal mount count reached, "
974 "running e2fsck is recommended\n");
975 else if (le32_to_cpu(es->s_checkinterval) &&
976 (le32_to_cpu(es->s_lastcheck) +
977 le32_to_cpu(es->s_checkinterval) <= get_seconds()))
978 printk (KERN_WARNING
979 "EXT3-fs warning: checktime reached, "
980 "running e2fsck is recommended\n");
981#if 0
982 /* @@@ We _will_ want to clear the valid bit if we find
983 inconsistencies, to force a fsck at reboot. But for
984 a plain journaled filesystem we can keep it set as
985 valid forever! :) */
986 es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT3_VALID_FS);
987#endif
988 if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
989 es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
990 es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1);
991 es->s_mtime = cpu_to_le32(get_seconds());
992 ext3_update_dynamic_rev(sb);
993 EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
994
995 ext3_commit_super(sb, es, 1);
996 if (test_opt(sb, DEBUG))
997 printk(KERN_INFO "[EXT3 FS bs=%lu, gc=%lu, "
998 "bpg=%lu, ipg=%lu, mo=%04lx]\n",
999 sb->s_blocksize,
1000 sbi->s_groups_count,
1001 EXT3_BLOCKS_PER_GROUP(sb),
1002 EXT3_INODES_PER_GROUP(sb),
1003 sbi->s_mount_opt);
1004
1005 printk(KERN_INFO "EXT3 FS on %s, ", sb->s_id);
1006 if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
1007 char b[BDEVNAME_SIZE];
1008
1009 printk("external journal on %s\n",
1010 bdevname(EXT3_SB(sb)->s_journal->j_dev, b));
1011 } else {
1012 printk("internal journal\n");
1013 }
1014#ifdef CONFIG_EXT3_CHECK
1015 if (test_opt (sb, CHECK)) {
1016 ext3_check_blocks_bitmap (sb);
1017 ext3_check_inodes_bitmap (sb);
1018 }
1019#endif
1020 return res;
1021}
1022
1023/* Called at mount-time, super-block is locked */
1024static int ext3_check_descriptors (struct super_block * sb)
1025{
1026 struct ext3_sb_info *sbi = EXT3_SB(sb);
1027 unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
1028 struct ext3_group_desc * gdp = NULL;
1029 int desc_block = 0;
1030 int i;
1031
1032 ext3_debug ("Checking group descriptors");
1033
1034 for (i = 0; i < sbi->s_groups_count; i++)
1035 {
1036 if ((i % EXT3_DESC_PER_BLOCK(sb)) == 0)
1037 gdp = (struct ext3_group_desc *)
1038 sbi->s_group_desc[desc_block++]->b_data;
1039 if (le32_to_cpu(gdp->bg_block_bitmap) < block ||
1040 le32_to_cpu(gdp->bg_block_bitmap) >=
1041 block + EXT3_BLOCKS_PER_GROUP(sb))
1042 {
1043 ext3_error (sb, "ext3_check_descriptors",
1044 "Block bitmap for group %d"
1045 " not in group (block %lu)!",
1046 i, (unsigned long)
1047 le32_to_cpu(gdp->bg_block_bitmap));
1048 return 0;
1049 }
1050 if (le32_to_cpu(gdp->bg_inode_bitmap) < block ||
1051 le32_to_cpu(gdp->bg_inode_bitmap) >=
1052 block + EXT3_BLOCKS_PER_GROUP(sb))
1053 {
1054 ext3_error (sb, "ext3_check_descriptors",
1055 "Inode bitmap for group %d"
1056 " not in group (block %lu)!",
1057 i, (unsigned long)
1058 le32_to_cpu(gdp->bg_inode_bitmap));
1059 return 0;
1060 }
1061 if (le32_to_cpu(gdp->bg_inode_table) < block ||
1062 le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >=
1063 block + EXT3_BLOCKS_PER_GROUP(sb))
1064 {
1065 ext3_error (sb, "ext3_check_descriptors",
1066 "Inode table for group %d"
1067 " not in group (block %lu)!",
1068 i, (unsigned long)
1069 le32_to_cpu(gdp->bg_inode_table));
1070 return 0;
1071 }
1072 block += EXT3_BLOCKS_PER_GROUP(sb);
1073 gdp++;
1074 }
1075
1076 sbi->s_es->s_free_blocks_count=cpu_to_le32(ext3_count_free_blocks(sb));
1077 sbi->s_es->s_free_inodes_count=cpu_to_le32(ext3_count_free_inodes(sb));
1078 return 1;
1079}
1080
1081
1082/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at
1083 * the superblock) which were deleted from all directories, but held open by
1084 * a process at the time of a crash. We walk the list and try to delete these
1085 * inodes at recovery time (only with a read-write filesystem).
1086 *
1087 * In order to keep the orphan inode chain consistent during traversal (in
1088 * case of crash during recovery), we link each inode into the superblock
1089 * orphan list_head and handle it the same way as an inode deletion during
1090 * normal operation (which journals the operations for us).
1091 *
1092 * We only do an iget() and an iput() on each inode, which is very safe if we
1093 * accidentally point at an in-use or already deleted inode. The worst that
1094 * can happen in this case is that we get a "bit already cleared" message from
1095 * ext3_free_inode(). The only reason we would point at a wrong inode is if
1096 * e2fsck was run on this filesystem, and it must have already done the orphan
1097 * inode cleanup for us, so we can safely abort without any further action.
1098 */
1099static void ext3_orphan_cleanup (struct super_block * sb,
1100 struct ext3_super_block * es)
1101{
1102 unsigned int s_flags = sb->s_flags;
1103 int nr_orphans = 0, nr_truncates = 0;
1104#ifdef CONFIG_QUOTA
1105 int i;
1106#endif
1107 if (!es->s_last_orphan) {
1108 jbd_debug(4, "no orphan inodes to clean up\n");
1109 return;
1110 }
1111
1112 if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
1113 if (es->s_last_orphan)
1114 jbd_debug(1, "Errors on filesystem, "
1115 "clearing orphan list.\n");
1116 es->s_last_orphan = 0;
1117 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
1118 return;
1119 }
1120
1121 if (s_flags & MS_RDONLY) {
1122 printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n",
1123 sb->s_id);
1124 sb->s_flags &= ~MS_RDONLY;
1125 }
1126#ifdef CONFIG_QUOTA
1127 /* Needed for iput() to work correctly and not trash data */
1128 sb->s_flags |= MS_ACTIVE;
1129 /* Turn on quotas so that they are updated correctly */
1130 for (i = 0; i < MAXQUOTAS; i++) {
1131 if (EXT3_SB(sb)->s_qf_names[i]) {
1132 int ret = ext3_quota_on_mount(sb, i);
1133 if (ret < 0)
1134 printk(KERN_ERR
1135 "EXT3-fs: Cannot turn on journalled "
1136 "quota: error %d\n", ret);
1137 }
1138 }
1139#endif
1140
1141 while (es->s_last_orphan) {
1142 struct inode *inode;
1143
1144 if (!(inode =
1145 ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) {
1146 es->s_last_orphan = 0;
1147 break;
1148 }
1149
1150 list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
1151 DQUOT_INIT(inode);
1152 if (inode->i_nlink) {
1153 printk(KERN_DEBUG
1154 "%s: truncating inode %ld to %Ld bytes\n",
1155 __FUNCTION__, inode->i_ino, inode->i_size);
1156 jbd_debug(2, "truncating inode %ld to %Ld bytes\n",
1157 inode->i_ino, inode->i_size);
1158 ext3_truncate(inode);
1159 nr_truncates++;
1160 } else {
1161 printk(KERN_DEBUG
1162 "%s: deleting unreferenced inode %ld\n",
1163 __FUNCTION__, inode->i_ino);
1164 jbd_debug(2, "deleting unreferenced inode %ld\n",
1165 inode->i_ino);
1166 nr_orphans++;
1167 }
1168 iput(inode); /* The delete magic happens here! */
1169 }
1170
1171#define PLURAL(x) (x), ((x)==1) ? "" : "s"
1172
1173 if (nr_orphans)
1174 printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n",
1175 sb->s_id, PLURAL(nr_orphans));
1176 if (nr_truncates)
1177 printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n",
1178 sb->s_id, PLURAL(nr_truncates));
1179#ifdef CONFIG_QUOTA
1180 /* Turn quotas off */
1181 for (i = 0; i < MAXQUOTAS; i++) {
1182 if (sb_dqopt(sb)->files[i])
1183 vfs_quota_off(sb, i);
1184 }
1185#endif
1186 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
1187}
1188
/*
 * Integer log2 built on ffz() of the complement.
 * NOTE(review): presumably ffz() yields the index of the lowest clear bit,
 * making this exact only for powers of two -- confirm against ffz().
 */
#define log2(n) (ffz(~(n)))
1190
1191/*
1192 * Maximal file size. There is a direct, and {,double-,triple-}indirect
1193 * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks.
1194 * We need to be 1 filesystem block less than the 2^32 sector limit.
1195 */
1196static loff_t ext3_max_size(int bits)
1197{
1198 loff_t res = EXT3_NDIR_BLOCKS;
1199 /* This constant is calculated to be the largest file size for a
1200 * dense, 4k-blocksize file such that the total number of
1201 * sectors in the file, including data and all indirect blocks,
1202 * does not exceed 2^32. */
1203 const loff_t upper_limit = 0x1ff7fffd000LL;
1204
1205 res += 1LL << (bits-2);
1206 res += 1LL << (2*(bits-2));
1207 res += 1LL << (3*(bits-2));
1208 res <<= bits;
1209 if (res > upper_limit)
1210 res = upper_limit;
1211 return res;
1212}
1213
1214static unsigned long descriptor_loc(struct super_block *sb,
1215 unsigned long logic_sb_block,
1216 int nr)
1217{
1218 struct ext3_sb_info *sbi = EXT3_SB(sb);
1219 unsigned long bg, first_data_block, first_meta_bg;
1220 int has_super = 0;
1221
1222 first_data_block = le32_to_cpu(sbi->s_es->s_first_data_block);
1223 first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
1224
1225 if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) ||
1226 nr < first_meta_bg)
1227 return (logic_sb_block + nr + 1);
1228 bg = sbi->s_desc_per_block * nr;
1229 if (ext3_bg_has_super(sb, bg))
1230 has_super = 1;
1231 return (first_data_block + has_super + (bg * sbi->s_blocks_per_group));
1232}
1233
1234
1235static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1236{
1237 struct buffer_head * bh;
1238 struct ext3_super_block *es = NULL;
1239 struct ext3_sb_info *sbi;
1240 unsigned long block;
1241 unsigned long sb_block = get_sb_block(&data);
1242 unsigned long logic_sb_block;
1243 unsigned long offset = 0;
1244 unsigned long journal_inum = 0;
1245 unsigned long def_mount_opts;
1246 struct inode *root;
1247 int blocksize;
1248 int hblock;
1249 int db_count;
1250 int i;
1251 int needs_recovery;
1252 __le32 features;
1253
1254 sbi = kmalloc(sizeof(*sbi), GFP_KERNEL);
1255 if (!sbi)
1256 return -ENOMEM;
1257 sb->s_fs_info = sbi;
1258 memset(sbi, 0, sizeof(*sbi));
1259 sbi->s_mount_opt = 0;
1260 sbi->s_resuid = EXT3_DEF_RESUID;
1261 sbi->s_resgid = EXT3_DEF_RESGID;
1262
1263 unlock_kernel();
1264
1265 blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
1266 if (!blocksize) {
1267 printk(KERN_ERR "EXT3-fs: unable to set blocksize\n");
1268 goto out_fail;
1269 }
1270
1271 /*
1272 * The ext3 superblock will not be buffer aligned for other than 1kB
1273 * block sizes. We need to calculate the offset from buffer start.
1274 */
1275 if (blocksize != EXT3_MIN_BLOCK_SIZE) {
1276 logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
1277 offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
1278 } else {
1279 logic_sb_block = sb_block;
1280 }
1281
1282 if (!(bh = sb_bread(sb, logic_sb_block))) {
1283 printk (KERN_ERR "EXT3-fs: unable to read superblock\n");
1284 goto out_fail;
1285 }
1286 /*
1287 * Note: s_es must be initialized as soon as possible because
1288 * some ext3 macro-instructions depend on its value
1289 */
1290 es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
1291 sbi->s_es = es;
1292 sb->s_magic = le16_to_cpu(es->s_magic);
1293 if (sb->s_magic != EXT3_SUPER_MAGIC)
1294 goto cantfind_ext3;
1295
1296 /* Set defaults before we parse the mount options */
1297 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
1298 if (def_mount_opts & EXT3_DEFM_DEBUG)
1299 set_opt(sbi->s_mount_opt, DEBUG);
1300 if (def_mount_opts & EXT3_DEFM_BSDGROUPS)
1301 set_opt(sbi->s_mount_opt, GRPID);
1302 if (def_mount_opts & EXT3_DEFM_UID16)
1303 set_opt(sbi->s_mount_opt, NO_UID32);
1304 if (def_mount_opts & EXT3_DEFM_XATTR_USER)
1305 set_opt(sbi->s_mount_opt, XATTR_USER);
1306 if (def_mount_opts & EXT3_DEFM_ACL)
1307 set_opt(sbi->s_mount_opt, POSIX_ACL);
1308 if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA)
1309 sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA;
1310 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED)
1311 sbi->s_mount_opt |= EXT3_MOUNT_ORDERED_DATA;
1312 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK)
1313 sbi->s_mount_opt |= EXT3_MOUNT_WRITEBACK_DATA;
1314
1315 if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC)
1316 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
1317 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_RO)
1318 set_opt(sbi->s_mount_opt, ERRORS_RO);
1319
1320 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
1321 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
1322
1323 set_opt(sbi->s_mount_opt, RESERVATION);
1324
1325 if (!parse_options ((char *) data, sb, &journal_inum, NULL, 0))
1326 goto failed_mount;
1327
1328 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
1329 ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
1330
1331 if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
1332 (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
1333 EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
1334 EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
1335 printk(KERN_WARNING
1336 "EXT3-fs warning: feature flags set on rev 0 fs, "
1337 "running e2fsck is recommended\n");
1338 /*
1339 * Check feature flags regardless of the revision level, since we
1340 * previously didn't change the revision level when setting the flags,
1341 * so there is a chance incompat flags are set on a rev 0 filesystem.
1342 */
1343 features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP);
1344 if (features) {
1345 printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of "
1346 "unsupported optional features (%x).\n",
1347 sb->s_id, le32_to_cpu(features));
1348 goto failed_mount;
1349 }
1350 features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP);
1351 if (!(sb->s_flags & MS_RDONLY) && features) {
1352 printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of "
1353 "unsupported optional features (%x).\n",
1354 sb->s_id, le32_to_cpu(features));
1355 goto failed_mount;
1356 }
1357 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
1358
1359 if (blocksize < EXT3_MIN_BLOCK_SIZE ||
1360 blocksize > EXT3_MAX_BLOCK_SIZE) {
1361 printk(KERN_ERR
1362 "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n",
1363 blocksize, sb->s_id);
1364 goto failed_mount;
1365 }
1366
1367 hblock = bdev_hardsect_size(sb->s_bdev);
1368 if (sb->s_blocksize != blocksize) {
1369 /*
1370 * Make sure the blocksize for the filesystem is larger
1371 * than the hardware sectorsize for the machine.
1372 */
1373 if (blocksize < hblock) {
1374 printk(KERN_ERR "EXT3-fs: blocksize %d too small for "
1375 "device blocksize %d.\n", blocksize, hblock);
1376 goto failed_mount;
1377 }
1378
1379 brelse (bh);
1380 sb_set_blocksize(sb, blocksize);
1381 logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
1382 offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
1383 bh = sb_bread(sb, logic_sb_block);
1384 if (!bh) {
1385 printk(KERN_ERR
1386 "EXT3-fs: Can't read superblock on 2nd try.\n");
1387 goto failed_mount;
1388 }
1389 es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
1390 sbi->s_es = es;
1391 if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
1392 printk (KERN_ERR
1393 "EXT3-fs: Magic mismatch, very weird !\n");
1394 goto failed_mount;
1395 }
1396 }
1397
1398 sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits);
1399
1400 if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) {
1401 sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE;
1402 sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO;
1403 } else {
1404 sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
1405 sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
1406 if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) ||
1407 (sbi->s_inode_size & (sbi->s_inode_size - 1)) ||
1408 (sbi->s_inode_size > blocksize)) {
1409 printk (KERN_ERR
1410 "EXT3-fs: unsupported inode size: %d\n",
1411 sbi->s_inode_size);
1412 goto failed_mount;
1413 }
1414 }
1415 sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
1416 le32_to_cpu(es->s_log_frag_size);
1417 if (blocksize != sbi->s_frag_size) {
1418 printk(KERN_ERR
1419 "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n",
1420 sbi->s_frag_size, blocksize);
1421 goto failed_mount;
1422 }
1423 sbi->s_frags_per_block = 1;
1424 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
1425 sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
1426 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
1427 if (EXT3_INODE_SIZE(sb) == 0)
1428 goto cantfind_ext3;
1429 sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb);
1430 if (sbi->s_inodes_per_block == 0)
1431 goto cantfind_ext3;
1432 sbi->s_itb_per_group = sbi->s_inodes_per_group /
1433 sbi->s_inodes_per_block;
1434 sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc);
1435 sbi->s_sbh = bh;
1436 sbi->s_mount_state = le16_to_cpu(es->s_state);
1437 sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb));
1438 sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb));
1439 for (i=0; i < 4; i++)
1440 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
1441 sbi->s_def_hash_version = es->s_def_hash_version;
1442
1443 if (sbi->s_blocks_per_group > blocksize * 8) {
1444 printk (KERN_ERR
1445 "EXT3-fs: #blocks per group too big: %lu\n",
1446 sbi->s_blocks_per_group);
1447 goto failed_mount;
1448 }
1449 if (sbi->s_frags_per_group > blocksize * 8) {
1450 printk (KERN_ERR
1451 "EXT3-fs: #fragments per group too big: %lu\n",
1452 sbi->s_frags_per_group);
1453 goto failed_mount;
1454 }
1455 if (sbi->s_inodes_per_group > blocksize * 8) {
1456 printk (KERN_ERR
1457 "EXT3-fs: #inodes per group too big: %lu\n",
1458 sbi->s_inodes_per_group);
1459 goto failed_mount;
1460 }
1461
1462 if (EXT3_BLOCKS_PER_GROUP(sb) == 0)
1463 goto cantfind_ext3;
1464 sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
1465 le32_to_cpu(es->s_first_data_block) +
1466 EXT3_BLOCKS_PER_GROUP(sb) - 1) /
1467 EXT3_BLOCKS_PER_GROUP(sb);
1468 db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) /
1469 EXT3_DESC_PER_BLOCK(sb);
1470 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
1471 GFP_KERNEL);
1472 if (sbi->s_group_desc == NULL) {
1473 printk (KERN_ERR "EXT3-fs: not enough memory\n");
1474 goto failed_mount;
1475 }
1476
1477 percpu_counter_init(&sbi->s_freeblocks_counter);
1478 percpu_counter_init(&sbi->s_freeinodes_counter);
1479 percpu_counter_init(&sbi->s_dirs_counter);
1480 bgl_lock_init(&sbi->s_blockgroup_lock);
1481
1482 for (i = 0; i < db_count; i++) {
1483 block = descriptor_loc(sb, logic_sb_block, i);
1484 sbi->s_group_desc[i] = sb_bread(sb, block);
1485 if (!sbi->s_group_desc[i]) {
1486 printk (KERN_ERR "EXT3-fs: "
1487 "can't read group descriptor %d\n", i);
1488 db_count = i;
1489 goto failed_mount2;
1490 }
1491 }
1492 if (!ext3_check_descriptors (sb)) {
1493 printk (KERN_ERR "EXT3-fs: group descriptors corrupted !\n");
1494 goto failed_mount2;
1495 }
1496 sbi->s_gdb_count = db_count;
1497 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
1498 spin_lock_init(&sbi->s_next_gen_lock);
1499 /* per fileystem reservation list head & lock */
1500 spin_lock_init(&sbi->s_rsv_window_lock);
1501 sbi->s_rsv_window_root = RB_ROOT;
1502 /* Add a single, static dummy reservation to the start of the
1503 * reservation window list --- it gives us a placeholder for
1504 * append-at-start-of-list which makes the allocation logic
1505 * _much_ simpler. */
1506 sbi->s_rsv_window_head.rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
1507 sbi->s_rsv_window_head.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
1508 sbi->s_rsv_window_head.rsv_alloc_hit = 0;
1509 sbi->s_rsv_window_head.rsv_goal_size = 0;
1510 ext3_rsv_window_add(sb, &sbi->s_rsv_window_head);
1511
1512 /*
1513 * set up enough so that it can read an inode
1514 */
1515 sb->s_op = &ext3_sops;
1516 sb->s_export_op = &ext3_export_ops;
1517 sb->s_xattr = ext3_xattr_handlers;
1518#ifdef CONFIG_QUOTA
1519 sb->s_qcop = &ext3_qctl_operations;
1520 sb->dq_op = &ext3_quota_operations;
1521#endif
1522 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
1523
1524 sb->s_root = NULL;
1525
1526 needs_recovery = (es->s_last_orphan != 0 ||
1527 EXT3_HAS_INCOMPAT_FEATURE(sb,
1528 EXT3_FEATURE_INCOMPAT_RECOVER));
1529
1530 /*
1531 * The first inode we look at is the journal inode. Don't try
1532 * root first: it may be modified in the journal!
1533 */
1534 if (!test_opt(sb, NOLOAD) &&
1535 EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
1536 if (ext3_load_journal(sb, es))
1537 goto failed_mount2;
1538 } else if (journal_inum) {
1539 if (ext3_create_journal(sb, es, journal_inum))
1540 goto failed_mount2;
1541 } else {
1542 if (!silent)
1543 printk (KERN_ERR
1544 "ext3: No journal on filesystem on %s\n",
1545 sb->s_id);
1546 goto failed_mount2;
1547 }
1548
1549 /* We have now updated the journal if required, so we can
1550 * validate the data journaling mode. */
1551 switch (test_opt(sb, DATA_FLAGS)) {
1552 case 0:
1553 /* No mode set, assume a default based on the journal
1554 capabilities: ORDERED_DATA if the journal can
1555 cope, else JOURNAL_DATA */
1556 if (journal_check_available_features
1557 (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
1558 set_opt(sbi->s_mount_opt, ORDERED_DATA);
1559 else
1560 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
1561 break;
1562
1563 case EXT3_MOUNT_ORDERED_DATA:
1564 case EXT3_MOUNT_WRITEBACK_DATA:
1565 if (!journal_check_available_features
1566 (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
1567 printk(KERN_ERR "EXT3-fs: Journal does not support "
1568 "requested data journaling mode\n");
1569 goto failed_mount3;
1570 }
1571 default:
1572 break;
1573 }
1574
1575 if (test_opt(sb, NOBH)) {
1576 if (sb->s_blocksize_bits != PAGE_CACHE_SHIFT) {
1577 printk(KERN_WARNING "EXT3-fs: Ignoring nobh option "
1578 "since filesystem blocksize doesn't match "
1579 "pagesize\n");
1580 clear_opt(sbi->s_mount_opt, NOBH);
1581 }
1582 if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) {
1583 printk(KERN_WARNING "EXT3-fs: Ignoring nobh option - "
1584 "its supported only with writeback mode\n");
1585 clear_opt(sbi->s_mount_opt, NOBH);
1586 }
1587 }
1588 /*
1589 * The journal_load will have done any necessary log recovery,
1590 * so we can safely mount the rest of the filesystem now.
1591 */
1592
1593 root = iget(sb, EXT3_ROOT_INO);
1594 sb->s_root = d_alloc_root(root);
1595 if (!sb->s_root) {
1596 printk(KERN_ERR "EXT3-fs: get root inode failed\n");
1597 iput(root);
1598 goto failed_mount3;
1599 }
1600 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
1601 dput(sb->s_root);
1602 sb->s_root = NULL;
1603 printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n");
1604 goto failed_mount3;
1605 }
1606
1607 ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
1608 /*
1609 * akpm: core read_super() calls in here with the superblock locked.
1610 * That deadlocks, because orphan cleanup needs to lock the superblock
1611 * in numerous places. Here we just pop the lock - it's relatively
1612 * harmless, because we are now ready to accept write_super() requests,
1613 * and aviro says that's the only reason for hanging onto the
1614 * superblock lock.
1615 */
1616 EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
1617 ext3_orphan_cleanup(sb, es);
1618 EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
1619 if (needs_recovery)
1620 printk (KERN_INFO "EXT3-fs: recovery complete.\n");
1621 ext3_mark_recovery_complete(sb, es);
1622 printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n",
1623 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
1624 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
1625 "writeback");
1626
1627 percpu_counter_mod(&sbi->s_freeblocks_counter,
1628 ext3_count_free_blocks(sb));
1629 percpu_counter_mod(&sbi->s_freeinodes_counter,
1630 ext3_count_free_inodes(sb));
1631 percpu_counter_mod(&sbi->s_dirs_counter,
1632 ext3_count_dirs(sb));
1633
1634 lock_kernel();
1635 return 0;
1636
1637cantfind_ext3:
1638 if (!silent)
1639 printk(KERN_ERR "VFS: Can't find ext3 filesystem on dev %s.\n",
1640 sb->s_id);
1641 goto failed_mount;
1642
1643failed_mount3:
1644 journal_destroy(sbi->s_journal);
1645failed_mount2:
1646 for (i = 0; i < db_count; i++)
1647 brelse(sbi->s_group_desc[i]);
1648 kfree(sbi->s_group_desc);
1649failed_mount:
1650#ifdef CONFIG_QUOTA
1651 for (i = 0; i < MAXQUOTAS; i++)
1652 kfree(sbi->s_qf_names[i]);
1653#endif
1654 ext3_blkdev_remove(sbi);
1655 brelse(bh);
1656out_fail:
1657 sb->s_fs_info = NULL;
1658 kfree(sbi);
1659 lock_kernel();
1660 return -EINVAL;
1661}
1662
1663/*
1664 * Setup any per-fs journal parameters now. We'll do this both on
1665 * initial mount, once the journal has been initialised but before we've
1666 * done any recovery; and again on any subsequent remount.
1667 */
1668static void ext3_init_journal_params(struct super_block *sb, journal_t *journal)
1669{
1670 struct ext3_sb_info *sbi = EXT3_SB(sb);
1671
1672 if (sbi->s_commit_interval)
1673 journal->j_commit_interval = sbi->s_commit_interval;
1674 /* We could also set up an ext3-specific default for the commit
1675 * interval here, but for now we'll just fall back to the jbd
1676 * default. */
1677
1678 spin_lock(&journal->j_state_lock);
1679 if (test_opt(sb, BARRIER))
1680 journal->j_flags |= JFS_BARRIER;
1681 else
1682 journal->j_flags &= ~JFS_BARRIER;
1683 spin_unlock(&journal->j_state_lock);
1684}
1685
1686static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum)
1687{
1688 struct inode *journal_inode;
1689 journal_t *journal;
1690
1691 /* First, test for the existence of a valid inode on disk. Bad
1692 * things happen if we iget() an unused inode, as the subsequent
1693 * iput() will try to delete it. */
1694
1695 journal_inode = iget(sb, journal_inum);
1696 if (!journal_inode) {
1697 printk(KERN_ERR "EXT3-fs: no journal found.\n");
1698 return NULL;
1699 }
1700 if (!journal_inode->i_nlink) {
1701 make_bad_inode(journal_inode);
1702 iput(journal_inode);
1703 printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n");
1704 return NULL;
1705 }
1706
1707 jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
1708 journal_inode, journal_inode->i_size);
1709 if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) {
1710 printk(KERN_ERR "EXT3-fs: invalid journal inode.\n");
1711 iput(journal_inode);
1712 return NULL;
1713 }
1714
1715 journal = journal_init_inode(journal_inode);
1716 if (!journal) {
1717 printk(KERN_ERR "EXT3-fs: Could not load journal inode\n");
1718 iput(journal_inode);
1719 return NULL;
1720 }
1721 journal->j_private = sb;
1722 ext3_init_journal_params(sb, journal);
1723 return journal;
1724}
1725
1726static journal_t *ext3_get_dev_journal(struct super_block *sb,
1727 dev_t j_dev)
1728{
1729 struct buffer_head * bh;
1730 journal_t *journal;
1731 int start;
1732 int len;
1733 int hblock, blocksize;
1734 unsigned long sb_block;
1735 unsigned long offset;
1736 struct ext3_super_block * es;
1737 struct block_device *bdev;
1738
1739 bdev = ext3_blkdev_get(j_dev);
1740 if (bdev == NULL)
1741 return NULL;
1742
1743 if (bd_claim(bdev, sb)) {
1744 printk(KERN_ERR
1745 "EXT3: failed to claim external journal device.\n");
1746 blkdev_put(bdev);
1747 return NULL;
1748 }
1749
1750 blocksize = sb->s_blocksize;
1751 hblock = bdev_hardsect_size(bdev);
1752 if (blocksize < hblock) {
1753 printk(KERN_ERR
1754 "EXT3-fs: blocksize too small for journal device.\n");
1755 goto out_bdev;
1756 }
1757
1758 sb_block = EXT3_MIN_BLOCK_SIZE / blocksize;
1759 offset = EXT3_MIN_BLOCK_SIZE % blocksize;
1760 set_blocksize(bdev, blocksize);
1761 if (!(bh = __bread(bdev, sb_block, blocksize))) {
1762 printk(KERN_ERR "EXT3-fs: couldn't read superblock of "
1763 "external journal\n");
1764 goto out_bdev;
1765 }
1766
1767 es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
1768 if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
1769 !(le32_to_cpu(es->s_feature_incompat) &
1770 EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
1771 printk(KERN_ERR "EXT3-fs: external journal has "
1772 "bad superblock\n");
1773 brelse(bh);
1774 goto out_bdev;
1775 }
1776
1777 if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
1778 printk(KERN_ERR "EXT3-fs: journal UUID does not match\n");
1779 brelse(bh);
1780 goto out_bdev;
1781 }
1782
1783 len = le32_to_cpu(es->s_blocks_count);
1784 start = sb_block + 1;
1785 brelse(bh); /* we're done with the superblock */
1786
1787 journal = journal_init_dev(bdev, sb->s_bdev,
1788 start, len, blocksize);
1789 if (!journal) {
1790 printk(KERN_ERR "EXT3-fs: failed to create device journal\n");
1791 goto out_bdev;
1792 }
1793 journal->j_private = sb;
1794 ll_rw_block(READ, 1, &journal->j_sb_buffer);
1795 wait_on_buffer(journal->j_sb_buffer);
1796 if (!buffer_uptodate(journal->j_sb_buffer)) {
1797 printk(KERN_ERR "EXT3-fs: I/O error on journal device\n");
1798 goto out_journal;
1799 }
1800 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
1801 printk(KERN_ERR "EXT3-fs: External journal has more than one "
1802 "user (unsupported) - %d\n",
1803 be32_to_cpu(journal->j_superblock->s_nr_users));
1804 goto out_journal;
1805 }
1806 EXT3_SB(sb)->journal_bdev = bdev;
1807 ext3_init_journal_params(sb, journal);
1808 return journal;
1809out_journal:
1810 journal_destroy(journal);
1811out_bdev:
1812 ext3_blkdev_put(bdev);
1813 return NULL;
1814}
1815
1816static int ext3_load_journal(struct super_block * sb,
1817 struct ext3_super_block * es)
1818{
1819 journal_t *journal;
1820 int journal_inum = le32_to_cpu(es->s_journal_inum);
1821 dev_t journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
1822 int err = 0;
1823 int really_read_only;
1824
1825 really_read_only = bdev_read_only(sb->s_bdev);
1826
1827 /*
1828 * Are we loading a blank journal or performing recovery after a
1829 * crash? For recovery, we need to check in advance whether we
1830 * can get read-write access to the device.
1831 */
1832
1833 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) {
1834 if (sb->s_flags & MS_RDONLY) {
1835 printk(KERN_INFO "EXT3-fs: INFO: recovery "
1836 "required on readonly filesystem.\n");
1837 if (really_read_only) {
1838 printk(KERN_ERR "EXT3-fs: write access "
1839 "unavailable, cannot proceed.\n");
1840 return -EROFS;
1841 }
1842 printk (KERN_INFO "EXT3-fs: write access will "
1843 "be enabled during recovery.\n");
1844 }
1845 }
1846
1847 if (journal_inum && journal_dev) {
1848 printk(KERN_ERR "EXT3-fs: filesystem has both journal "
1849 "and inode journals!\n");
1850 return -EINVAL;
1851 }
1852
1853 if (journal_inum) {
1854 if (!(journal = ext3_get_journal(sb, journal_inum)))
1855 return -EINVAL;
1856 } else {
1857 if (!(journal = ext3_get_dev_journal(sb, journal_dev)))
1858 return -EINVAL;
1859 }
1860
1861 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
1862 err = journal_update_format(journal);
1863 if (err) {
1864 printk(KERN_ERR "EXT3-fs: error updating journal.\n");
1865 journal_destroy(journal);
1866 return err;
1867 }
1868 }
1869
1870 if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER))
1871 err = journal_wipe(journal, !really_read_only);
1872 if (!err)
1873 err = journal_load(journal);
1874
1875 if (err) {
1876 printk(KERN_ERR "EXT3-fs: error loading journal.\n");
1877 journal_destroy(journal);
1878 return err;
1879 }
1880
1881 EXT3_SB(sb)->s_journal = journal;
1882 ext3_clear_journal_err(sb, es);
1883 return 0;
1884}
1885
1886static int ext3_create_journal(struct super_block * sb,
1887 struct ext3_super_block * es,
1888 int journal_inum)
1889{
1890 journal_t *journal;
1891
1892 if (sb->s_flags & MS_RDONLY) {
1893 printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to "
1894 "create journal.\n");
1895 return -EROFS;
1896 }
1897
1898 if (!(journal = ext3_get_journal(sb, journal_inum)))
1899 return -EINVAL;
1900
1901 printk(KERN_INFO "EXT3-fs: creating new journal on inode %d\n",
1902 journal_inum);
1903
1904 if (journal_create(journal)) {
1905 printk(KERN_ERR "EXT3-fs: error creating journal.\n");
1906 journal_destroy(journal);
1907 return -EIO;
1908 }
1909
1910 EXT3_SB(sb)->s_journal = journal;
1911
1912 ext3_update_dynamic_rev(sb);
1913 EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
1914 EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL);
1915
1916 es->s_journal_inum = cpu_to_le32(journal_inum);
1917 sb->s_dirt = 1;
1918
1919 /* Make sure we flush the recovery flag to disk. */
1920 ext3_commit_super(sb, es, 1);
1921
1922 return 0;
1923}
1924
1925static void ext3_commit_super (struct super_block * sb,
1926 struct ext3_super_block * es,
1927 int sync)
1928{
1929 struct buffer_head *sbh = EXT3_SB(sb)->s_sbh;
1930
1931 if (!sbh)
1932 return;
1933 es->s_wtime = cpu_to_le32(get_seconds());
1934 es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
1935 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
1936 BUFFER_TRACE(sbh, "marking dirty");
1937 mark_buffer_dirty(sbh);
1938 if (sync)
1939 sync_dirty_buffer(sbh);
1940}
1941
1942
1943/*
1944 * Have we just finished recovery? If so, and if we are mounting (or
1945 * remounting) the filesystem readonly, then we will end up with a
1946 * consistent fs on disk. Record that fact.
1947 */
1948static void ext3_mark_recovery_complete(struct super_block * sb,
1949 struct ext3_super_block * es)
1950{
1951 journal_t *journal = EXT3_SB(sb)->s_journal;
1952
1953 journal_lock_updates(journal);
1954 journal_flush(journal);
1955 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
1956 sb->s_flags & MS_RDONLY) {
1957 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
1958 sb->s_dirt = 0;
1959 ext3_commit_super(sb, es, 1);
1960 }
1961 journal_unlock_updates(journal);
1962}
1963
1964/*
1965 * If we are mounting (or read-write remounting) a filesystem whose journal
1966 * has recorded an error from a previous lifetime, move that error to the
1967 * main filesystem now.
1968 */
1969static void ext3_clear_journal_err(struct super_block * sb,
1970 struct ext3_super_block * es)
1971{
1972 journal_t *journal;
1973 int j_errno;
1974 const char *errstr;
1975
1976 journal = EXT3_SB(sb)->s_journal;
1977
1978 /*
1979 * Now check for any error status which may have been recorded in the
1980 * journal by a prior ext3_error() or ext3_abort()
1981 */
1982
1983 j_errno = journal_errno(journal);
1984 if (j_errno) {
1985 char nbuf[16];
1986
1987 errstr = ext3_decode_error(sb, j_errno, nbuf);
1988 ext3_warning(sb, __FUNCTION__, "Filesystem error recorded "
1989 "from previous mount: %s", errstr);
1990 ext3_warning(sb, __FUNCTION__, "Marking fs in need of "
1991 "filesystem check.");
1992
1993 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
1994 es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
1995 ext3_commit_super (sb, es, 1);
1996
1997 journal_clear_err(journal);
1998 }
1999}
2000
2001/*
2002 * Force the running and committing transactions to commit,
2003 * and wait on the commit.
2004 */
2005int ext3_force_commit(struct super_block *sb)
2006{
2007 journal_t *journal;
2008 int ret;
2009
2010 if (sb->s_flags & MS_RDONLY)
2011 return 0;
2012
2013 journal = EXT3_SB(sb)->s_journal;
2014 sb->s_dirt = 0;
2015 ret = ext3_journal_force_commit(journal);
2016 return ret;
2017}
2018
2019/*
2020 * Ext3 always journals updates to the superblock itself, so we don't
2021 * have to propagate any other updates to the superblock on disk at this
2022 * point. Just start an async writeback to get the buffers on their way
2023 * to the disk.
2024 *
2025 * This implicitly triggers the writebehind on sync().
2026 */
2027
2028static void ext3_write_super (struct super_block * sb)
2029{
2030 if (down_trylock(&sb->s_lock) == 0)
2031 BUG();
2032 sb->s_dirt = 0;
2033}
2034
2035static int ext3_sync_fs(struct super_block *sb, int wait)
2036{
2037 tid_t target;
2038
2039 sb->s_dirt = 0;
2040 if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
2041 if (wait)
2042 log_wait_commit(EXT3_SB(sb)->s_journal, target);
2043 }
2044 return 0;
2045}
2046
2047/*
2048 * LVM calls this function before a (read-only) snapshot is created. This
2049 * gives us a chance to flush the journal completely and mark the fs clean.
2050 */
2051static void ext3_write_super_lockfs(struct super_block *sb)
2052{
2053 sb->s_dirt = 0;
2054
2055 if (!(sb->s_flags & MS_RDONLY)) {
2056 journal_t *journal = EXT3_SB(sb)->s_journal;
2057
2058 /* Now we set up the journal barrier. */
2059 journal_lock_updates(journal);
2060 journal_flush(journal);
2061
2062 /* Journal blocked and flushed, clear needs_recovery flag. */
2063 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2064 ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
2065 }
2066}
2067
2068/*
2069 * Called by LVM after the snapshot is done. We need to reset the RECOVER
2070 * flag here, even though the filesystem is not technically dirty yet.
2071 */
2072static void ext3_unlockfs(struct super_block *sb)
2073{
2074 if (!(sb->s_flags & MS_RDONLY)) {
2075 lock_super(sb);
2076 /* Reser the needs_recovery flag before the fs is unlocked. */
2077 EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2078 ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
2079 unlock_super(sb);
2080 journal_unlock_updates(EXT3_SB(sb)->s_journal);
2081 }
2082}
2083
2084static int ext3_remount (struct super_block * sb, int * flags, char * data)
2085{
2086 struct ext3_super_block * es;
2087 struct ext3_sb_info *sbi = EXT3_SB(sb);
2088 unsigned long tmp;
2089 unsigned long n_blocks_count = 0;
2090
2091 /*
2092 * Allow the "check" option to be passed as a remount option.
2093 */
2094 if (!parse_options(data, sb, &tmp, &n_blocks_count, 1))
2095 return -EINVAL;
2096
2097 if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
2098 ext3_abort(sb, __FUNCTION__, "Abort forced by user");
2099
2100 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
2101 ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
2102
2103 es = sbi->s_es;
2104
2105 ext3_init_journal_params(sb, sbi->s_journal);
2106
2107 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
2108 n_blocks_count > le32_to_cpu(es->s_blocks_count)) {
2109 if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
2110 return -EROFS;
2111
2112 if (*flags & MS_RDONLY) {
2113 /*
2114 * First of all, the unconditional stuff we have to do
2115 * to disable replay of the journal when we next remount
2116 */
2117 sb->s_flags |= MS_RDONLY;
2118
2119 /*
2120 * OK, test if we are remounting a valid rw partition
2121 * readonly, and if so set the rdonly flag and then
2122 * mark the partition as valid again.
2123 */
2124 if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) &&
2125 (sbi->s_mount_state & EXT3_VALID_FS))
2126 es->s_state = cpu_to_le16(sbi->s_mount_state);
2127
2128 ext3_mark_recovery_complete(sb, es);
2129 } else {
2130 __le32 ret;
2131 if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
2132 ~EXT3_FEATURE_RO_COMPAT_SUPP))) {
2133 printk(KERN_WARNING "EXT3-fs: %s: couldn't "
2134 "remount RDWR because of unsupported "
2135 "optional features (%x).\n",
2136 sb->s_id, le32_to_cpu(ret));
2137 return -EROFS;
2138 }
2139 /*
2140 * Mounting a RDONLY partition read-write, so reread
2141 * and store the current valid flag. (It may have
2142 * been changed by e2fsck since we originally mounted
2143 * the partition.)
2144 */
2145 ext3_clear_journal_err(sb, es);
2146 sbi->s_mount_state = le16_to_cpu(es->s_state);
2147 if ((ret = ext3_group_extend(sb, es, n_blocks_count)))
2148 return ret;
2149 if (!ext3_setup_super (sb, es, 0))
2150 sb->s_flags &= ~MS_RDONLY;
2151 }
2152 }
2153 return 0;
2154}
2155
2156static int ext3_statfs (struct super_block * sb, struct kstatfs * buf)
2157{
2158 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
2159 unsigned long overhead;
2160 int i;
2161
2162 if (test_opt (sb, MINIX_DF))
2163 overhead = 0;
2164 else {
2165 unsigned long ngroups;
2166 ngroups = EXT3_SB(sb)->s_groups_count;
2167 smp_rmb();
2168
2169 /*
2170 * Compute the overhead (FS structures)
2171 */
2172
2173 /*
2174 * All of the blocks before first_data_block are
2175 * overhead
2176 */
2177 overhead = le32_to_cpu(es->s_first_data_block);
2178
2179 /*
2180 * Add the overhead attributed to the superblock and
2181 * block group descriptors. If the sparse superblocks
2182 * feature is turned on, then not all groups have this.
2183 */
2184 for (i = 0; i < ngroups; i++) {
2185 overhead += ext3_bg_has_super(sb, i) +
2186 ext3_bg_num_gdb(sb, i);
2187 cond_resched();
2188 }
2189
2190 /*
2191 * Every block group has an inode bitmap, a block
2192 * bitmap, and an inode table.
2193 */
2194 overhead += (ngroups * (2 + EXT3_SB(sb)->s_itb_per_group));
2195 }
2196
2197 buf->f_type = EXT3_SUPER_MAGIC;
2198 buf->f_bsize = sb->s_blocksize;
2199 buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead;
2200 buf->f_bfree = ext3_count_free_blocks (sb);
2201 buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
2202 if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
2203 buf->f_bavail = 0;
2204 buf->f_files = le32_to_cpu(es->s_inodes_count);
2205 buf->f_ffree = ext3_count_free_inodes (sb);
2206 buf->f_namelen = EXT3_NAME_LEN;
2207 return 0;
2208}
2209
/* Helper functions for writing quotas on sync - we need to start a
 * transaction before the quota file is locked for write. Otherwise there
 * are possible deadlocks:
 *
 *   Process 1                         Process 2
 *   ext3_create()                     quota_sync()
 *     journal_start()                   write_dquot()
 *     DQUOT_INIT()                        down(dqio_sem)
 *       down(dqio_sem)                    journal_start()
 *
 */
2219
2220#ifdef CONFIG_QUOTA
2221
2222static inline struct inode *dquot_to_inode(struct dquot *dquot)
2223{
2224 return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
2225}
2226
2227static int ext3_dquot_initialize(struct inode *inode, int type)
2228{
2229 handle_t *handle;
2230 int ret, err;
2231
2232 /* We may create quota structure so we need to reserve enough blocks */
2233 handle = ext3_journal_start(inode, 2*EXT3_QUOTA_INIT_BLOCKS);
2234 if (IS_ERR(handle))
2235 return PTR_ERR(handle);
2236 ret = dquot_initialize(inode, type);
2237 err = ext3_journal_stop(handle);
2238 if (!ret)
2239 ret = err;
2240 return ret;
2241}
2242
2243static int ext3_dquot_drop(struct inode *inode)
2244{
2245 handle_t *handle;
2246 int ret, err;
2247
2248 /* We may delete quota structure so we need to reserve enough blocks */
2249 handle = ext3_journal_start(inode, 2*EXT3_QUOTA_INIT_BLOCKS);
2250 if (IS_ERR(handle))
2251 return PTR_ERR(handle);
2252 ret = dquot_drop(inode);
2253 err = ext3_journal_stop(handle);
2254 if (!ret)
2255 ret = err;
2256 return ret;
2257}
2258
2259static int ext3_write_dquot(struct dquot *dquot)
2260{
2261 int ret, err;
2262 handle_t *handle;
2263 struct inode *inode;
2264
2265 inode = dquot_to_inode(dquot);
2266 handle = ext3_journal_start(inode,
2267 EXT3_QUOTA_TRANS_BLOCKS);
2268 if (IS_ERR(handle))
2269 return PTR_ERR(handle);
2270 ret = dquot_commit(dquot);
2271 err = ext3_journal_stop(handle);
2272 if (!ret)
2273 ret = err;
2274 return ret;
2275}
2276
2277static int ext3_acquire_dquot(struct dquot *dquot)
2278{
2279 int ret, err;
2280 handle_t *handle;
2281
2282 handle = ext3_journal_start(dquot_to_inode(dquot),
2283 EXT3_QUOTA_INIT_BLOCKS);
2284 if (IS_ERR(handle))
2285 return PTR_ERR(handle);
2286 ret = dquot_acquire(dquot);
2287 err = ext3_journal_stop(handle);
2288 if (!ret)
2289 ret = err;
2290 return ret;
2291}
2292
2293static int ext3_release_dquot(struct dquot *dquot)
2294{
2295 int ret, err;
2296 handle_t *handle;
2297
2298 handle = ext3_journal_start(dquot_to_inode(dquot),
2299 EXT3_QUOTA_INIT_BLOCKS);
2300 if (IS_ERR(handle))
2301 return PTR_ERR(handle);
2302 ret = dquot_release(dquot);
2303 err = ext3_journal_stop(handle);
2304 if (!ret)
2305 ret = err;
2306 return ret;
2307}
2308
2309static int ext3_mark_dquot_dirty(struct dquot *dquot)
2310{
2311 /* Are we journalling quotas? */
2312 if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
2313 EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
2314 dquot_mark_dquot_dirty(dquot);
2315 return ext3_write_dquot(dquot);
2316 } else {
2317 return dquot_mark_dquot_dirty(dquot);
2318 }
2319}
2320
2321static int ext3_write_info(struct super_block *sb, int type)
2322{
2323 int ret, err;
2324 handle_t *handle;
2325
2326 /* Data block + inode block */
2327 handle = ext3_journal_start(sb->s_root->d_inode, 2);
2328 if (IS_ERR(handle))
2329 return PTR_ERR(handle);
2330 ret = dquot_commit_info(sb, type);
2331 err = ext3_journal_stop(handle);
2332 if (!ret)
2333 ret = err;
2334 return ret;
2335}
2336
2337/*
2338 * Turn on quotas during mount time - we need to find
2339 * the quota file and such...
2340 */
2341static int ext3_quota_on_mount(struct super_block *sb, int type)
2342{
2343 int err;
2344 struct dentry *dentry;
2345 struct qstr name = { .name = EXT3_SB(sb)->s_qf_names[type],
2346 .hash = 0,
2347 .len = strlen(EXT3_SB(sb)->s_qf_names[type])};
2348
2349 dentry = lookup_hash(&name, sb->s_root);
2350 if (IS_ERR(dentry))
2351 return PTR_ERR(dentry);
2352 err = vfs_quota_on_mount(type, EXT3_SB(sb)->s_jquota_fmt, dentry);
2353 /* Now invalidate and put the dentry - quota got its own reference
2354 * to inode and dentry has at least wrong hash so we had better
2355 * throw it away */
2356 d_invalidate(dentry);
2357 dput(dentry);
2358 return err;
2359}
2360
2361/*
2362 * Standard function to be called on quota_on
2363 */
2364static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2365 char *path)
2366{
2367 int err;
2368 struct nameidata nd;
2369
2370 /* Not journalling quota? */
2371 if (!EXT3_SB(sb)->s_qf_names[USRQUOTA] &&
2372 !EXT3_SB(sb)->s_qf_names[GRPQUOTA])
2373 return vfs_quota_on(sb, type, format_id, path);
2374 err = path_lookup(path, LOOKUP_FOLLOW, &nd);
2375 if (err)
2376 return err;
2377 /* Quotafile not on the same filesystem? */
2378 if (nd.mnt->mnt_sb != sb) {
2379 path_release(&nd);
2380 return -EXDEV;
2381 }
2382 /* Quotafile not of fs root? */
2383 if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode)
2384 printk(KERN_WARNING
2385 "EXT3-fs: Quota file not on filesystem root. "
2386 "Journalled quota will not work.\n");
2387 path_release(&nd);
2388 return vfs_quota_on(sb, type, format_id, path);
2389}
2390
2391/* Read data from quotafile - avoid pagecache and such because we cannot afford
2392 * acquiring the locks... As quota files are never truncated and quota code
2393 * itself serializes the operations (and noone else should touch the files)
2394 * we don't have to be afraid of races */
2395static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
2396 size_t len, loff_t off)
2397{
2398 struct inode *inode = sb_dqopt(sb)->files[type];
2399 sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
2400 int err = 0;
2401 int offset = off & (sb->s_blocksize - 1);
2402 int tocopy;
2403 size_t toread;
2404 struct buffer_head *bh;
2405 loff_t i_size = i_size_read(inode);
2406
2407 if (off > i_size)
2408 return 0;
2409 if (off+len > i_size)
2410 len = i_size-off;
2411 toread = len;
2412 while (toread > 0) {
2413 tocopy = sb->s_blocksize - offset < toread ?
2414 sb->s_blocksize - offset : toread;
2415 bh = ext3_bread(NULL, inode, blk, 0, &err);
2416 if (err)
2417 return err;
2418 if (!bh) /* A hole? */
2419 memset(data, 0, tocopy);
2420 else
2421 memcpy(data, bh->b_data+offset, tocopy);
2422 brelse(bh);
2423 offset = 0;
2424 toread -= tocopy;
2425 data += tocopy;
2426 blk++;
2427 }
2428 return len;
2429}
2430
2431/* Write to quotafile (we know the transaction is already started and has
2432 * enough credits) */
2433static ssize_t ext3_quota_write(struct super_block *sb, int type,
2434 const char *data, size_t len, loff_t off)
2435{
2436 struct inode *inode = sb_dqopt(sb)->files[type];
2437 sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
2438 int err = 0;
2439 int offset = off & (sb->s_blocksize - 1);
2440 int tocopy;
2441 int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL;
2442 size_t towrite = len;
2443 struct buffer_head *bh;
2444 handle_t *handle = journal_current_handle();
2445
2446 down(&inode->i_sem);
2447 while (towrite > 0) {
2448 tocopy = sb->s_blocksize - offset < towrite ?
2449 sb->s_blocksize - offset : towrite;
2450 bh = ext3_bread(handle, inode, blk, 1, &err);
2451 if (!bh)
2452 goto out;
2453 if (journal_quota) {
2454 err = ext3_journal_get_write_access(handle, bh);
2455 if (err) {
2456 brelse(bh);
2457 goto out;
2458 }
2459 }
2460 lock_buffer(bh);
2461 memcpy(bh->b_data+offset, data, tocopy);
2462 flush_dcache_page(bh->b_page);
2463 unlock_buffer(bh);
2464 if (journal_quota)
2465 err = ext3_journal_dirty_metadata(handle, bh);
2466 else {
2467 /* Always do at least ordered writes for quotas */
2468 err = ext3_journal_dirty_data(handle, bh);
2469 mark_buffer_dirty(bh);
2470 }
2471 brelse(bh);
2472 if (err)
2473 goto out;
2474 offset = 0;
2475 towrite -= tocopy;
2476 data += tocopy;
2477 blk++;
2478 }
2479out:
2480 if (len == towrite)
2481 return err;
2482 if (inode->i_size < off+len-towrite) {
2483 i_size_write(inode, off+len-towrite);
2484 EXT3_I(inode)->i_disksize = inode->i_size;
2485 }
2486 inode->i_version++;
2487 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2488 ext3_mark_inode_dirty(handle, inode);
2489 up(&inode->i_sem);
2490 return len - towrite;
2491}
2492
2493#endif
2494
2495static struct super_block *ext3_get_sb(struct file_system_type *fs_type,
2496 int flags, const char *dev_name, void *data)
2497{
2498 return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
2499}
2500
2501static struct file_system_type ext3_fs_type = {
2502 .owner = THIS_MODULE,
2503 .name = "ext3",
2504 .get_sb = ext3_get_sb,
2505 .kill_sb = kill_block_super,
2506 .fs_flags = FS_REQUIRES_DEV,
2507};
2508
2509static int __init init_ext3_fs(void)
2510{
2511 int err = init_ext3_xattr();
2512 if (err)
2513 return err;
2514 err = init_inodecache();
2515 if (err)
2516 goto out1;
2517 err = register_filesystem(&ext3_fs_type);
2518 if (err)
2519 goto out;
2520 return 0;
2521out:
2522 destroy_inodecache();
2523out1:
2524 exit_ext3_xattr();
2525 return err;
2526}
2527
2528static void __exit exit_ext3_fs(void)
2529{
2530 unregister_filesystem(&ext3_fs_type);
2531 destroy_inodecache();
2532 exit_ext3_xattr();
2533}
2534
2535MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
2536MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
2537MODULE_LICENSE("GPL");
2538module_init(init_ext3_fs)
2539module_exit(exit_ext3_fs)
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c
new file mode 100644
index 000000000000..8c3e72818fb0
--- /dev/null
+++ b/fs/ext3/symlink.c
@@ -0,0 +1,54 @@
1/*
2 * linux/fs/ext3/symlink.c
3 *
4 * Only fast symlinks left here - the rest is done by generic code. AV, 1999
5 *
6 * Copyright (C) 1992, 1993, 1994, 1995
7 * Remy Card (card@masi.ibp.fr)
8 * Laboratoire MASI - Institut Blaise Pascal
9 * Universite Pierre et Marie Curie (Paris VI)
10 *
11 * from
12 *
13 * linux/fs/minix/symlink.c
14 *
15 * Copyright (C) 1991, 1992 Linus Torvalds
16 *
17 * ext3 symlink handling code
18 */
19
20#include <linux/fs.h>
21#include <linux/jbd.h>
22#include <linux/ext3_fs.h>
23#include <linux/namei.h>
24#include "xattr.h"
25
26static int ext3_follow_link(struct dentry *dentry, struct nameidata *nd)
27{
28 struct ext3_inode_info *ei = EXT3_I(dentry->d_inode);
29 nd_set_link(nd, (char*)ei->i_data);
30 return 0;
31}
32
33struct inode_operations ext3_symlink_inode_operations = {
34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link,
37#ifdef CONFIG_EXT3_FS_XATTR
38 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr,
40 .listxattr = ext3_listxattr,
41 .removexattr = generic_removexattr,
42#endif
43};
44
45struct inode_operations ext3_fast_symlink_inode_operations = {
46 .readlink = generic_readlink,
47 .follow_link = ext3_follow_link,
48#ifdef CONFIG_EXT3_FS_XATTR
49 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr,
51 .listxattr = ext3_listxattr,
52 .removexattr = generic_removexattr,
53#endif
54};
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
new file mode 100644
index 000000000000..4cbc6d0212d3
--- /dev/null
+++ b/fs/ext3/xattr.c
@@ -0,0 +1,1320 @@
1/*
2 * linux/fs/ext3/xattr.c
3 *
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 *
6 * Fix by Harrison Xing <harrison@mountainviewdata.com>.
7 * Ext3 code with a lot of help from Eric Jarman <ejarman@acm.org>.
8 * Extended attributes for symlinks and special files added per
9 * suggestion of Luka Renko <luka.renko@hermes.si>.
10 * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
11 * Red Hat Inc.
12 * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz
13 * and Andreas Gruenbacher <agruen@suse.de>.
14 */
15
16/*
17 * Extended attributes are stored directly in inodes (on file systems with
18 * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl
19 * field contains the block number if an inode uses an additional block. All
20 * attributes must fit in the inode and one additional block. Blocks that
21 * contain the identical set of attributes may be shared among several inodes.
22 * Identical blocks are detected by keeping a cache of blocks that have
23 * recently been accessed.
24 *
25 * The attributes in inodes and on blocks have a different header; the entries
26 * are stored in the same format:
27 *
28 * +------------------+
29 * | header |
30 * | entry 1 | |
31 * | entry 2 | | growing downwards
32 * | entry 3 | v
33 * | four null bytes |
34 * | . . . |
35 * | value 1 | ^
36 * | value 3 | | growing upwards
37 * | value 2 | |
38 * +------------------+
39 *
40 * The header is followed by multiple entry descriptors. In disk blocks, the
41 * entry descriptors are kept sorted. In inodes, they are unsorted. The
42 * attribute values are aligned to the end of the block in no specific order.
43 *
44 * Locking strategy
45 * ----------------
46 * EXT3_I(inode)->i_file_acl is protected by EXT3_I(inode)->xattr_sem.
47 * EA blocks are only changed if they are exclusive to an inode, so
48 * holding xattr_sem also means that nothing but the EA block's reference
49 * count can change. Multiple writers to the same block are synchronized
50 * by the buffer lock.
51 */
52
53#include <linux/init.h>
54#include <linux/fs.h>
55#include <linux/slab.h>
56#include <linux/ext3_jbd.h>
57#include <linux/ext3_fs.h>
58#include <linux/mbcache.h>
59#include <linux/quotaops.h>
60#include <linux/rwsem.h>
61#include "xattr.h"
62#include "acl.h"
63
/*
 * Accessors for an xattr block held in a buffer_head:
 *   BHDR(bh)             - the block header, i.e. the start of bh->b_data
 *   ENTRY(ptr)           - view ptr as an entry descriptor
 *   BFIRST(bh)           - first entry descriptor, right after the header
 *   IS_LAST_ENTRY(entry) - true when the first 32 bits at entry are zero,
 *                          which marks the end of the entry table
 */
64#define BHDR(bh) ((struct ext3_xattr_header *)((bh)->b_data))
65#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr))
66#define BFIRST(bh) ENTRY(BHDR(bh)+1)
67#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
68
/*
 * Accessors for attributes stored in the inode body:
 *   IHDR(inode, raw_inode) - the in-inode xattr header, located
 *                            EXT3_GOOD_OLD_INODE_SIZE + i_extra_isize bytes
 *                            into the raw inode
 *   IFIRST(hdr)            - first entry descriptor, right after that header
 */
69#define IHDR(inode, raw_inode) \
70 ((struct ext3_xattr_ibody_header *) \
71 ((void *)raw_inode + \
72 EXT3_GOOD_OLD_INODE_SIZE + \
73 EXT3_I(inode)->i_extra_isize))
74#define IFIRST(hdr) ((struct ext3_xattr_entry *)((hdr)+1))
75
/*
 * Debug tracing helpers.  With EXT3_XATTR_DEBUG defined, ea_idebug() and
 * ea_bdebug() print a KERN_DEBUG line prefixed with the inode (s_id and
 * inode number) or the block (device name and block number), followed by
 * the caller's printf-style message.  Otherwise both expand to nothing.
 */
76#ifdef EXT3_XATTR_DEBUG
77# define ea_idebug(inode, f...) do { \
78 printk(KERN_DEBUG "inode %s:%ld: ", \
79 inode->i_sb->s_id, inode->i_ino); \
80 printk(f); \
81 printk("\n"); \
82 } while (0)
83# define ea_bdebug(bh, f...) do { \
84 char b[BDEVNAME_SIZE]; \
85 printk(KERN_DEBUG "block %s:%lu: ", \
86 bdevname(bh->b_bdev, b), \
87 (unsigned long) bh->b_blocknr); \
88 printk(f); \
89 printk("\n"); \
90 } while (0)
91#else
92# define ea_idebug(f...)
93# define ea_bdebug(f...)
94#endif
95
/* Forward declarations for the block-cache helpers used throughout this file. */
96static void ext3_xattr_cache_insert(struct buffer_head *);
97static struct buffer_head *ext3_xattr_cache_find(struct inode *,
98 struct ext3_xattr_header *,
99 struct mb_cache_entry **);
100static void ext3_xattr_rehash(struct ext3_xattr_header *,
101 struct ext3_xattr_entry *);
102
/*
 * Cache of xattr blocks.  Entries are looked up by device and block number
 * and inserted together with the block's hash.
 */
103static struct mb_cache *ext3_xattr_cache;
104
/*
 * Handler lookup table indexed by an entry's name index (e_name_index).
 * Slots for features that are configured out stay NULL, and
 * ext3_xattr_handler() never consults slot 0.
 */
105static struct xattr_handler *ext3_xattr_handler_map[] = {
106 [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler,
107#ifdef CONFIG_EXT3_FS_POSIX_ACL
108 [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler,
109 [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext3_xattr_acl_default_handler,
110#endif
111 [EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler,
112#ifdef CONFIG_EXT3_FS_SECURITY
113 [EXT3_XATTR_INDEX_SECURITY] = &ext3_xattr_security_handler,
114#endif
115};
116
/*
 * NULL-terminated list of every xattr handler compiled into ext3.  The ACL
 * and security handlers are included only when their config options are
 * enabled.
 */
117struct xattr_handler *ext3_xattr_handlers[] = {
118 &ext3_xattr_user_handler,
119 &ext3_xattr_trusted_handler,
120#ifdef CONFIG_EXT3_FS_POSIX_ACL
121 &ext3_xattr_acl_access_handler,
122 &ext3_xattr_acl_default_handler,
123#endif
124#ifdef CONFIG_EXT3_FS_SECURITY
125 &ext3_xattr_security_handler,
126#endif
127 NULL
128};
129
130static inline struct xattr_handler *
131ext3_xattr_handler(int name_index)
132{
133 struct xattr_handler *handler = NULL;
134
135 if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map))
136 handler = ext3_xattr_handler_map[name_index];
137 return handler;
138}
139
140/*
141 * Inode operation listxattr()
142 *
143 * dentry->d_inode->i_sem: don't care
144 */
145ssize_t
146ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
147{
148 return ext3_xattr_list(dentry->d_inode, buffer, size);
149}
150
151static int
152ext3_xattr_check_names(struct ext3_xattr_entry *entry, void *end)
153{
154 while (!IS_LAST_ENTRY(entry)) {
155 struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(entry);
156 if ((void *)next >= end)
157 return -EIO;
158 entry = next;
159 }
160 return 0;
161}
162
163static inline int
164ext3_xattr_check_block(struct buffer_head *bh)
165{
166 int error;
167
168 if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
169 BHDR(bh)->h_blocks != cpu_to_le32(1))
170 return -EIO;
171 error = ext3_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
172 return error;
173}
174
175static inline int
176ext3_xattr_check_entry(struct ext3_xattr_entry *entry, size_t size)
177{
178 size_t value_size = le32_to_cpu(entry->e_value_size);
179
180 if (entry->e_value_block != 0 || value_size > size ||
181 le16_to_cpu(entry->e_value_offs) + value_size > size)
182 return -EIO;
183 return 0;
184}
185
186static int
187ext3_xattr_find_entry(struct ext3_xattr_entry **pentry, int name_index,
188 const char *name, size_t size, int sorted)
189{
190 struct ext3_xattr_entry *entry;
191 size_t name_len;
192 int cmp = 1;
193
194 if (name == NULL)
195 return -EINVAL;
196 name_len = strlen(name);
197 entry = *pentry;
198 for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
199 cmp = name_index - entry->e_name_index;
200 if (!cmp)
201 cmp = name_len - entry->e_name_len;
202 if (!cmp)
203 cmp = memcmp(name, entry->e_name, name_len);
204 if (cmp <= 0 && (sorted || cmp == 0))
205 break;
206 }
207 *pentry = entry;
208 if (!cmp && ext3_xattr_check_entry(entry, size))
209 return -EIO;
210 return cmp ? -ENODATA : 0;
211}
212
213int
214ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
215 void *buffer, size_t buffer_size)
216{
217 struct buffer_head *bh = NULL;
218 struct ext3_xattr_entry *entry;
219 size_t size;
220 int error;
221
222 ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
223 name_index, name, buffer, (long)buffer_size);
224
225 error = -ENODATA;
226 if (!EXT3_I(inode)->i_file_acl)
227 goto cleanup;
228 ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl);
229 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
230 if (!bh)
231 goto cleanup;
232 ea_bdebug(bh, "b_count=%d, refcount=%d",
233 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
234 if (ext3_xattr_check_block(bh)) {
235bad_block: ext3_error(inode->i_sb, __FUNCTION__,
236 "inode %ld: bad block %d", inode->i_ino,
237 EXT3_I(inode)->i_file_acl);
238 error = -EIO;
239 goto cleanup;
240 }
241 ext3_xattr_cache_insert(bh);
242 entry = BFIRST(bh);
243 error = ext3_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
244 if (error == -EIO)
245 goto bad_block;
246 if (error)
247 goto cleanup;
248 size = le32_to_cpu(entry->e_value_size);
249 if (buffer) {
250 error = -ERANGE;
251 if (size > buffer_size)
252 goto cleanup;
253 memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
254 size);
255 }
256 error = size;
257
258cleanup:
259 brelse(bh);
260 return error;
261}
262
263static int
264ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
265 void *buffer, size_t buffer_size)
266{
267 struct ext3_xattr_ibody_header *header;
268 struct ext3_xattr_entry *entry;
269 struct ext3_inode *raw_inode;
270 struct ext3_iloc iloc;
271 size_t size;
272 void *end;
273 int error;
274
275 if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR))
276 return -ENODATA;
277 error = ext3_get_inode_loc(inode, &iloc);
278 if (error)
279 return error;
280 raw_inode = ext3_raw_inode(&iloc);
281 header = IHDR(inode, raw_inode);
282 entry = IFIRST(header);
283 end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
284 error = ext3_xattr_check_names(entry, end);
285 if (error)
286 goto cleanup;
287 error = ext3_xattr_find_entry(&entry, name_index, name,
288 end - (void *)entry, 0);
289 if (error)
290 goto cleanup;
291 size = le32_to_cpu(entry->e_value_size);
292 if (buffer) {
293 error = -ERANGE;
294 if (size > buffer_size)
295 goto cleanup;
296 memcpy(buffer, (void *)IFIRST(header) +
297 le16_to_cpu(entry->e_value_offs), size);
298 }
299 error = size;
300
301cleanup:
302 brelse(iloc.bh);
303 return error;
304}
305
306/*
307 * ext3_xattr_get()
308 *
309 * Copy an extended attribute into the buffer
310 * provided, or compute the buffer size required.
311 * Buffer is NULL to compute the size of the buffer required.
312 *
313 * Returns a negative error number on failure, or the number of bytes
314 * used / required on success.
315 */
316int
317ext3_xattr_get(struct inode *inode, int name_index, const char *name,
318 void *buffer, size_t buffer_size)
319{
320 int error;
321
322 down_read(&EXT3_I(inode)->xattr_sem);
323 error = ext3_xattr_ibody_get(inode, name_index, name, buffer,
324 buffer_size);
325 if (error == -ENODATA)
326 error = ext3_xattr_block_get(inode, name_index, name, buffer,
327 buffer_size);
328 up_read(&EXT3_I(inode)->xattr_sem);
329 return error;
330}
331
332static int
333ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry,
334 char *buffer, size_t buffer_size)
335{
336 size_t rest = buffer_size;
337
338 for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
339 struct xattr_handler *handler =
340 ext3_xattr_handler(entry->e_name_index);
341
342 if (handler) {
343 size_t size = handler->list(inode, buffer, rest,
344 entry->e_name,
345 entry->e_name_len);
346 if (buffer) {
347 if (size > rest)
348 return -ERANGE;
349 buffer += size;
350 }
351 rest -= size;
352 }
353 }
354 return buffer_size - rest;
355}
356
357int
358ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
359{
360 struct buffer_head *bh = NULL;
361 int error;
362
363 ea_idebug(inode, "buffer=%p, buffer_size=%ld",
364 buffer, (long)buffer_size);
365
366 error = 0;
367 if (!EXT3_I(inode)->i_file_acl)
368 goto cleanup;
369 ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl);
370 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
371 error = -EIO;
372 if (!bh)
373 goto cleanup;
374 ea_bdebug(bh, "b_count=%d, refcount=%d",
375 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
376 if (ext3_xattr_check_block(bh)) {
377 ext3_error(inode->i_sb, __FUNCTION__,
378 "inode %ld: bad block %d", inode->i_ino,
379 EXT3_I(inode)->i_file_acl);
380 error = -EIO;
381 goto cleanup;
382 }
383 ext3_xattr_cache_insert(bh);
384 error = ext3_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size);
385
386cleanup:
387 brelse(bh);
388
389 return error;
390}
391
392static int
393ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
394{
395 struct ext3_xattr_ibody_header *header;
396 struct ext3_inode *raw_inode;
397 struct ext3_iloc iloc;
398 void *end;
399 int error;
400
401 if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR))
402 return 0;
403 error = ext3_get_inode_loc(inode, &iloc);
404 if (error)
405 return error;
406 raw_inode = ext3_raw_inode(&iloc);
407 header = IHDR(inode, raw_inode);
408 end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
409 error = ext3_xattr_check_names(IFIRST(header), end);
410 if (error)
411 goto cleanup;
412 error = ext3_xattr_list_entries(inode, IFIRST(header),
413 buffer, buffer_size);
414
415cleanup:
416 brelse(iloc.bh);
417 return error;
418}
419
420/*
421 * ext3_xattr_list()
422 *
423 * Copy a list of attribute names into the buffer
424 * provided, or compute the buffer size required.
425 * Buffer is NULL to compute the size of the buffer required.
426 *
427 * Returns a negative error number on failure, or the number of bytes
428 * used / required on success.
429 */
430int
431ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
432{
433 int i_error, b_error;
434
435 down_read(&EXT3_I(inode)->xattr_sem);
436 i_error = ext3_xattr_ibody_list(inode, buffer, buffer_size);
437 if (i_error < 0) {
438 b_error = 0;
439 } else {
440 if (buffer) {
441 buffer += i_error;
442 buffer_size -= i_error;
443 }
444 b_error = ext3_xattr_block_list(inode, buffer, buffer_size);
445 if (b_error < 0)
446 i_error = 0;
447 }
448 up_read(&EXT3_I(inode)->xattr_sem);
449 return i_error + b_error;
450}
451
452/*
453 * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is
454 * not set, set it.
455 */
456static void ext3_xattr_update_super_block(handle_t *handle,
457 struct super_block *sb)
458{
459 if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR))
460 return;
461
462 lock_super(sb);
463 if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) {
464 EXT3_SB(sb)->s_es->s_feature_compat |=
465 cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR);
466 sb->s_dirt = 1;
467 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
468 }
469 unlock_super(sb);
470}
471
472/*
473 * Release the xattr block BH: If the reference count is > 1, decrement
474 * it; otherwise free the block.
475 */
476static void
477ext3_xattr_release_block(handle_t *handle, struct inode *inode,
478 struct buffer_head *bh)
479{
480 struct mb_cache_entry *ce = NULL;
481
482 ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev, bh->b_blocknr);
483 if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
484 ea_bdebug(bh, "refcount now=0; freeing");
485 if (ce)
486 mb_cache_entry_free(ce);
487 ext3_free_blocks(handle, inode, bh->b_blocknr, 1);
488 get_bh(bh);
489 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
490 } else {
491 if (ext3_journal_get_write_access(handle, bh) == 0) {
492 lock_buffer(bh);
493 BHDR(bh)->h_refcount = cpu_to_le32(
494 le32_to_cpu(BHDR(bh)->h_refcount) - 1);
495 ext3_journal_dirty_metadata(handle, bh);
496 if (IS_SYNC(inode))
497 handle->h_sync = 1;
498 DQUOT_FREE_BLOCK(inode, 1);
499 unlock_buffer(bh);
500 ea_bdebug(bh, "refcount now=%d; releasing",
501 le32_to_cpu(BHDR(bh)->h_refcount));
502 }
503 if (ce)
504 mb_cache_entry_release(ce);
505 }
506}
507
/*
 * Description of the attribute a set operation works on: its name index
 * and name, plus the new value and its length.  A NULL value requests
 * removal of the attribute.
 */
508struct ext3_xattr_info {
509 int name_index;
510 const char *name;
511 const void *value;
512 size_t value_len;
513};
514
/*
 * State of a search within one attribute storage area (the inode body or
 * an xattr block):
 *   first     - first entry descriptor of the area
 *   base      - address that value offsets are applied to
 *   end       - end of the area
 *   here      - entry found, or the position where the search stopped
 *   not_found - 0 if the attribute was found, otherwise a negative errno
 *               (-ENODATA)
 */
515struct ext3_xattr_search {
516 struct ext3_xattr_entry *first;
517 void *base;
518 void *end;
519 struct ext3_xattr_entry *here;
520 int not_found;
521};
522
/*
 * ext3_xattr_set_entry()
 *
 * Create, replace or remove the attribute described by @i inside the
 * storage area described by @s.  @s->here is the existing entry, or the
 * insertion point when @s->not_found is set.  A NULL @i->value removes
 * the attribute.
 *
 * Returns 0 on success, or -ENOSPC if the new name and value do not fit
 * into the area.
 */
523static int
524ext3_xattr_set_entry(struct ext3_xattr_info *i, struct ext3_xattr_search *s)
525{
526 struct ext3_xattr_entry *last;
527 size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
528
529 /* Compute min_offs and last. */
530 last = s->first;
531 for (; !IS_LAST_ENTRY(last); last = EXT3_XATTR_NEXT(last)) {
532 if (!last->e_value_block && last->e_value_size) {
533 size_t offs = le16_to_cpu(last->e_value_offs);
534 if (offs < min_offs)
535 min_offs = offs;
536 }
537 }
 /*
  * Free space is the gap between the end marker of the entry table
  * (one __u32) and the lowest value offset in use.
  */
538 free = min_offs - ((void *)last - s->base) - sizeof(__u32);
 /* Space held by the existing name and value becomes available too. */
539 if (!s->not_found) {
540 if (!s->here->e_value_block && s->here->e_value_size) {
541 size_t size = le32_to_cpu(s->here->e_value_size);
542 free += EXT3_XATTR_SIZE(size);
543 }
544 free += EXT3_XATTR_LEN(name_len);
545 }
546 if (i->value) {
547 if (free < EXT3_XATTR_SIZE(i->value_len) ||
548 free < EXT3_XATTR_LEN(name_len) +
549 EXT3_XATTR_SIZE(i->value_len))
550 return -ENOSPC;
551 }
552
553 if (i->value && s->not_found) {
554 /* Insert the new name. */
555 size_t size = EXT3_XATTR_LEN(name_len);
556 size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
 /* Shift the following entries and the end marker to make room. */
557 memmove((void *)s->here + size, s->here, rest);
558 memset(s->here, 0, size);
559 s->here->e_name_index = i->name_index;
560 s->here->e_name_len = name_len;
561 memcpy(s->here->e_name, i->name, name_len);
562 } else {
563 if (!s->here->e_value_block && s->here->e_value_size) {
564 void *first_val = s->base + min_offs;
565 size_t offs = le16_to_cpu(s->here->e_value_offs);
566 void *val = s->base + offs;
567 size_t size = EXT3_XATTR_SIZE(
568 le32_to_cpu(s->here->e_value_size));
569
570 if (i->value && size == EXT3_XATTR_SIZE(i->value_len)) {
571 /* The old and the new value have the same
572 size. Just replace. */
573 s->here->e_value_size =
574 cpu_to_le32(i->value_len);
575 memset(val + size - EXT3_XATTR_PAD, 0,
576 EXT3_XATTR_PAD); /* Clear pad bytes. */
577 memcpy(val, i->value, i->value_len);
578 return 0;
579 }
580
581 /* Remove the old value. */
 /* Values stored below the old one move up by its padded size. */
582 memmove(first_val + size, first_val, val - first_val);
583 memset(first_val, 0, size);
584 s->here->e_value_size = 0;
585 s->here->e_value_offs = 0;
586 min_offs += size;
587
588 /* Adjust all value offsets. */
589 last = s->first;
590 while (!IS_LAST_ENTRY(last)) {
591 size_t o = le16_to_cpu(last->e_value_offs);
592 if (!last->e_value_block &&
593 last->e_value_size && o < offs)
594 last->e_value_offs =
595 cpu_to_le16(o + size);
596 last = EXT3_XATTR_NEXT(last);
597 }
598 }
599 if (!i->value) {
600 /* Remove the old name. */
601 size_t size = EXT3_XATTR_LEN(name_len);
602 last = ENTRY((void *)last - size);
603 memmove(s->here, (void *)s->here + size,
604 (void *)last - (void *)s->here + sizeof(__u32));
605 memset(last, 0, size);
606 }
607 }
608
609 if (i->value) {
610 /* Insert the new value. */
611 s->here->e_value_size = cpu_to_le32(i->value_len);
612 if (i->value_len) {
613 size_t size = EXT3_XATTR_SIZE(i->value_len);
 /* The new value goes directly below the lowest existing value. */
614 void *val = s->base + min_offs - size;
615 s->here->e_value_offs = cpu_to_le16(min_offs - size);
616 memset(val + size - EXT3_XATTR_PAD, 0,
617 EXT3_XATTR_PAD); /* Clear the pad bytes. */
618 memcpy(val, i->value, i->value_len);
619 }
620 }
621 return 0;
622}
623
/*
 * Search state for the external xattr block: the generic search state plus
 * the buffer_head of the block that was read, if any.
 */
624struct ext3_xattr_block_find {
625 struct ext3_xattr_search s;
626 struct buffer_head *bh;
627};
628
629int
630ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i,
631 struct ext3_xattr_block_find *bs)
632{
633 struct super_block *sb = inode->i_sb;
634 int error;
635
636 ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
637 i->name_index, i->name, i->value, (long)i->value_len);
638
639 if (EXT3_I(inode)->i_file_acl) {
640 /* The inode already has an extended attribute block. */
641 bs->bh = sb_bread(sb, EXT3_I(inode)->i_file_acl);
642 error = -EIO;
643 if (!bs->bh)
644 goto cleanup;
645 ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
646 atomic_read(&(bs->bh->b_count)),
647 le32_to_cpu(BHDR(bs->bh)->h_refcount));
648 if (ext3_xattr_check_block(bs->bh)) {
649 ext3_error(sb, __FUNCTION__,
650 "inode %ld: bad block %d", inode->i_ino,
651 EXT3_I(inode)->i_file_acl);
652 error = -EIO;
653 goto cleanup;
654 }
655 /* Find the named attribute. */
656 bs->s.base = BHDR(bs->bh);
657 bs->s.first = BFIRST(bs->bh);
658 bs->s.end = bs->bh->b_data + bs->bh->b_size;
659 bs->s.here = bs->s.first;
660 error = ext3_xattr_find_entry(&bs->s.here, i->name_index,
661 i->name, bs->bh->b_size, 1);
662 if (error && error != -ENODATA)
663 goto cleanup;
664 bs->s.not_found = error;
665 }
666 error = 0;
667
668cleanup:
669 return error;
670}
671
/*
 * ext3_xattr_block_set()
 *
 * Apply the change described by @i to the inode's external xattr block,
 * using the search state in @bs.
 *
 * The new block contents are produced in one of three ways:
 *  - the inode has a block with a reference count of 1: it is modified
 *    in place under the buffer lock;
 *  - the inode has a block that is shared: it is copied into a kmalloc'ed
 *    buffer and the copy is modified;
 *  - the inode has no block: a fresh zeroed buffer with a new header is
 *    built.
 * Afterwards, if the result still has entries, an identical block found in
 * the cache is reused (its reference count is raised and quota charged),
 * else the block modified in place is kept, else a new block is allocated.
 * Finally i_file_acl is updated and the previous block, if it differs, is
 * released.
 *
 * Returns 0 on success or a negative error number.
 */
672static int
673ext3_xattr_block_set(handle_t *handle, struct inode *inode,
674 struct ext3_xattr_info *i,
675 struct ext3_xattr_block_find *bs)
676{
677 struct super_block *sb = inode->i_sb;
678 struct buffer_head *new_bh = NULL;
679 struct ext3_xattr_search *s = &bs->s;
680 struct mb_cache_entry *ce = NULL;
681 int error;
682
683#define header(x) ((struct ext3_xattr_header *)(x))
684
685 if (i->value && i->value_len > sb->s_blocksize)
686 return -ENOSPC;
687 if (s->base) {
688 ce = mb_cache_entry_get(ext3_xattr_cache, bs->bh->b_bdev,
689 bs->bh->b_blocknr);
690 if (header(s->base)->h_refcount == cpu_to_le32(1)) {
 /* The contents are about to change: drop the cache entry. */
691 if (ce) {
692 mb_cache_entry_free(ce);
693 ce = NULL;
694 }
695 ea_bdebug(bs->bh, "modifying in-place");
696 error = ext3_journal_get_write_access(handle, bs->bh);
697 if (error)
698 goto cleanup;
699 lock_buffer(bs->bh);
700 error = ext3_xattr_set_entry(i, s);
701 if (!error) {
702 if (!IS_LAST_ENTRY(s->first))
703 ext3_xattr_rehash(header(s->base),
704 s->here);
705 ext3_xattr_cache_insert(bs->bh);
706 }
707 unlock_buffer(bs->bh);
708 if (error == -EIO)
709 goto bad_block;
710 if (!error)
711 error = ext3_journal_dirty_metadata(handle,
712 bs->bh);
713 if (error)
714 goto cleanup;
715 goto inserted;
716 } else {
 /* Remember where "here" is so it can be relocated into the copy. */
717 int offset = (char *)s->here - bs->bh->b_data;
718
719 if (ce) {
720 mb_cache_entry_release(ce);
721 ce = NULL;
722 }
723 ea_bdebug(bs->bh, "cloning");
724 s->base = kmalloc(bs->bh->b_size, GFP_KERNEL);
725 error = -ENOMEM;
726 if (s->base == NULL)
727 goto cleanup;
728 memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
729 s->first = ENTRY(header(s->base)+1);
730 header(s->base)->h_refcount = cpu_to_le32(1);
731 s->here = ENTRY(s->base + offset);
732 s->end = s->base + bs->bh->b_size;
733 }
734 } else {
735 /* Allocate a buffer where we construct the new block. */
736 s->base = kmalloc(sb->s_blocksize, GFP_KERNEL);
737 /* assert(header == s->base) */
738 error = -ENOMEM;
739 if (s->base == NULL)
740 goto cleanup;
741 memset(s->base, 0, sb->s_blocksize);
742 header(s->base)->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
743 header(s->base)->h_blocks = cpu_to_le32(1);
744 header(s->base)->h_refcount = cpu_to_le32(1);
745 s->first = ENTRY(header(s->base)+1);
746 s->here = ENTRY(header(s->base)+1);
747 s->end = s->base + sb->s_blocksize;
748 }
749
 /* Apply the change to the private copy / freshly built buffer. */
750 error = ext3_xattr_set_entry(i, s);
751 if (error == -EIO)
752 goto bad_block;
753 if (error)
754 goto cleanup;
755 if (!IS_LAST_ENTRY(s->first))
756 ext3_xattr_rehash(header(s->base), s->here);
757
758inserted:
 /* An empty result needs no block at all; new_bh then stays NULL. */
759 if (!IS_LAST_ENTRY(s->first)) {
760 new_bh = ext3_xattr_cache_find(inode, header(s->base), &ce);
761 if (new_bh) {
762 /* We found an identical block in the cache. */
763 if (new_bh == bs->bh)
764 ea_bdebug(new_bh, "keeping");
765 else {
766 /* The old block is released after updating
767 the inode. */
768 error = -EDQUOT;
769 if (DQUOT_ALLOC_BLOCK(inode, 1))
770 goto cleanup;
771 error = ext3_journal_get_write_access(handle,
772 new_bh);
773 if (error)
774 goto cleanup_dquot;
775 lock_buffer(new_bh);
776 BHDR(new_bh)->h_refcount = cpu_to_le32(1 +
777 le32_to_cpu(BHDR(new_bh)->h_refcount));
778 ea_bdebug(new_bh, "reusing; refcount now=%d",
779 le32_to_cpu(BHDR(new_bh)->h_refcount));
780 unlock_buffer(new_bh);
781 error = ext3_journal_dirty_metadata(handle,
782 new_bh);
783 if (error)
784 goto cleanup_dquot;
785 }
786 mb_cache_entry_release(ce);
787 ce = NULL;
788 } else if (bs->bh && s->base == bs->bh->b_data) {
789 /* We were modifying this block in-place. */
790 ea_bdebug(bs->bh, "keeping this block");
791 new_bh = bs->bh;
792 get_bh(new_bh);
793 } else {
794 /* We need to allocate a new block */
795 int goal = le32_to_cpu(
796 EXT3_SB(sb)->s_es->s_first_data_block) +
797 EXT3_I(inode)->i_block_group *
798 EXT3_BLOCKS_PER_GROUP(sb);
799 int block = ext3_new_block(handle, inode, goal, &error);
800 if (error)
801 goto cleanup;
802 ea_idebug(inode, "creating block %d", block);
803
804 new_bh = sb_getblk(sb, block);
805 if (!new_bh) {
806getblk_failed:
807 ext3_free_blocks(handle, inode, block, 1);
808 error = -EIO;
809 goto cleanup;
810 }
811 lock_buffer(new_bh);
812 error = ext3_journal_get_create_access(handle, new_bh);
813 if (error) {
814 unlock_buffer(new_bh);
815 goto getblk_failed;
816 }
817 memcpy(new_bh->b_data, s->base, new_bh->b_size);
818 set_buffer_uptodate(new_bh);
819 unlock_buffer(new_bh);
820 ext3_xattr_cache_insert(new_bh);
821 error = ext3_journal_dirty_metadata(handle, new_bh);
822 if (error)
823 goto cleanup;
824 }
825 }
826
827 /* Update the inode. */
828 EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
829
830 /* Drop the previous xattr block. */
831 if (bs->bh && bs->bh != new_bh)
832 ext3_xattr_release_block(handle, inode, bs->bh);
833 error = 0;
834
835cleanup:
836 if (ce)
837 mb_cache_entry_release(ce);
838 brelse(new_bh);
 /* s->base is a kmalloc'ed buffer unless the block was edited in place. */
839 if (!(bs->bh && s->base == bs->bh->b_data))
840 kfree(s->base);
841
842 return error;
843
844cleanup_dquot:
 /* Undo the quota charge taken for the reused block. */
845 DQUOT_FREE_BLOCK(inode, 1);
846 goto cleanup;
847
848bad_block:
849 ext3_error(inode->i_sb, __FUNCTION__,
850 "inode %ld: bad block %d", inode->i_ino,
851 EXT3_I(inode)->i_file_acl);
852 goto cleanup;
853
854#undef header
855}
856
/*
 * Search state for attributes stored in the inode body: the generic search
 * state plus the location of the on-disk inode.
 */
857struct ext3_xattr_ibody_find {
858 struct ext3_xattr_search s;
859 struct ext3_iloc iloc;
860};
861
862int
863ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i,
864 struct ext3_xattr_ibody_find *is)
865{
866 struct ext3_xattr_ibody_header *header;
867 struct ext3_inode *raw_inode;
868 int error;
869
870 if (EXT3_I(inode)->i_extra_isize == 0)
871 return 0;
872 raw_inode = ext3_raw_inode(&is->iloc);
873 header = IHDR(inode, raw_inode);
874 is->s.base = is->s.first = IFIRST(header);
875 is->s.here = is->s.first;
876 is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
877 if (EXT3_I(inode)->i_state & EXT3_STATE_XATTR) {
878 error = ext3_xattr_check_names(IFIRST(header), is->s.end);
879 if (error)
880 return error;
881 /* Find the named attribute. */
882 error = ext3_xattr_find_entry(&is->s.here, i->name_index,
883 i->name, is->s.end -
884 (void *)is->s.base, 0);
885 if (error && error != -ENODATA)
886 return error;
887 is->s.not_found = error;
888 }
889 return 0;
890}
891
892static int
893ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
894 struct ext3_xattr_info *i,
895 struct ext3_xattr_ibody_find *is)
896{
897 struct ext3_xattr_ibody_header *header;
898 struct ext3_xattr_search *s = &is->s;
899 int error;
900
901 if (EXT3_I(inode)->i_extra_isize == 0)
902 return -ENOSPC;
903 error = ext3_xattr_set_entry(i, s);
904 if (error)
905 return error;
906 header = IHDR(inode, ext3_raw_inode(&is->iloc));
907 if (!IS_LAST_ENTRY(s->first)) {
908 header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
909 EXT3_I(inode)->i_state |= EXT3_STATE_XATTR;
910 } else {
911 header->h_magic = cpu_to_le32(0);
912 EXT3_I(inode)->i_state &= ~EXT3_STATE_XATTR;
913 }
914 return 0;
915}
916
917/*
918 * ext3_xattr_set_handle()
919 *
920 * Create, replace or remove an extended attribute for this inode. Buffer
921 * is NULL to remove an existing extended attribute, and non-NULL to
922 * either replace an existing extended attribute, or create a new extended
923 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
924 * specify that an extended attribute must exist and must not exist
925 * previous to the call, respectively.
926 *
927 * Returns 0, or a negative error number on failure.
928 */
929int
930ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
931 const char *name, const void *value, size_t value_len,
932 int flags)
933{
934 struct ext3_xattr_info i = {
935 .name_index = name_index,
936 .name = name,
937 .value = value,
938 .value_len = value_len,
939
940 };
 /* Both searches start out as "not found" until a lookup says otherwise. */
941 struct ext3_xattr_ibody_find is = {
942 .s = { .not_found = -ENODATA, },
943 };
944 struct ext3_xattr_block_find bs = {
945 .s = { .not_found = -ENODATA, },
946 };
947 int error;
948
949 if (IS_RDONLY(inode))
950 return -EROFS;
951 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
952 return -EPERM;
953 if (!name)
954 return -EINVAL;
955 if (strlen(name) > 255)
956 return -ERANGE;
957 down_write(&EXT3_I(inode)->xattr_sem);
958 error = ext3_get_inode_loc(inode, &is.iloc);
959 if (error)
960 goto cleanup;
961
 /* For an inode still flagged as new, zero the whole on-disk inode first. */
962 if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) {
963 struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc);
964 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
965 EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW;
966 }
967
 /* Search the inode body first; the block only if it is not found there. */
968 error = ext3_xattr_ibody_find(inode, &i, &is);
969 if (error)
970 goto cleanup;
971 if (is.s.not_found)
972 error = ext3_xattr_block_find(inode, &i, &bs);
973 if (error)
974 goto cleanup;
975 if (is.s.not_found && bs.s.not_found) {
 /* The attribute does not exist yet. */
976 error = -ENODATA;
977 if (flags & XATTR_REPLACE)
978 goto cleanup;
 /* Removing a non-existent attribute succeeds without doing anything. */
979 error = 0;
980 if (!value)
981 goto cleanup;
982 } else {
 /* The attribute already exists. */
983 error = -EEXIST;
984 if (flags & XATTR_CREATE)
985 goto cleanup;
986 }
987 error = ext3_journal_get_write_access(handle, is.iloc.bh);
988 if (error)
989 goto cleanup;
990 if (!value) {
 /* Removal: delete the attribute from wherever it was found. */
991 if (!is.s.not_found)
992 error = ext3_xattr_ibody_set(handle, inode, &i, &is);
993 else if (!bs.s.not_found)
994 error = ext3_xattr_block_set(handle, inode, &i, &bs);
995 } else {
 /*
  * Store in the inode body if possible and then remove a copy found in
  * the block.  If the body has no room, store in the block and then
  * remove a copy found in the body.
  */
996 error = ext3_xattr_ibody_set(handle, inode, &i, &is);
997 if (!error && !bs.s.not_found) {
998 i.value = NULL;
999 error = ext3_xattr_block_set(handle, inode, &i, &bs);
1000 } else if (error == -ENOSPC) {
1001 error = ext3_xattr_block_set(handle, inode, &i, &bs);
1002 if (error)
1003 goto cleanup;
1004 if (!is.s.not_found) {
1005 i.value = NULL;
1006 error = ext3_xattr_ibody_set(handle, inode, &i,
1007 &is);
1008 }
1009 }
1010 }
1011 if (!error) {
1012 ext3_xattr_update_super_block(handle, inode->i_sb);
1013 inode->i_ctime = CURRENT_TIME_SEC;
1014 error = ext3_mark_iloc_dirty(handle, inode, &is.iloc);
1015 /*
1016 * The bh is consumed by ext3_mark_iloc_dirty, even with
1017 * error != 0.
1018 */
1019 is.iloc.bh = NULL;
1020 if (IS_SYNC(inode))
1021 handle->h_sync = 1;
1022 }
1023
1024cleanup:
1025 brelse(is.iloc.bh);
1026 brelse(bs.bh);
1027 up_write(&EXT3_I(inode)->xattr_sem);
1028 return error;
1029}
1030
1031/*
1032 * ext3_xattr_set()
1033 *
1034 * Like ext3_xattr_set_handle, but start from an inode. This extended
1035 * attribute modification is a filesystem transaction by itself.
1036 *
1037 * Returns 0, or a negative error number on failure.
1038 */
1039int
1040ext3_xattr_set(struct inode *inode, int name_index, const char *name,
1041 const void *value, size_t value_len, int flags)
1042{
1043 handle_t *handle;
1044 int error, retries = 0;
1045
1046retry:
1047 handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS);
1048 if (IS_ERR(handle)) {
1049 error = PTR_ERR(handle);
1050 } else {
1051 int error2;
1052
1053 error = ext3_xattr_set_handle(handle, inode, name_index, name,
1054 value, value_len, flags);
1055 error2 = ext3_journal_stop(handle);
1056 if (error == -ENOSPC &&
1057 ext3_should_retry_alloc(inode->i_sb, &retries))
1058 goto retry;
1059 if (error == 0)
1060 error = error2;
1061 }
1062
1063 return error;
1064}
1065
1066/*
1067 * ext3_xattr_delete_inode()
1068 *
1069 * Free extended attribute resources associated with this inode. This
1070 * is called immediately before an inode is freed. We have exclusive
1071 * access to the inode.
1072 */
1073void
1074ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
1075{
1076 struct buffer_head *bh = NULL;
1077
1078 if (!EXT3_I(inode)->i_file_acl)
1079 goto cleanup;
1080 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
1081 if (!bh) {
1082 ext3_error(inode->i_sb, __FUNCTION__,
1083 "inode %ld: block %d read error", inode->i_ino,
1084 EXT3_I(inode)->i_file_acl);
1085 goto cleanup;
1086 }
1087 if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
1088 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1089 ext3_error(inode->i_sb, __FUNCTION__,
1090 "inode %ld: bad block %d", inode->i_ino,
1091 EXT3_I(inode)->i_file_acl);
1092 goto cleanup;
1093 }
1094 ext3_xattr_release_block(handle, inode, bh);
1095 EXT3_I(inode)->i_file_acl = 0;
1096
1097cleanup:
1098 brelse(bh);
1099}
1100
1101/*
1102 * ext3_xattr_put_super()
1103 *
1104 * This is called when a file system is unmounted.
1105 */
1106void
1107ext3_xattr_put_super(struct super_block *sb)
1108{
1109 mb_cache_shrink(ext3_xattr_cache, sb->s_bdev);
1110}
1111
1112/*
1113 * ext3_xattr_cache_insert()
1114 *
1115 * Create a new entry in the extended attribute cache, and insert
1116 * it unless such an entry is already in the cache.
1117 *
1118 * Returns 0, or a negative error number on failure.
1119 */
1120static void
1121ext3_xattr_cache_insert(struct buffer_head *bh)
1122{
1123 __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
1124 struct mb_cache_entry *ce;
1125 int error;
1126
1127 ce = mb_cache_entry_alloc(ext3_xattr_cache);
1128 if (!ce) {
1129 ea_bdebug(bh, "out of memory");
1130 return;
1131 }
1132 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
1133 if (error) {
1134 mb_cache_entry_free(ce);
1135 if (error == -EBUSY) {
1136 ea_bdebug(bh, "already in cache");
1137 error = 0;
1138 }
1139 } else {
1140 ea_bdebug(bh, "inserting [%x]", (int)hash);
1141 mb_cache_entry_release(ce);
1142 }
1143}
1144
1145/*
1146 * ext3_xattr_cmp()
1147 *
1148 * Compare two extended attribute blocks for equality.
1149 *
1150 * Returns 0 if the blocks are equal, 1 if they differ, and
1151 * a negative error number on errors.
1152 */
1153static int
1154ext3_xattr_cmp(struct ext3_xattr_header *header1,
1155 struct ext3_xattr_header *header2)
1156{
1157 struct ext3_xattr_entry *entry1, *entry2;
1158
1159 entry1 = ENTRY(header1+1);
1160 entry2 = ENTRY(header2+1);
1161 while (!IS_LAST_ENTRY(entry1)) {
1162 if (IS_LAST_ENTRY(entry2))
1163 return 1;
1164 if (entry1->e_hash != entry2->e_hash ||
1165 entry1->e_name_index != entry2->e_name_index ||
1166 entry1->e_name_len != entry2->e_name_len ||
1167 entry1->e_value_size != entry2->e_value_size ||
1168 memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
1169 return 1;
1170 if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
1171 return -EIO;
1172 if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
1173 (char *)header2 + le16_to_cpu(entry2->e_value_offs),
1174 le32_to_cpu(entry1->e_value_size)))
1175 return 1;
1176
1177 entry1 = EXT3_XATTR_NEXT(entry1);
1178 entry2 = EXT3_XATTR_NEXT(entry2);
1179 }
1180 if (!IS_LAST_ENTRY(entry2))
1181 return 1;
1182 return 0;
1183}
1184
1185/*
1186 * ext3_xattr_cache_find()
1187 *
1188 * Find an identical extended attribute block.
1189 *
1190 * Returns a pointer to the block found, or NULL if such a block was
1191 * not found or an error occurred.
1192 */
1193static struct buffer_head *
1194ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header,
1195 struct mb_cache_entry **pce)
1196{
1197 __u32 hash = le32_to_cpu(header->h_hash);
1198 struct mb_cache_entry *ce;
1199
1200 if (!header->h_hash)
1201 return NULL; /* never share */
1202 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
1203again:
1204 ce = mb_cache_entry_find_first(ext3_xattr_cache, 0,
1205 inode->i_sb->s_bdev, hash);
1206 while (ce) {
1207 struct buffer_head *bh;
1208
1209 if (IS_ERR(ce)) {
1210 if (PTR_ERR(ce) == -EAGAIN)
1211 goto again;
1212 break;
1213 }
1214 bh = sb_bread(inode->i_sb, ce->e_block);
1215 if (!bh) {
1216 ext3_error(inode->i_sb, __FUNCTION__,
1217 "inode %ld: block %ld read error",
1218 inode->i_ino, (unsigned long) ce->e_block);
1219 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
1220 EXT3_XATTR_REFCOUNT_MAX) {
1221 ea_idebug(inode, "block %ld refcount %d>=%d",
1222 (unsigned long) ce->e_block,
1223 le32_to_cpu(BHDR(bh)->h_refcount),
1224 EXT3_XATTR_REFCOUNT_MAX);
1225 } else if (ext3_xattr_cmp(header, BHDR(bh)) == 0) {
1226 *pce = ce;
1227 return bh;
1228 }
1229 brelse(bh);
1230 ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
1231 }
1232 return NULL;
1233}
1234
1235#define NAME_HASH_SHIFT 5
1236#define VALUE_HASH_SHIFT 16
1237
1238/*
1239 * ext3_xattr_hash_entry()
1240 *
1241 * Compute the hash of an extended attribute.
1242 */
1243static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header,
1244 struct ext3_xattr_entry *entry)
1245{
1246 __u32 hash = 0;
1247 char *name = entry->e_name;
1248 int n;
1249
1250 for (n=0; n < entry->e_name_len; n++) {
1251 hash = (hash << NAME_HASH_SHIFT) ^
1252 (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
1253 *name++;
1254 }
1255
1256 if (entry->e_value_block == 0 && entry->e_value_size != 0) {
1257 __le32 *value = (__le32 *)((char *)header +
1258 le16_to_cpu(entry->e_value_offs));
1259 for (n = (le32_to_cpu(entry->e_value_size) +
1260 EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) {
1261 hash = (hash << VALUE_HASH_SHIFT) ^
1262 (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
1263 le32_to_cpu(*value++);
1264 }
1265 }
1266 entry->e_hash = cpu_to_le32(hash);
1267}
1268
1269#undef NAME_HASH_SHIFT
1270#undef VALUE_HASH_SHIFT
1271
1272#define BLOCK_HASH_SHIFT 16
1273
1274/*
1275 * ext3_xattr_rehash()
1276 *
1277 * Re-compute the extended attribute hash value after an entry has changed.
1278 */
1279static void ext3_xattr_rehash(struct ext3_xattr_header *header,
1280 struct ext3_xattr_entry *entry)
1281{
1282 struct ext3_xattr_entry *here;
1283 __u32 hash = 0;
1284
1285 ext3_xattr_hash_entry(header, entry);
1286 here = ENTRY(header+1);
1287 while (!IS_LAST_ENTRY(here)) {
1288 if (!here->e_hash) {
1289 /* Block is not shared if an entry's hash value == 0 */
1290 hash = 0;
1291 break;
1292 }
1293 hash = (hash << BLOCK_HASH_SHIFT) ^
1294 (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
1295 le32_to_cpu(here->e_hash);
1296 here = EXT3_XATTR_NEXT(here);
1297 }
1298 header->h_hash = cpu_to_le32(hash);
1299}
1300
1301#undef BLOCK_HASH_SHIFT
1302
1303int __init
1304init_ext3_xattr(void)
1305{
1306 ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL,
1307 sizeof(struct mb_cache_entry) +
1308 sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
1309 if (!ext3_xattr_cache)
1310 return -ENOMEM;
1311 return 0;
1312}
1313
1314void
1315exit_ext3_xattr(void)
1316{
1317 if (ext3_xattr_cache)
1318 mb_cache_destroy(ext3_xattr_cache);
1319 ext3_xattr_cache = NULL;
1320}
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
new file mode 100644
index 000000000000..eb31a69e82dc
--- /dev/null
+++ b/fs/ext3/xattr.h
@@ -0,0 +1,135 @@
1/*
2 File: fs/ext3/xattr.h
3
4 On-disk format of extended attributes for the ext3 filesystem.
5
6 (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
7*/
8
9#include <linux/config.h>
10#include <linux/xattr.h>
11
/* Magic number stored in h_magic of attribute headers */
#define EXT3_XATTR_MAGIC			0xEA020000

/* Upper bound on the reference count of one attribute block */
#define EXT3_XATTR_REFCOUNT_MAX			1024

/* Name index values, stored in e_name_index of each entry */
#define EXT3_XATTR_INDEX_USER			1
#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS	2
#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT	3
#define EXT3_XATTR_INDEX_TRUSTED		4
#define EXT3_XATTR_INDEX_LUSTRE			5
#define EXT3_XATTR_INDEX_SECURITY		6
25
26struct ext3_xattr_header {
27 __le32 h_magic; /* magic number for identification */
28 __le32 h_refcount; /* reference count */
29 __le32 h_blocks; /* number of disk blocks used */
30 __le32 h_hash; /* hash value of all attributes */
31 __u32 h_reserved[4]; /* zero right now */
32};
33
34struct ext3_xattr_ibody_header {
35 __le32 h_magic; /* magic number for identification */
36};
37
38struct ext3_xattr_entry {
39 __u8 e_name_len; /* length of name */
40 __u8 e_name_index; /* attribute name index */
41 __le16 e_value_offs; /* offset in disk block of value */
42 __le32 e_value_block; /* disk block attribute is stored on (n/i) */
43 __le32 e_value_size; /* size of attribute value */
44 __le32 e_hash; /* hash value of name and value */
45 char e_name[0]; /* attribute name */
46};
47
/* Padding granularity: 1 << EXT3_XATTR_PAD_BITS bytes */
#define EXT3_XATTR_PAD_BITS		2
#define EXT3_XATTR_PAD		(1<<EXT3_XATTR_PAD_BITS)
#define EXT3_XATTR_ROUND		(EXT3_XATTR_PAD-1)

/* Round size up to the next multiple of EXT3_XATTR_PAD */
#define EXT3_XATTR_SIZE(size) \
	(((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND)

/* Padded size of an entry whose name is name_len bytes long */
#define EXT3_XATTR_LEN(name_len) \
	EXT3_XATTR_SIZE((name_len) + sizeof(struct ext3_xattr_entry))

/* Entry that starts right after the given one */
#define EXT3_XATTR_NEXT(entry) \
	((struct ext3_xattr_entry *)( \
	 (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)))
59
60# ifdef CONFIG_EXT3_FS_XATTR
61
62extern struct xattr_handler ext3_xattr_user_handler;
63extern struct xattr_handler ext3_xattr_trusted_handler;
64extern struct xattr_handler ext3_xattr_acl_access_handler;
65extern struct xattr_handler ext3_xattr_acl_default_handler;
66extern struct xattr_handler ext3_xattr_security_handler;
67
68extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
69
70extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t);
71extern int ext3_xattr_list(struct inode *, char *, size_t);
72extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
73extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
74
75extern void ext3_xattr_delete_inode(handle_t *, struct inode *);
76extern void ext3_xattr_put_super(struct super_block *);
77
78extern int init_ext3_xattr(void);
79extern void exit_ext3_xattr(void);
80
81extern struct xattr_handler *ext3_xattr_handlers[];
82
83# else /* CONFIG_EXT3_FS_XATTR */
84
/*
 * Stub for kernels built without CONFIG_EXT3_FS_XATTR: reading an
 * attribute always fails with -EOPNOTSUPP.  The parameter list matches
 * the five-argument ext3_xattr_get() prototype declared for the
 * CONFIG_EXT3_FS_XATTR case, so callers compile under either setting.
 */
static inline int
ext3_xattr_get(struct inode *inode, int name_index, const char *name,
	       void *buffer, size_t size)
{
	return -EOPNOTSUPP;
}
91
/*
 * Stub for kernels built without CONFIG_EXT3_FS_XATTR: listing
 * attributes always fails with -EOPNOTSUPP.  The buffer is typed
 * 'char *' to match the ext3_xattr_list() prototype declared for the
 * CONFIG_EXT3_FS_XATTR case.
 */
static inline int
ext3_xattr_list(struct inode *inode, char *buffer, size_t size)
{
	return -EOPNOTSUPP;
}
97
/*
 * Stub for kernels built without CONFIG_EXT3_FS_XATTR: setting an
 * attribute always fails with -EOPNOTSUPP.
 */
static inline int
ext3_xattr_set(struct inode *inode, int name_index, const char *name,
	       const void *value, size_t size, int flags)
{
	return -EOPNOTSUPP;
}
104
105static inline int
106ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
107 const char *name, const void *value, size_t size, int flags)
108{
109 return -EOPNOTSUPP;
110}
111
112static inline void
113ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
114{
115}
116
/* Stub for kernels built without CONFIG_EXT3_FS_XATTR: does nothing. */
static inline void
ext3_xattr_put_super(struct super_block *sb)
{
}
121
/*
 * Stub for kernels built without CONFIG_EXT3_FS_XATTR: there is nothing
 * to set up, so initialisation always reports success (0).
 */
static inline int
init_ext3_xattr(void)
{
	return 0;
}
127
/* Stub for kernels built without CONFIG_EXT3_FS_XATTR: does nothing. */
static inline void
exit_ext3_xattr(void)
{
}
132
133#define ext3_xattr_handlers NULL
134
135# endif /* CONFIG_EXT3_FS_XATTR */
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
new file mode 100644
index 000000000000..ddc1c41750e1
--- /dev/null
+++ b/fs/ext3/xattr_security.c
@@ -0,0 +1,55 @@
1/*
2 * linux/fs/ext3/xattr_security.c
3 * Handler for storing security labels as extended attributes.
4 */
5
6#include <linux/module.h>
7#include <linux/string.h>
8#include <linux/fs.h>
9#include <linux/smp_lock.h>
10#include <linux/ext3_jbd.h>
11#include <linux/ext3_fs.h>
12#include "xattr.h"
13
14static size_t
15ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size,
16 const char *name, size_t name_len)
17{
18 const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
19 const size_t total_len = prefix_len + name_len + 1;
20
21
22 if (list && total_len <= list_size) {
23 memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
24 memcpy(list+prefix_len, name, name_len);
25 list[prefix_len + name_len] = '\0';
26 }
27 return total_len;
28}
29
30static int
31ext3_xattr_security_get(struct inode *inode, const char *name,
32 void *buffer, size_t size)
33{
34 if (strcmp(name, "") == 0)
35 return -EINVAL;
36 return ext3_xattr_get(inode, EXT3_XATTR_INDEX_SECURITY, name,
37 buffer, size);
38}
39
40static int
41ext3_xattr_security_set(struct inode *inode, const char *name,
42 const void *value, size_t size, int flags)
43{
44 if (strcmp(name, "") == 0)
45 return -EINVAL;
46 return ext3_xattr_set(inode, EXT3_XATTR_INDEX_SECURITY, name,
47 value, size, flags);
48}
49
50struct xattr_handler ext3_xattr_security_handler = {
51 .prefix = XATTR_SECURITY_PREFIX,
52 .list = ext3_xattr_security_list,
53 .get = ext3_xattr_security_get,
54 .set = ext3_xattr_security_set,
55};
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
new file mode 100644
index 000000000000..f68bfd1cf519
--- /dev/null
+++ b/fs/ext3/xattr_trusted.c
@@ -0,0 +1,65 @@
1/*
2 * linux/fs/ext3/xattr_trusted.c
3 * Handler for trusted extended attributes.
4 *
5 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */
7
8#include <linux/module.h>
9#include <linux/string.h>
10#include <linux/fs.h>
11#include <linux/smp_lock.h>
12#include <linux/ext3_jbd.h>
13#include <linux/ext3_fs.h>
14#include "xattr.h"
15
16#define XATTR_TRUSTED_PREFIX "trusted."
17
18static size_t
19ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
20 const char *name, size_t name_len)
21{
22 const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
23 const size_t total_len = prefix_len + name_len + 1;
24
25 if (!capable(CAP_SYS_ADMIN))
26 return 0;
27
28 if (list && total_len <= list_size) {
29 memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
30 memcpy(list+prefix_len, name, name_len);
31 list[prefix_len + name_len] = '\0';
32 }
33 return total_len;
34}
35
36static int
37ext3_xattr_trusted_get(struct inode *inode, const char *name,
38 void *buffer, size_t size)
39{
40 if (strcmp(name, "") == 0)
41 return -EINVAL;
42 if (!capable(CAP_SYS_ADMIN))
43 return -EPERM;
44 return ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, name,
45 buffer, size);
46}
47
48static int
49ext3_xattr_trusted_set(struct inode *inode, const char *name,
50 const void *value, size_t size, int flags)
51{
52 if (strcmp(name, "") == 0)
53 return -EINVAL;
54 if (!capable(CAP_SYS_ADMIN))
55 return -EPERM;
56 return ext3_xattr_set(inode, EXT3_XATTR_INDEX_TRUSTED, name,
57 value, size, flags);
58}
59
60struct xattr_handler ext3_xattr_trusted_handler = {
61 .prefix = XATTR_TRUSTED_PREFIX,
62 .list = ext3_xattr_trusted_list,
63 .get = ext3_xattr_trusted_get,
64 .set = ext3_xattr_trusted_set,
65};
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
new file mode 100644
index 000000000000..e907cae7a07c
--- /dev/null
+++ b/fs/ext3/xattr_user.c
@@ -0,0 +1,79 @@
1/*
2 * linux/fs/ext3/xattr_user.c
3 * Handler for extended user attributes.
4 *
5 * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */
7
8#include <linux/module.h>
9#include <linux/string.h>
10#include <linux/fs.h>
11#include <linux/smp_lock.h>
12#include <linux/ext3_jbd.h>
13#include <linux/ext3_fs.h>
14#include "xattr.h"
15
16#define XATTR_USER_PREFIX "user."
17
18static size_t
19ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size,
20 const char *name, size_t name_len)
21{
22 const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
23 const size_t total_len = prefix_len + name_len + 1;
24
25 if (!test_opt(inode->i_sb, XATTR_USER))
26 return 0;
27
28 if (list && total_len <= list_size) {
29 memcpy(list, XATTR_USER_PREFIX, prefix_len);
30 memcpy(list+prefix_len, name, name_len);
31 list[prefix_len + name_len] = '\0';
32 }
33 return total_len;
34}
35
36static int
37ext3_xattr_user_get(struct inode *inode, const char *name,
38 void *buffer, size_t size)
39{
40 int error;
41
42 if (strcmp(name, "") == 0)
43 return -EINVAL;
44 if (!test_opt(inode->i_sb, XATTR_USER))
45 return -EOPNOTSUPP;
46 error = permission(inode, MAY_READ, NULL);
47 if (error)
48 return error;
49
50 return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, buffer, size);
51}
52
53static int
54ext3_xattr_user_set(struct inode *inode, const char *name,
55 const void *value, size_t size, int flags)
56{
57 int error;
58
59 if (strcmp(name, "") == 0)
60 return -EINVAL;
61 if (!test_opt(inode->i_sb, XATTR_USER))
62 return -EOPNOTSUPP;
63 if ( !S_ISREG(inode->i_mode) &&
64 (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
65 return -EPERM;
66 error = permission(inode, MAY_WRITE, NULL);
67 if (error)
68 return error;
69
70 return ext3_xattr_set(inode, EXT3_XATTR_INDEX_USER, name,
71 value, size, flags);
72}
73
74struct xattr_handler ext3_xattr_user_handler = {
75 .prefix = XATTR_USER_PREFIX,
76 .list = ext3_xattr_user_list,
77 .get = ext3_xattr_user_get,
78 .set = ext3_xattr_user_set,
79};