aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/autofs4/autofs_i.h2
-rw-r--r--fs/autofs4/dev-ioctl.c29
-rw-r--r--fs/autofs4/expire.c27
-rw-r--r--fs/autofs4/root.c41
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/btrfs_inode.h31
-rw-r--r--fs/btrfs/ctree.c588
-rw-r--r--fs/btrfs/ctree.h71
-rw-r--r--fs/btrfs/delayed-ref.c669
-rw-r--r--fs/btrfs/delayed-ref.h193
-rw-r--r--fs/btrfs/dir-item.c3
-rw-r--r--fs/btrfs/disk-io.c81
-rw-r--r--fs/btrfs/disk-io.h1
-rw-r--r--fs/btrfs/extent-tree.c1674
-rw-r--r--fs/btrfs/extent_io.c51
-rw-r--r--fs/btrfs/extent_io.h3
-rw-r--r--fs/btrfs/file-item.c7
-rw-r--r--fs/btrfs/file.c50
-rw-r--r--fs/btrfs/inode-item.c3
-rw-r--r--fs/btrfs/inode.c206
-rw-r--r--fs/btrfs/locking.c21
-rw-r--r--fs/btrfs/ordered-data.c118
-rw-r--r--fs/btrfs/ordered-data.h4
-rw-r--r--fs/btrfs/transaction.c151
-rw-r--r--fs/btrfs/transaction.h8
-rw-r--r--fs/btrfs/tree-defrag.c2
-rw-r--r--fs/btrfs/tree-log.c444
-rw-r--r--fs/btrfs/tree-log.h17
-rw-r--r--fs/buffer.c56
-rw-r--r--fs/ecryptfs/keystore.c3
-rw-r--r--fs/ecryptfs/messaging.c3
-rw-r--r--fs/eventfd.c26
-rw-r--r--fs/eventpoll.c614
-rw-r--r--fs/ext4/balloc.c14
-rw-r--r--fs/ext4/dir.c16
-rw-r--r--fs/ext4/ext4.h93
-rw-r--r--fs/ext4/ext4_extents.h1
-rw-r--r--fs/ext4/ext4_i.h6
-rw-r--r--fs/ext4/ext4_sb.h14
-rw-r--r--fs/ext4/extents.c127
-rw-r--r--fs/ext4/file.c7
-rw-r--r--fs/ext4/ialloc.c273
-rw-r--r--fs/ext4/inode.c429
-rw-r--r--fs/ext4/ioctl.c17
-rw-r--r--fs/ext4/mballoc.c158
-rw-r--r--fs/ext4/mballoc.h8
-rw-r--r--fs/ext4/namei.c164
-rw-r--r--fs/ext4/resize.c8
-rw-r--r--fs/ext4/super.c327
-rw-r--r--fs/fuse/file.c3
-rw-r--r--fs/gfs2/ops_file.c5
-rw-r--r--fs/hugetlbfs/inode.c21
-rw-r--r--fs/jbd2/commit.c5
-rw-r--r--fs/jbd2/revoke.c24
-rw-r--r--fs/jbd2/transaction.c2
-rw-r--r--fs/lockd/clntlock.c51
-rw-r--r--fs/lockd/mon.c8
-rw-r--r--fs/lockd/svc.c42
-rw-r--r--fs/nfs/callback.c31
-rw-r--r--fs/nfs/callback.h1
-rw-r--r--fs/nfs/client.c116
-rw-r--r--fs/nfs/dir.c9
-rw-r--r--fs/nfs/file.c37
-rw-r--r--fs/nfs/getroot.c4
-rw-r--r--fs/nfs/inode.c309
-rw-r--r--fs/nfs/internal.h4
-rw-r--r--fs/nfs/nfs2xdr.c9
-rw-r--r--fs/nfs/nfs3proc.c1
-rw-r--r--fs/nfs/nfs3xdr.c37
-rw-r--r--fs/nfs/nfs4proc.c47
-rw-r--r--fs/nfs/nfs4state.c10
-rw-r--r--fs/nfs/nfs4xdr.c213
-rw-r--r--fs/nfs/pagelist.c11
-rw-r--r--fs/nfs/proc.c1
-rw-r--r--fs/nfs/super.c4
-rw-r--r--fs/nfs/write.c53
-rw-r--r--fs/nfsd/nfsctl.c6
-rw-r--r--fs/nfsd/nfssvc.c5
-rw-r--r--fs/ntfs/dir.c4
-rw-r--r--fs/ntfs/inode.c3
-rw-r--r--fs/ntfs/layout.h329
-rw-r--r--fs/ntfs/logfile.h6
-rw-r--r--fs/ntfs/mft.c2
-rw-r--r--fs/ntfs/super.c50
-rw-r--r--fs/ntfs/usnjrnl.h48
-rw-r--r--fs/ocfs2/mmap.c6
-rw-r--r--fs/proc/proc_tty.c12
-rw-r--r--fs/ramfs/file-nommu.c15
-rw-r--r--fs/ramfs/inode.c94
-rw-r--r--fs/sysfs/bin.c8
-rw-r--r--fs/ubifs/file.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c4
92 files changed, 5259 insertions, 3261 deletions
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index a76803108d0..b7ff33c6310 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -186,6 +186,8 @@ int autofs4_expire_wait(struct dentry *dentry);
186int autofs4_expire_run(struct super_block *, struct vfsmount *, 186int autofs4_expire_run(struct super_block *, struct vfsmount *,
187 struct autofs_sb_info *, 187 struct autofs_sb_info *,
188 struct autofs_packet_expire __user *); 188 struct autofs_packet_expire __user *);
189int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
190 struct autofs_sb_info *sbi, int when);
189int autofs4_expire_multi(struct super_block *, struct vfsmount *, 191int autofs4_expire_multi(struct super_block *, struct vfsmount *,
190 struct autofs_sb_info *, int __user *); 192 struct autofs_sb_info *, int __user *);
191struct dentry *autofs4_expire_direct(struct super_block *sb, 193struct dentry *autofs4_expire_direct(struct super_block *sb,
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 025e105bffe..9e5ae8a4f5c 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -525,40 +525,13 @@ static int autofs_dev_ioctl_expire(struct file *fp,
525 struct autofs_sb_info *sbi, 525 struct autofs_sb_info *sbi,
526 struct autofs_dev_ioctl *param) 526 struct autofs_dev_ioctl *param)
527{ 527{
528 struct dentry *dentry;
529 struct vfsmount *mnt; 528 struct vfsmount *mnt;
530 int err = -EAGAIN;
531 int how; 529 int how;
532 530
533 how = param->expire.how; 531 how = param->expire.how;
534 mnt = fp->f_path.mnt; 532 mnt = fp->f_path.mnt;
535 533
536 if (autofs_type_trigger(sbi->type)) 534 return autofs4_do_expire_multi(sbi->sb, mnt, sbi, how);
537 dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how);
538 else
539 dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how);
540
541 if (dentry) {
542 struct autofs_info *ino = autofs4_dentry_ino(dentry);
543
544 /*
545 * This is synchronous because it makes the daemon a
546 * little easier
547 */
548 err = autofs4_wait(sbi, dentry, NFY_EXPIRE);
549
550 spin_lock(&sbi->fs_lock);
551 if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
552 ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
553 sbi->sb->s_root->d_mounted++;
554 }
555 ino->flags &= ~AUTOFS_INF_EXPIRING;
556 complete_all(&ino->expire_complete);
557 spin_unlock(&sbi->fs_lock);
558 dput(dentry);
559 }
560
561 return err;
562} 535}
563 536
564/* Check if autofs mount point is in use */ 537/* Check if autofs mount point is in use */
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index e3bd50776f9..75f7ddacf7d 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -478,22 +478,16 @@ int autofs4_expire_run(struct super_block *sb,
478 return ret; 478 return ret;
479} 479}
480 480
481/* Call repeatedly until it returns -EAGAIN, meaning there's nothing 481int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
482 more to be done */ 482 struct autofs_sb_info *sbi, int when)
483int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
484 struct autofs_sb_info *sbi, int __user *arg)
485{ 483{
486 struct dentry *dentry; 484 struct dentry *dentry;
487 int ret = -EAGAIN; 485 int ret = -EAGAIN;
488 int do_now = 0;
489
490 if (arg && get_user(do_now, arg))
491 return -EFAULT;
492 486
493 if (autofs_type_trigger(sbi->type)) 487 if (autofs_type_trigger(sbi->type))
494 dentry = autofs4_expire_direct(sb, mnt, sbi, do_now); 488 dentry = autofs4_expire_direct(sb, mnt, sbi, when);
495 else 489 else
496 dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now); 490 dentry = autofs4_expire_indirect(sb, mnt, sbi, when);
497 491
498 if (dentry) { 492 if (dentry) {
499 struct autofs_info *ino = autofs4_dentry_ino(dentry); 493 struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -516,3 +510,16 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
516 return ret; 510 return ret;
517} 511}
518 512
513/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
514 more to be done */
515int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
516 struct autofs_sb_info *sbi, int __user *arg)
517{
518 int do_now = 0;
519
520 if (arg && get_user(do_now, arg))
521 return -EFAULT;
522
523 return autofs4_do_expire_multi(sb, mnt, sbi, do_now);
524}
525
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 74b1469a950..e383bf0334f 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -485,22 +485,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
485 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", 485 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
486 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); 486 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
487 487
488 expiring = autofs4_lookup_expiring(sbi, dentry->d_parent, &dentry->d_name);
489 if (expiring) {
490 /*
491 * If we are racing with expire the request might not
492 * be quite complete but the directory has been removed
493 * so it must have been successful, so just wait for it.
494 */
495 ino = autofs4_dentry_ino(expiring);
496 autofs4_expire_wait(expiring);
497 spin_lock(&sbi->lookup_lock);
498 if (!list_empty(&ino->expiring))
499 list_del_init(&ino->expiring);
500 spin_unlock(&sbi->lookup_lock);
501 dput(expiring);
502 }
503
504 unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name); 488 unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name);
505 if (unhashed) 489 if (unhashed)
506 dentry = unhashed; 490 dentry = unhashed;
@@ -538,14 +522,31 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
538 } 522 }
539 523
540 if (!oz_mode) { 524 if (!oz_mode) {
525 mutex_unlock(&dir->i_mutex);
526 expiring = autofs4_lookup_expiring(sbi,
527 dentry->d_parent,
528 &dentry->d_name);
529 if (expiring) {
530 /*
531 * If we are racing with expire the request might not
532 * be quite complete but the directory has been removed
533 * so it must have been successful, so just wait for it.
534 */
535 ino = autofs4_dentry_ino(expiring);
536 autofs4_expire_wait(expiring);
537 spin_lock(&sbi->lookup_lock);
538 if (!list_empty(&ino->expiring))
539 list_del_init(&ino->expiring);
540 spin_unlock(&sbi->lookup_lock);
541 dput(expiring);
542 }
543
541 spin_lock(&dentry->d_lock); 544 spin_lock(&dentry->d_lock);
542 dentry->d_flags |= DCACHE_AUTOFS_PENDING; 545 dentry->d_flags |= DCACHE_AUTOFS_PENDING;
543 spin_unlock(&dentry->d_lock); 546 spin_unlock(&dentry->d_lock);
544 if (dentry->d_op && dentry->d_op->d_revalidate) { 547 if (dentry->d_op && dentry->d_op->d_revalidate)
545 mutex_unlock(&dir->i_mutex);
546 (dentry->d_op->d_revalidate)(dentry, nd); 548 (dentry->d_op->d_revalidate)(dentry, nd);
547 mutex_lock(&dir->i_mutex); 549 mutex_lock(&dir->i_mutex);
548 }
549 } 550 }
550 551
551 /* 552 /*
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d2cf5a54a4b..9adf5e4f7e9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ 8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ 10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
11 compression.o 11 compression.o delayed-ref.o
12else 12else
13 13
14# Normal Makefile 14# Normal Makefile
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 72677ce2b74..b30986f00b9 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,12 @@ struct btrfs_inode {
66 */ 66 */
67 struct list_head delalloc_inodes; 67 struct list_head delalloc_inodes;
68 68
69 /*
70 * list for tracking inodes that must be sent to disk before a
71 * rename or truncate commit
72 */
73 struct list_head ordered_operations;
74
69 /* the space_info for where this inode's data allocations are done */ 75 /* the space_info for where this inode's data allocations are done */
70 struct btrfs_space_info *space_info; 76 struct btrfs_space_info *space_info;
71 77
@@ -86,12 +92,6 @@ struct btrfs_inode {
86 */ 92 */
87 u64 logged_trans; 93 u64 logged_trans;
88 94
89 /*
90 * trans that last made a change that should be fully fsync'd. This
91 * gets reset to zero each time the inode is logged
92 */
93 u64 log_dirty_trans;
94
95 /* total number of bytes pending delalloc, used by stat to calc the 95 /* total number of bytes pending delalloc, used by stat to calc the
96 * real block usage of the file 96 * real block usage of the file
97 */ 97 */
@@ -121,6 +121,25 @@ struct btrfs_inode {
121 /* the start of block group preferred for allocations. */ 121 /* the start of block group preferred for allocations. */
122 u64 block_group; 122 u64 block_group;
123 123
124 /* the fsync log has some corner cases that mean we have to check
125 * directories to see if any unlinks have been done before
126 * the directory was logged. See tree-log.c for all the
127 * details
128 */
129 u64 last_unlink_trans;
130
131 /*
132 * ordered_data_close is set by truncate when a file that used
133 * to have good data has been truncated to zero. When it is set
134 * the btrfs file release call will add this inode to the
135 * ordered operations list so that we make sure to flush out any
136 * new data the application may have written before commit.
137 *
138 * yes, its silly to have a single bitflag, but we might grow more
139 * of these.
140 */
141 unsigned ordered_data_close:1;
142
124 struct inode vfs_inode; 143 struct inode vfs_inode;
125}; 144};
126 145
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 37f31b5529a..dbb72412463 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -254,18 +254,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
254 * empty_size -- a hint that you plan on doing more cow. This is the size in 254 * empty_size -- a hint that you plan on doing more cow. This is the size in
255 * bytes the allocator should try to find free next to the block it returns. 255 * bytes the allocator should try to find free next to the block it returns.
256 * This is just a hint and may be ignored by the allocator. 256 * This is just a hint and may be ignored by the allocator.
257 *
258 * prealloc_dest -- if you have already reserved a destination for the cow,
259 * this uses that block instead of allocating a new one.
260 * btrfs_alloc_reserved_extent is used to finish the allocation.
261 */ 257 */
262static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, 258static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
263 struct btrfs_root *root, 259 struct btrfs_root *root,
264 struct extent_buffer *buf, 260 struct extent_buffer *buf,
265 struct extent_buffer *parent, int parent_slot, 261 struct extent_buffer *parent, int parent_slot,
266 struct extent_buffer **cow_ret, 262 struct extent_buffer **cow_ret,
267 u64 search_start, u64 empty_size, 263 u64 search_start, u64 empty_size)
268 u64 prealloc_dest)
269{ 264{
270 u64 parent_start; 265 u64 parent_start;
271 struct extent_buffer *cow; 266 struct extent_buffer *cow;
@@ -291,26 +286,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
291 level = btrfs_header_level(buf); 286 level = btrfs_header_level(buf);
292 nritems = btrfs_header_nritems(buf); 287 nritems = btrfs_header_nritems(buf);
293 288
294 if (prealloc_dest) { 289 cow = btrfs_alloc_free_block(trans, root, buf->len,
295 struct btrfs_key ins; 290 parent_start, root->root_key.objectid,
296 291 trans->transid, level,
297 ins.objectid = prealloc_dest; 292 search_start, empty_size);
298 ins.offset = buf->len;
299 ins.type = BTRFS_EXTENT_ITEM_KEY;
300
301 ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
302 root->root_key.objectid,
303 trans->transid, level, &ins);
304 BUG_ON(ret);
305 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
306 buf->len, level);
307 } else {
308 cow = btrfs_alloc_free_block(trans, root, buf->len,
309 parent_start,
310 root->root_key.objectid,
311 trans->transid, level,
312 search_start, empty_size);
313 }
314 if (IS_ERR(cow)) 293 if (IS_ERR(cow))
315 return PTR_ERR(cow); 294 return PTR_ERR(cow);
316 295
@@ -413,7 +392,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
413noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, 392noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
414 struct btrfs_root *root, struct extent_buffer *buf, 393 struct btrfs_root *root, struct extent_buffer *buf,
415 struct extent_buffer *parent, int parent_slot, 394 struct extent_buffer *parent, int parent_slot,
416 struct extent_buffer **cow_ret, u64 prealloc_dest) 395 struct extent_buffer **cow_ret)
417{ 396{
418 u64 search_start; 397 u64 search_start;
419 int ret; 398 int ret;
@@ -436,7 +415,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
436 btrfs_header_owner(buf) == root->root_key.objectid && 415 btrfs_header_owner(buf) == root->root_key.objectid &&
437 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 416 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
438 *cow_ret = buf; 417 *cow_ret = buf;
439 WARN_ON(prealloc_dest);
440 return 0; 418 return 0;
441 } 419 }
442 420
@@ -447,8 +425,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
447 btrfs_set_lock_blocking(buf); 425 btrfs_set_lock_blocking(buf);
448 426
449 ret = __btrfs_cow_block(trans, root, buf, parent, 427 ret = __btrfs_cow_block(trans, root, buf, parent,
450 parent_slot, cow_ret, search_start, 0, 428 parent_slot, cow_ret, search_start, 0);
451 prealloc_dest);
452 return ret; 429 return ret;
453} 430}
454 431
@@ -617,7 +594,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
617 err = __btrfs_cow_block(trans, root, cur, parent, i, 594 err = __btrfs_cow_block(trans, root, cur, parent, i,
618 &cur, search_start, 595 &cur, search_start,
619 min(16 * blocksize, 596 min(16 * blocksize,
620 (end_slot - i) * blocksize), 0); 597 (end_slot - i) * blocksize));
621 if (err) { 598 if (err) {
622 btrfs_tree_unlock(cur); 599 btrfs_tree_unlock(cur);
623 free_extent_buffer(cur); 600 free_extent_buffer(cur);
@@ -937,7 +914,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
937 BUG_ON(!child); 914 BUG_ON(!child);
938 btrfs_tree_lock(child); 915 btrfs_tree_lock(child);
939 btrfs_set_lock_blocking(child); 916 btrfs_set_lock_blocking(child);
940 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); 917 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
941 BUG_ON(ret); 918 BUG_ON(ret);
942 919
943 spin_lock(&root->node_lock); 920 spin_lock(&root->node_lock);
@@ -945,6 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
945 spin_unlock(&root->node_lock); 922 spin_unlock(&root->node_lock);
946 923
947 ret = btrfs_update_extent_ref(trans, root, child->start, 924 ret = btrfs_update_extent_ref(trans, root, child->start,
925 child->len,
948 mid->start, child->start, 926 mid->start, child->start,
949 root->root_key.objectid, 927 root->root_key.objectid,
950 trans->transid, level - 1); 928 trans->transid, level - 1);
@@ -971,6 +949,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
971 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 949 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
972 return 0; 950 return 0;
973 951
952 if (trans->transaction->delayed_refs.flushing &&
953 btrfs_header_nritems(mid) > 2)
954 return 0;
955
974 if (btrfs_header_nritems(mid) < 2) 956 if (btrfs_header_nritems(mid) < 2)
975 err_on_enospc = 1; 957 err_on_enospc = 1;
976 958
@@ -979,7 +961,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
979 btrfs_tree_lock(left); 961 btrfs_tree_lock(left);
980 btrfs_set_lock_blocking(left); 962 btrfs_set_lock_blocking(left);
981 wret = btrfs_cow_block(trans, root, left, 963 wret = btrfs_cow_block(trans, root, left,
982 parent, pslot - 1, &left, 0); 964 parent, pslot - 1, &left);
983 if (wret) { 965 if (wret) {
984 ret = wret; 966 ret = wret;
985 goto enospc; 967 goto enospc;
@@ -990,7 +972,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
990 btrfs_tree_lock(right); 972 btrfs_tree_lock(right);
991 btrfs_set_lock_blocking(right); 973 btrfs_set_lock_blocking(right);
992 wret = btrfs_cow_block(trans, root, right, 974 wret = btrfs_cow_block(trans, root, right,
993 parent, pslot + 1, &right, 0); 975 parent, pslot + 1, &right);
994 if (wret) { 976 if (wret) {
995 ret = wret; 977 ret = wret;
996 goto enospc; 978 goto enospc;
@@ -1171,7 +1153,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1171 wret = 1; 1153 wret = 1;
1172 } else { 1154 } else {
1173 ret = btrfs_cow_block(trans, root, left, parent, 1155 ret = btrfs_cow_block(trans, root, left, parent,
1174 pslot - 1, &left, 0); 1156 pslot - 1, &left);
1175 if (ret) 1157 if (ret)
1176 wret = 1; 1158 wret = 1;
1177 else { 1159 else {
@@ -1222,7 +1204,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1222 } else { 1204 } else {
1223 ret = btrfs_cow_block(trans, root, right, 1205 ret = btrfs_cow_block(trans, root, right,
1224 parent, pslot + 1, 1206 parent, pslot + 1,
1225 &right, 0); 1207 &right);
1226 if (ret) 1208 if (ret)
1227 wret = 1; 1209 wret = 1;
1228 else { 1210 else {
@@ -1492,7 +1474,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1492 u8 lowest_level = 0; 1474 u8 lowest_level = 0;
1493 u64 blocknr; 1475 u64 blocknr;
1494 u64 gen; 1476 u64 gen;
1495 struct btrfs_key prealloc_block;
1496 1477
1497 lowest_level = p->lowest_level; 1478 lowest_level = p->lowest_level;
1498 WARN_ON(lowest_level && ins_len > 0); 1479 WARN_ON(lowest_level && ins_len > 0);
@@ -1501,8 +1482,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1501 if (ins_len < 0) 1482 if (ins_len < 0)
1502 lowest_unlock = 2; 1483 lowest_unlock = 2;
1503 1484
1504 prealloc_block.objectid = 0;
1505
1506again: 1485again:
1507 if (p->skip_locking) 1486 if (p->skip_locking)
1508 b = btrfs_root_node(root); 1487 b = btrfs_root_node(root);
@@ -1529,44 +1508,11 @@ again:
1529 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { 1508 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1530 goto cow_done; 1509 goto cow_done;
1531 } 1510 }
1532
1533 /* ok, we have to cow, is our old prealloc the right
1534 * size?
1535 */
1536 if (prealloc_block.objectid &&
1537 prealloc_block.offset != b->len) {
1538 btrfs_release_path(root, p);
1539 btrfs_free_reserved_extent(root,
1540 prealloc_block.objectid,
1541 prealloc_block.offset);
1542 prealloc_block.objectid = 0;
1543 goto again;
1544 }
1545
1546 /*
1547 * for higher level blocks, try not to allocate blocks
1548 * with the block and the parent locks held.
1549 */
1550 if (level > 0 && !prealloc_block.objectid) {
1551 u32 size = b->len;
1552 u64 hint = b->start;
1553
1554 btrfs_release_path(root, p);
1555 ret = btrfs_reserve_extent(trans, root,
1556 size, size, 0,
1557 hint, (u64)-1,
1558 &prealloc_block, 0);
1559 BUG_ON(ret);
1560 goto again;
1561 }
1562
1563 btrfs_set_path_blocking(p); 1511 btrfs_set_path_blocking(p);
1564 1512
1565 wret = btrfs_cow_block(trans, root, b, 1513 wret = btrfs_cow_block(trans, root, b,
1566 p->nodes[level + 1], 1514 p->nodes[level + 1],
1567 p->slots[level + 1], 1515 p->slots[level + 1], &b);
1568 &b, prealloc_block.objectid);
1569 prealloc_block.objectid = 0;
1570 if (wret) { 1516 if (wret) {
1571 free_extent_buffer(b); 1517 free_extent_buffer(b);
1572 ret = wret; 1518 ret = wret;
@@ -1742,12 +1688,8 @@ done:
1742 * we don't really know what they plan on doing with the path 1688 * we don't really know what they plan on doing with the path
1743 * from here on, so for now just mark it as blocking 1689 * from here on, so for now just mark it as blocking
1744 */ 1690 */
1745 btrfs_set_path_blocking(p); 1691 if (!p->leave_spinning)
1746 if (prealloc_block.objectid) { 1692 btrfs_set_path_blocking(p);
1747 btrfs_free_reserved_extent(root,
1748 prealloc_block.objectid,
1749 prealloc_block.offset);
1750 }
1751 return ret; 1693 return ret;
1752} 1694}
1753 1695
@@ -1768,7 +1710,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1768 int ret; 1710 int ret;
1769 1711
1770 eb = btrfs_lock_root_node(root); 1712 eb = btrfs_lock_root_node(root);
1771 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); 1713 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
1772 BUG_ON(ret); 1714 BUG_ON(ret);
1773 1715
1774 btrfs_set_lock_blocking(eb); 1716 btrfs_set_lock_blocking(eb);
@@ -1826,7 +1768,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1826 } 1768 }
1827 1769
1828 ret = btrfs_cow_block(trans, root, eb, parent, slot, 1770 ret = btrfs_cow_block(trans, root, eb, parent, slot,
1829 &eb, 0); 1771 &eb);
1830 BUG_ON(ret); 1772 BUG_ON(ret);
1831 1773
1832 if (root->root_key.objectid == 1774 if (root->root_key.objectid ==
@@ -2139,7 +2081,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2139 spin_unlock(&root->node_lock); 2081 spin_unlock(&root->node_lock);
2140 2082
2141 ret = btrfs_update_extent_ref(trans, root, lower->start, 2083 ret = btrfs_update_extent_ref(trans, root, lower->start,
2142 lower->start, c->start, 2084 lower->len, lower->start, c->start,
2143 root->root_key.objectid, 2085 root->root_key.objectid,
2144 trans->transid, level - 1); 2086 trans->transid, level - 1);
2145 BUG_ON(ret); 2087 BUG_ON(ret);
@@ -2221,7 +2163,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2221 ret = insert_new_root(trans, root, path, level + 1); 2163 ret = insert_new_root(trans, root, path, level + 1);
2222 if (ret) 2164 if (ret)
2223 return ret; 2165 return ret;
2224 } else { 2166 } else if (!trans->transaction->delayed_refs.flushing) {
2225 ret = push_nodes_for_insert(trans, root, path, level); 2167 ret = push_nodes_for_insert(trans, root, path, level);
2226 c = path->nodes[level]; 2168 c = path->nodes[level];
2227 if (!ret && btrfs_header_nritems(c) < 2169 if (!ret && btrfs_header_nritems(c) <
@@ -2329,66 +2271,27 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2329 return ret; 2271 return ret;
2330} 2272}
2331 2273
2332/* 2274static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2333 * push some data in the path leaf to the right, trying to free up at 2275 struct btrfs_root *root,
2334 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2276 struct btrfs_path *path,
2335 * 2277 int data_size, int empty,
2336 * returns 1 if the push failed because the other node didn't have enough 2278 struct extent_buffer *right,
2337 * room, 0 if everything worked out and < 0 if there were major errors. 2279 int free_space, u32 left_nritems)
2338 */
2339static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2340 *root, struct btrfs_path *path, int data_size,
2341 int empty)
2342{ 2280{
2343 struct extent_buffer *left = path->nodes[0]; 2281 struct extent_buffer *left = path->nodes[0];
2344 struct extent_buffer *right; 2282 struct extent_buffer *upper = path->nodes[1];
2345 struct extent_buffer *upper;
2346 struct btrfs_disk_key disk_key; 2283 struct btrfs_disk_key disk_key;
2347 int slot; 2284 int slot;
2348 u32 i; 2285 u32 i;
2349 int free_space;
2350 int push_space = 0; 2286 int push_space = 0;
2351 int push_items = 0; 2287 int push_items = 0;
2352 struct btrfs_item *item; 2288 struct btrfs_item *item;
2353 u32 left_nritems;
2354 u32 nr; 2289 u32 nr;
2355 u32 right_nritems; 2290 u32 right_nritems;
2356 u32 data_end; 2291 u32 data_end;
2357 u32 this_item_size; 2292 u32 this_item_size;
2358 int ret; 2293 int ret;
2359 2294
2360 slot = path->slots[1];
2361 if (!path->nodes[1])
2362 return 1;
2363
2364 upper = path->nodes[1];
2365 if (slot >= btrfs_header_nritems(upper) - 1)
2366 return 1;
2367
2368 btrfs_assert_tree_locked(path->nodes[1]);
2369
2370 right = read_node_slot(root, upper, slot + 1);
2371 btrfs_tree_lock(right);
2372 btrfs_set_lock_blocking(right);
2373
2374 free_space = btrfs_leaf_free_space(root, right);
2375 if (free_space < data_size)
2376 goto out_unlock;
2377
2378 /* cow and double check */
2379 ret = btrfs_cow_block(trans, root, right, upper,
2380 slot + 1, &right, 0);
2381 if (ret)
2382 goto out_unlock;
2383
2384 free_space = btrfs_leaf_free_space(root, right);
2385 if (free_space < data_size)
2386 goto out_unlock;
2387
2388 left_nritems = btrfs_header_nritems(left);
2389 if (left_nritems == 0)
2390 goto out_unlock;
2391
2392 if (empty) 2295 if (empty)
2393 nr = 0; 2296 nr = 0;
2394 else 2297 else
@@ -2397,6 +2300,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2397 if (path->slots[0] >= left_nritems) 2300 if (path->slots[0] >= left_nritems)
2398 push_space += data_size; 2301 push_space += data_size;
2399 2302
2303 slot = path->slots[1];
2400 i = left_nritems - 1; 2304 i = left_nritems - 1;
2401 while (i >= nr) { 2305 while (i >= nr) {
2402 item = btrfs_item_nr(left, i); 2306 item = btrfs_item_nr(left, i);
@@ -2528,24 +2432,82 @@ out_unlock:
2528} 2432}
2529 2433
2530/* 2434/*
2435 * push some data in the path leaf to the right, trying to free up at
2436 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2437 *
2438 * returns 1 if the push failed because the other node didn't have enough
2439 * room, 0 if everything worked out and < 0 if there were major errors.
2440 */
2441static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2442 *root, struct btrfs_path *path, int data_size,
2443 int empty)
2444{
2445 struct extent_buffer *left = path->nodes[0];
2446 struct extent_buffer *right;
2447 struct extent_buffer *upper;
2448 int slot;
2449 int free_space;
2450 u32 left_nritems;
2451 int ret;
2452
2453 if (!path->nodes[1])
2454 return 1;
2455
2456 slot = path->slots[1];
2457 upper = path->nodes[1];
2458 if (slot >= btrfs_header_nritems(upper) - 1)
2459 return 1;
2460
2461 btrfs_assert_tree_locked(path->nodes[1]);
2462
2463 right = read_node_slot(root, upper, slot + 1);
2464 btrfs_tree_lock(right);
2465 btrfs_set_lock_blocking(right);
2466
2467 free_space = btrfs_leaf_free_space(root, right);
2468 if (free_space < data_size)
2469 goto out_unlock;
2470
2471 /* cow and double check */
2472 ret = btrfs_cow_block(trans, root, right, upper,
2473 slot + 1, &right);
2474 if (ret)
2475 goto out_unlock;
2476
2477 free_space = btrfs_leaf_free_space(root, right);
2478 if (free_space < data_size)
2479 goto out_unlock;
2480
2481 left_nritems = btrfs_header_nritems(left);
2482 if (left_nritems == 0)
2483 goto out_unlock;
2484
2485 return __push_leaf_right(trans, root, path, data_size, empty,
2486 right, free_space, left_nritems);
2487out_unlock:
2488 btrfs_tree_unlock(right);
2489 free_extent_buffer(right);
2490 return 1;
2491}
2492
2493/*
2531 * push some data in the path leaf to the left, trying to free up at 2494 * push some data in the path leaf to the left, trying to free up at
2532 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2495 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2533 */ 2496 */
2534static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 2497static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2535 *root, struct btrfs_path *path, int data_size, 2498 struct btrfs_root *root,
2536 int empty) 2499 struct btrfs_path *path, int data_size,
2500 int empty, struct extent_buffer *left,
2501 int free_space, int right_nritems)
2537{ 2502{
2538 struct btrfs_disk_key disk_key; 2503 struct btrfs_disk_key disk_key;
2539 struct extent_buffer *right = path->nodes[0]; 2504 struct extent_buffer *right = path->nodes[0];
2540 struct extent_buffer *left;
2541 int slot; 2505 int slot;
2542 int i; 2506 int i;
2543 int free_space;
2544 int push_space = 0; 2507 int push_space = 0;
2545 int push_items = 0; 2508 int push_items = 0;
2546 struct btrfs_item *item; 2509 struct btrfs_item *item;
2547 u32 old_left_nritems; 2510 u32 old_left_nritems;
2548 u32 right_nritems;
2549 u32 nr; 2511 u32 nr;
2550 int ret = 0; 2512 int ret = 0;
2551 int wret; 2513 int wret;
@@ -2553,41 +2515,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2553 u32 old_left_item_size; 2515 u32 old_left_item_size;
2554 2516
2555 slot = path->slots[1]; 2517 slot = path->slots[1];
2556 if (slot == 0)
2557 return 1;
2558 if (!path->nodes[1])
2559 return 1;
2560
2561 right_nritems = btrfs_header_nritems(right);
2562 if (right_nritems == 0)
2563 return 1;
2564
2565 btrfs_assert_tree_locked(path->nodes[1]);
2566
2567 left = read_node_slot(root, path->nodes[1], slot - 1);
2568 btrfs_tree_lock(left);
2569 btrfs_set_lock_blocking(left);
2570
2571 free_space = btrfs_leaf_free_space(root, left);
2572 if (free_space < data_size) {
2573 ret = 1;
2574 goto out;
2575 }
2576
2577 /* cow and double check */
2578 ret = btrfs_cow_block(trans, root, left,
2579 path->nodes[1], slot - 1, &left, 0);
2580 if (ret) {
2581 /* we hit -ENOSPC, but it isn't fatal here */
2582 ret = 1;
2583 goto out;
2584 }
2585
2586 free_space = btrfs_leaf_free_space(root, left);
2587 if (free_space < data_size) {
2588 ret = 1;
2589 goto out;
2590 }
2591 2518
2592 if (empty) 2519 if (empty)
2593 nr = right_nritems; 2520 nr = right_nritems;
@@ -2755,6 +2682,154 @@ out:
2755} 2682}
2756 2683
2757/* 2684/*
2685 * push some data in the path leaf to the left, trying to free up at
2686 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2687 */
2688static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2689 *root, struct btrfs_path *path, int data_size,
2690 int empty)
2691{
2692 struct extent_buffer *right = path->nodes[0];
2693 struct extent_buffer *left;
2694 int slot;
2695 int free_space;
2696 u32 right_nritems;
2697 int ret = 0;
2698
2699 slot = path->slots[1];
2700 if (slot == 0)
2701 return 1;
2702 if (!path->nodes[1])
2703 return 1;
2704
2705 right_nritems = btrfs_header_nritems(right);
2706 if (right_nritems == 0)
2707 return 1;
2708
2709 btrfs_assert_tree_locked(path->nodes[1]);
2710
2711 left = read_node_slot(root, path->nodes[1], slot - 1);
2712 btrfs_tree_lock(left);
2713 btrfs_set_lock_blocking(left);
2714
2715 free_space = btrfs_leaf_free_space(root, left);
2716 if (free_space < data_size) {
2717 ret = 1;
2718 goto out;
2719 }
2720
2721 /* cow and double check */
2722 ret = btrfs_cow_block(trans, root, left,
2723 path->nodes[1], slot - 1, &left);
2724 if (ret) {
2725 /* we hit -ENOSPC, but it isn't fatal here */
2726 ret = 1;
2727 goto out;
2728 }
2729
2730 free_space = btrfs_leaf_free_space(root, left);
2731 if (free_space < data_size) {
2732 ret = 1;
2733 goto out;
2734 }
2735
2736 return __push_leaf_left(trans, root, path, data_size,
2737 empty, left, free_space, right_nritems);
2738out:
2739 btrfs_tree_unlock(left);
2740 free_extent_buffer(left);
2741 return ret;
2742}
2743
2744/*
2745 * split the path's leaf in two, making sure there is at least data_size
2746 * available for the resulting leaf level of the path.
2747 *
2748 * returns 0 if all went well and < 0 on failure.
2749 */
2750static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2751 struct btrfs_root *root,
2752 struct btrfs_path *path,
2753 struct extent_buffer *l,
2754 struct extent_buffer *right,
2755 int slot, int mid, int nritems)
2756{
2757 int data_copy_size;
2758 int rt_data_off;
2759 int i;
2760 int ret = 0;
2761 int wret;
2762 struct btrfs_disk_key disk_key;
2763
2764 nritems = nritems - mid;
2765 btrfs_set_header_nritems(right, nritems);
2766 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2767
2768 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2769 btrfs_item_nr_offset(mid),
2770 nritems * sizeof(struct btrfs_item));
2771
2772 copy_extent_buffer(right, l,
2773 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2774 data_copy_size, btrfs_leaf_data(l) +
2775 leaf_data_end(root, l), data_copy_size);
2776
2777 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2778 btrfs_item_end_nr(l, mid);
2779
2780 for (i = 0; i < nritems; i++) {
2781 struct btrfs_item *item = btrfs_item_nr(right, i);
2782 u32 ioff;
2783
2784 if (!right->map_token) {
2785 map_extent_buffer(right, (unsigned long)item,
2786 sizeof(struct btrfs_item),
2787 &right->map_token, &right->kaddr,
2788 &right->map_start, &right->map_len,
2789 KM_USER1);
2790 }
2791
2792 ioff = btrfs_item_offset(right, item);
2793 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2794 }
2795
2796 if (right->map_token) {
2797 unmap_extent_buffer(right, right->map_token, KM_USER1);
2798 right->map_token = NULL;
2799 }
2800
2801 btrfs_set_header_nritems(l, mid);
2802 ret = 0;
2803 btrfs_item_key(right, &disk_key, 0);
2804 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2805 path->slots[1] + 1, 1);
2806 if (wret)
2807 ret = wret;
2808
2809 btrfs_mark_buffer_dirty(right);
2810 btrfs_mark_buffer_dirty(l);
2811 BUG_ON(path->slots[0] != slot);
2812
2813 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2814 BUG_ON(ret);
2815
2816 if (mid <= slot) {
2817 btrfs_tree_unlock(path->nodes[0]);
2818 free_extent_buffer(path->nodes[0]);
2819 path->nodes[0] = right;
2820 path->slots[0] -= mid;
2821 path->slots[1] += 1;
2822 } else {
2823 btrfs_tree_unlock(right);
2824 free_extent_buffer(right);
2825 }
2826
2827 BUG_ON(path->slots[0] < 0);
2828
2829 return ret;
2830}
2831
2832/*
2758 * split the path's leaf in two, making sure there is at least data_size 2833 * split the path's leaf in two, making sure there is at least data_size
2759 * available for the resulting leaf level of the path. 2834 * available for the resulting leaf level of the path.
2760 * 2835 *
@@ -2771,17 +2846,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2771 int mid; 2846 int mid;
2772 int slot; 2847 int slot;
2773 struct extent_buffer *right; 2848 struct extent_buffer *right;
2774 int data_copy_size;
2775 int rt_data_off;
2776 int i;
2777 int ret = 0; 2849 int ret = 0;
2778 int wret; 2850 int wret;
2779 int double_split; 2851 int double_split;
2780 int num_doubles = 0; 2852 int num_doubles = 0;
2781 struct btrfs_disk_key disk_key;
2782 2853
2783 /* first try to make some room by pushing left and right */ 2854 /* first try to make some room by pushing left and right */
2784 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { 2855 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY &&
2856 !trans->transaction->delayed_refs.flushing) {
2785 wret = push_leaf_right(trans, root, path, data_size, 0); 2857 wret = push_leaf_right(trans, root, path, data_size, 0);
2786 if (wret < 0) 2858 if (wret < 0)
2787 return wret; 2859 return wret;
@@ -2830,11 +2902,14 @@ again:
2830 write_extent_buffer(right, root->fs_info->chunk_tree_uuid, 2902 write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2831 (unsigned long)btrfs_header_chunk_tree_uuid(right), 2903 (unsigned long)btrfs_header_chunk_tree_uuid(right),
2832 BTRFS_UUID_SIZE); 2904 BTRFS_UUID_SIZE);
2905
2833 if (mid <= slot) { 2906 if (mid <= slot) {
2834 if (nritems == 1 || 2907 if (nritems == 1 ||
2835 leaf_space_used(l, mid, nritems - mid) + data_size > 2908 leaf_space_used(l, mid, nritems - mid) + data_size >
2836 BTRFS_LEAF_DATA_SIZE(root)) { 2909 BTRFS_LEAF_DATA_SIZE(root)) {
2837 if (slot >= nritems) { 2910 if (slot >= nritems) {
2911 struct btrfs_disk_key disk_key;
2912
2838 btrfs_cpu_key_to_disk(&disk_key, ins_key); 2913 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2839 btrfs_set_header_nritems(right, 0); 2914 btrfs_set_header_nritems(right, 0);
2840 wret = insert_ptr(trans, root, path, 2915 wret = insert_ptr(trans, root, path,
@@ -2862,6 +2937,8 @@ again:
2862 if (leaf_space_used(l, 0, mid) + data_size > 2937 if (leaf_space_used(l, 0, mid) + data_size >
2863 BTRFS_LEAF_DATA_SIZE(root)) { 2938 BTRFS_LEAF_DATA_SIZE(root)) {
2864 if (!extend && data_size && slot == 0) { 2939 if (!extend && data_size && slot == 0) {
2940 struct btrfs_disk_key disk_key;
2941
2865 btrfs_cpu_key_to_disk(&disk_key, ins_key); 2942 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2866 btrfs_set_header_nritems(right, 0); 2943 btrfs_set_header_nritems(right, 0);
2867 wret = insert_ptr(trans, root, path, 2944 wret = insert_ptr(trans, root, path,
@@ -2894,76 +2971,16 @@ again:
2894 } 2971 }
2895 } 2972 }
2896 } 2973 }
2897 nritems = nritems - mid;
2898 btrfs_set_header_nritems(right, nritems);
2899 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2900
2901 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2902 btrfs_item_nr_offset(mid),
2903 nritems * sizeof(struct btrfs_item));
2904
2905 copy_extent_buffer(right, l,
2906 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2907 data_copy_size, btrfs_leaf_data(l) +
2908 leaf_data_end(root, l), data_copy_size);
2909
2910 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2911 btrfs_item_end_nr(l, mid);
2912
2913 for (i = 0; i < nritems; i++) {
2914 struct btrfs_item *item = btrfs_item_nr(right, i);
2915 u32 ioff;
2916
2917 if (!right->map_token) {
2918 map_extent_buffer(right, (unsigned long)item,
2919 sizeof(struct btrfs_item),
2920 &right->map_token, &right->kaddr,
2921 &right->map_start, &right->map_len,
2922 KM_USER1);
2923 }
2924
2925 ioff = btrfs_item_offset(right, item);
2926 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2927 }
2928
2929 if (right->map_token) {
2930 unmap_extent_buffer(right, right->map_token, KM_USER1);
2931 right->map_token = NULL;
2932 }
2933
2934 btrfs_set_header_nritems(l, mid);
2935 ret = 0;
2936 btrfs_item_key(right, &disk_key, 0);
2937 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2938 path->slots[1] + 1, 1);
2939 if (wret)
2940 ret = wret;
2941
2942 btrfs_mark_buffer_dirty(right);
2943 btrfs_mark_buffer_dirty(l);
2944 BUG_ON(path->slots[0] != slot);
2945 2974
2946 ret = btrfs_update_ref(trans, root, l, right, 0, nritems); 2975 ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
2947 BUG_ON(ret); 2976 BUG_ON(ret);
2948 2977
2949 if (mid <= slot) {
2950 btrfs_tree_unlock(path->nodes[0]);
2951 free_extent_buffer(path->nodes[0]);
2952 path->nodes[0] = right;
2953 path->slots[0] -= mid;
2954 path->slots[1] += 1;
2955 } else {
2956 btrfs_tree_unlock(right);
2957 free_extent_buffer(right);
2958 }
2959
2960 BUG_ON(path->slots[0] < 0);
2961
2962 if (double_split) { 2978 if (double_split) {
2963 BUG_ON(num_doubles != 0); 2979 BUG_ON(num_doubles != 0);
2964 num_doubles++; 2980 num_doubles++;
2965 goto again; 2981 goto again;
2966 } 2982 }
2983
2967 return ret; 2984 return ret;
2968} 2985}
2969 2986
@@ -3021,26 +3038,27 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
3021 return -EAGAIN; 3038 return -EAGAIN;
3022 } 3039 }
3023 3040
3041 btrfs_set_path_blocking(path);
3024 ret = split_leaf(trans, root, &orig_key, path, 3042 ret = split_leaf(trans, root, &orig_key, path,
3025 sizeof(struct btrfs_item), 1); 3043 sizeof(struct btrfs_item), 1);
3026 path->keep_locks = 0; 3044 path->keep_locks = 0;
3027 BUG_ON(ret); 3045 BUG_ON(ret);
3028 3046
3047 btrfs_unlock_up_safe(path, 1);
3048 leaf = path->nodes[0];
3049 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3050
3051split:
3029 /* 3052 /*
3030 * make sure any changes to the path from split_leaf leave it 3053 * make sure any changes to the path from split_leaf leave it
3031 * in a blocking state 3054 * in a blocking state
3032 */ 3055 */
3033 btrfs_set_path_blocking(path); 3056 btrfs_set_path_blocking(path);
3034 3057
3035 leaf = path->nodes[0];
3036 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3037
3038split:
3039 item = btrfs_item_nr(leaf, path->slots[0]); 3058 item = btrfs_item_nr(leaf, path->slots[0]);
3040 orig_offset = btrfs_item_offset(leaf, item); 3059 orig_offset = btrfs_item_offset(leaf, item);
3041 item_size = btrfs_item_size(leaf, item); 3060 item_size = btrfs_item_size(leaf, item);
3042 3061
3043
3044 buf = kmalloc(item_size, GFP_NOFS); 3062 buf = kmalloc(item_size, GFP_NOFS);
3045 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, 3063 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3046 path->slots[0]), item_size); 3064 path->slots[0]), item_size);
@@ -3445,39 +3463,27 @@ out:
3445} 3463}
3446 3464
3447/* 3465/*
3448 * Given a key and some data, insert items into the tree. 3466 * this is a helper for btrfs_insert_empty_items, the main goal here is
3449 * This does all the path init required, making room in the tree if needed. 3467 * to save stack depth by doing the bulk of the work in a function
3468 * that doesn't call btrfs_search_slot
3450 */ 3469 */
3451int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 3470static noinline_for_stack int
3452 struct btrfs_root *root, 3471setup_items_for_insert(struct btrfs_trans_handle *trans,
3453 struct btrfs_path *path, 3472 struct btrfs_root *root, struct btrfs_path *path,
3454 struct btrfs_key *cpu_key, u32 *data_size, 3473 struct btrfs_key *cpu_key, u32 *data_size,
3455 int nr) 3474 u32 total_data, u32 total_size, int nr)
3456{ 3475{
3457 struct extent_buffer *leaf;
3458 struct btrfs_item *item; 3476 struct btrfs_item *item;
3459 int ret = 0;
3460 int slot;
3461 int slot_orig;
3462 int i; 3477 int i;
3463 u32 nritems; 3478 u32 nritems;
3464 u32 total_size = 0;
3465 u32 total_data = 0;
3466 unsigned int data_end; 3479 unsigned int data_end;
3467 struct btrfs_disk_key disk_key; 3480 struct btrfs_disk_key disk_key;
3481 int ret;
3482 struct extent_buffer *leaf;
3483 int slot;
3468 3484
3469 for (i = 0; i < nr; i++)
3470 total_data += data_size[i];
3471
3472 total_size = total_data + (nr * sizeof(struct btrfs_item));
3473 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3474 if (ret == 0)
3475 return -EEXIST;
3476 if (ret < 0)
3477 goto out;
3478
3479 slot_orig = path->slots[0];
3480 leaf = path->nodes[0]; 3485 leaf = path->nodes[0];
3486 slot = path->slots[0];
3481 3487
3482 nritems = btrfs_header_nritems(leaf); 3488 nritems = btrfs_header_nritems(leaf);
3483 data_end = leaf_data_end(root, leaf); 3489 data_end = leaf_data_end(root, leaf);
@@ -3489,9 +3495,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3489 BUG(); 3495 BUG();
3490 } 3496 }
3491 3497
3492 slot = path->slots[0];
3493 BUG_ON(slot < 0);
3494
3495 if (slot != nritems) { 3498 if (slot != nritems) {
3496 unsigned int old_data = btrfs_item_end_nr(leaf, slot); 3499 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3497 3500
@@ -3547,21 +3550,60 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3547 data_end -= data_size[i]; 3550 data_end -= data_size[i];
3548 btrfs_set_item_size(leaf, item, data_size[i]); 3551 btrfs_set_item_size(leaf, item, data_size[i]);
3549 } 3552 }
3553
3550 btrfs_set_header_nritems(leaf, nritems + nr); 3554 btrfs_set_header_nritems(leaf, nritems + nr);
3551 btrfs_mark_buffer_dirty(leaf);
3552 3555
3553 ret = 0; 3556 ret = 0;
3554 if (slot == 0) { 3557 if (slot == 0) {
3558 struct btrfs_disk_key disk_key;
3555 btrfs_cpu_key_to_disk(&disk_key, cpu_key); 3559 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3556 ret = fixup_low_keys(trans, root, path, &disk_key, 1); 3560 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3557 } 3561 }
3562 btrfs_unlock_up_safe(path, 1);
3563 btrfs_mark_buffer_dirty(leaf);
3558 3564
3559 if (btrfs_leaf_free_space(root, leaf) < 0) { 3565 if (btrfs_leaf_free_space(root, leaf) < 0) {
3560 btrfs_print_leaf(root, leaf); 3566 btrfs_print_leaf(root, leaf);
3561 BUG(); 3567 BUG();
3562 } 3568 }
3569 return ret;
3570}
3571
3572/*
3573 * Given a key and some data, insert items into the tree.
3574 * This does all the path init required, making room in the tree if needed.
3575 */
3576int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3577 struct btrfs_root *root,
3578 struct btrfs_path *path,
3579 struct btrfs_key *cpu_key, u32 *data_size,
3580 int nr)
3581{
3582 struct extent_buffer *leaf;
3583 int ret = 0;
3584 int slot;
3585 int i;
3586 u32 total_size = 0;
3587 u32 total_data = 0;
3588
3589 for (i = 0; i < nr; i++)
3590 total_data += data_size[i];
3591
3592 total_size = total_data + (nr * sizeof(struct btrfs_item));
3593 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3594 if (ret == 0)
3595 return -EEXIST;
3596 if (ret < 0)
3597 goto out;
3598
3599 leaf = path->nodes[0];
3600 slot = path->slots[0];
3601 BUG_ON(slot < 0);
3602
3603 ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
3604 total_data, total_size, nr);
3605
3563out: 3606out:
3564 btrfs_unlock_up_safe(path, 1);
3565 return ret; 3607 return ret;
3566} 3608}
3567 3609
@@ -3749,7 +3791,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3749 } 3791 }
3750 3792
3751 /* delete the leaf if it is mostly empty */ 3793 /* delete the leaf if it is mostly empty */
3752 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) { 3794 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 &&
3795 !trans->transaction->delayed_refs.flushing) {
3753 /* push_leaf_left fixes the path. 3796 /* push_leaf_left fixes the path.
3754 * make sure the path still points to our leaf 3797 * make sure the path still points to our leaf
3755 * for possible call to del_ptr below 3798 * for possible call to del_ptr below
@@ -3757,6 +3800,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3757 slot = path->slots[1]; 3800 slot = path->slots[1];
3758 extent_buffer_get(leaf); 3801 extent_buffer_get(leaf);
3759 3802
3803 btrfs_set_path_blocking(path);
3760 wret = push_leaf_left(trans, root, path, 1, 1); 3804 wret = push_leaf_left(trans, root, path, 1, 1);
3761 if (wret < 0 && wret != -ENOSPC) 3805 if (wret < 0 && wret != -ENOSPC)
3762 ret = wret; 3806 ret = wret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5e1d4e30e9d..9417713542a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,13 @@ struct btrfs_ordered_sum;
45 45
46#define BTRFS_MAX_LEVEL 8 46#define BTRFS_MAX_LEVEL 8
47 47
48/*
49 * files bigger than this get some pre-flushing when they are added
50 * to the ordered operations list. That way we limit the total
51 * work done by the commit
52 */
53#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
54
48/* holds pointers to all of the tree roots */ 55/* holds pointers to all of the tree roots */
49#define BTRFS_ROOT_TREE_OBJECTID 1ULL 56#define BTRFS_ROOT_TREE_OBJECTID 1ULL
50 57
@@ -401,15 +408,16 @@ struct btrfs_path {
401 int locks[BTRFS_MAX_LEVEL]; 408 int locks[BTRFS_MAX_LEVEL];
402 int reada; 409 int reada;
403 /* keep some upper locks as we walk down */ 410 /* keep some upper locks as we walk down */
404 int keep_locks;
405 int skip_locking;
406 int lowest_level; 411 int lowest_level;
407 412
408 /* 413 /*
409 * set by btrfs_split_item, tells search_slot to keep all locks 414 * set by btrfs_split_item, tells search_slot to keep all locks
410 * and to force calls to keep space in the nodes 415 * and to force calls to keep space in the nodes
411 */ 416 */
412 int search_for_split; 417 unsigned int search_for_split:1;
418 unsigned int keep_locks:1;
419 unsigned int skip_locking:1;
420 unsigned int leave_spinning:1;
413}; 421};
414 422
415/* 423/*
@@ -688,15 +696,18 @@ struct btrfs_fs_info {
688 struct rb_root block_group_cache_tree; 696 struct rb_root block_group_cache_tree;
689 697
690 struct extent_io_tree pinned_extents; 698 struct extent_io_tree pinned_extents;
691 struct extent_io_tree pending_del;
692 struct extent_io_tree extent_ins;
693 699
694 /* logical->physical extent mapping */ 700 /* logical->physical extent mapping */
695 struct btrfs_mapping_tree mapping_tree; 701 struct btrfs_mapping_tree mapping_tree;
696 702
697 u64 generation; 703 u64 generation;
698 u64 last_trans_committed; 704 u64 last_trans_committed;
699 u64 last_trans_new_blockgroup; 705
706 /*
707 * this is updated to the current trans every time a full commit
708 * is required instead of the faster short fsync log commits
709 */
710 u64 last_trans_log_full_commit;
700 u64 open_ioctl_trans; 711 u64 open_ioctl_trans;
701 unsigned long mount_opt; 712 unsigned long mount_opt;
702 u64 max_extent; 713 u64 max_extent;
@@ -717,12 +728,21 @@ struct btrfs_fs_info {
717 struct mutex tree_log_mutex; 728 struct mutex tree_log_mutex;
718 struct mutex transaction_kthread_mutex; 729 struct mutex transaction_kthread_mutex;
719 struct mutex cleaner_mutex; 730 struct mutex cleaner_mutex;
720 struct mutex extent_ins_mutex;
721 struct mutex pinned_mutex; 731 struct mutex pinned_mutex;
722 struct mutex chunk_mutex; 732 struct mutex chunk_mutex;
723 struct mutex drop_mutex; 733 struct mutex drop_mutex;
724 struct mutex volume_mutex; 734 struct mutex volume_mutex;
725 struct mutex tree_reloc_mutex; 735 struct mutex tree_reloc_mutex;
736
737 /*
738 * this protects the ordered operations list only while we are
739 * processing all of the entries on it. This way we make
740 * sure the commit code doesn't find the list temporarily empty
741 * because another function happens to be doing non-waiting preflush
742 * before jumping into the main commit.
743 */
744 struct mutex ordered_operations_mutex;
745
726 struct list_head trans_list; 746 struct list_head trans_list;
727 struct list_head hashers; 747 struct list_head hashers;
728 struct list_head dead_roots; 748 struct list_head dead_roots;
@@ -737,10 +757,29 @@ struct btrfs_fs_info {
737 * ordered extents 757 * ordered extents
738 */ 758 */
739 spinlock_t ordered_extent_lock; 759 spinlock_t ordered_extent_lock;
760
761 /*
762 * all of the data=ordered extents pending writeback
763 * these can span multiple transactions and basically include
764 * every dirty data page that isn't from nodatacow
765 */
740 struct list_head ordered_extents; 766 struct list_head ordered_extents;
767
768 /*
769 * all of the inodes that have delalloc bytes. It is possible for
770 * this list to be empty even when there is still dirty data=ordered
771 * extents waiting to finish IO.
772 */
741 struct list_head delalloc_inodes; 773 struct list_head delalloc_inodes;
742 774
743 /* 775 /*
776 * special rename and truncate targets that must be on disk before
777 * we're allowed to commit. This is basically the ext3 style
778 * data=ordered list.
779 */
780 struct list_head ordered_operations;
781
782 /*
744 * there is a pool of worker threads for checksumming during writes 783 * there is a pool of worker threads for checksumming during writes
745 * and a pool for checksumming after reads. This is because readers 784 * and a pool for checksumming after reads. This is because readers
746 * can run with FS locks held, and the writers may be waiting for 785 * can run with FS locks held, and the writers may be waiting for
@@ -781,6 +820,11 @@ struct btrfs_fs_info {
781 atomic_t throttle_gen; 820 atomic_t throttle_gen;
782 821
783 u64 total_pinned; 822 u64 total_pinned;
823
824 /* protected by the delalloc lock, used to keep from writing
825 * metadata until there is a nice batch
826 */
827 u64 dirty_metadata_bytes;
784 struct list_head dirty_cowonly_roots; 828 struct list_head dirty_cowonly_roots;
785 829
786 struct btrfs_fs_devices *fs_devices; 830 struct btrfs_fs_devices *fs_devices;
@@ -1704,18 +1748,15 @@ static inline struct dentry *fdentry(struct file *file)
1704} 1748}
1705 1749
1706/* extent-tree.c */ 1750/* extent-tree.c */
1751int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1752 struct btrfs_root *root, unsigned long count);
1707int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); 1753int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1708int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1709 struct btrfs_root *root, u64 bytenr,
1710 u64 num_bytes, u32 *refs);
1711int btrfs_update_pinned_extents(struct btrfs_root *root, 1754int btrfs_update_pinned_extents(struct btrfs_root *root,
1712 u64 bytenr, u64 num, int pin); 1755 u64 bytenr, u64 num, int pin);
1713int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 1756int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
1714 struct btrfs_root *root, struct extent_buffer *leaf); 1757 struct btrfs_root *root, struct extent_buffer *leaf);
1715int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 1758int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1716 struct btrfs_root *root, u64 objectid, u64 bytenr); 1759 struct btrfs_root *root, u64 objectid, u64 bytenr);
1717int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1718 struct btrfs_root *root);
1719int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); 1760int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
1720struct btrfs_block_group_cache *btrfs_lookup_block_group( 1761struct btrfs_block_group_cache *btrfs_lookup_block_group(
1721 struct btrfs_fs_info *info, 1762 struct btrfs_fs_info *info,
@@ -1777,7 +1818,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1777 u64 root_objectid, u64 ref_generation, 1818 u64 root_objectid, u64 ref_generation,
1778 u64 owner_objectid); 1819 u64 owner_objectid);
1779int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 1820int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1780 struct btrfs_root *root, u64 bytenr, 1821 struct btrfs_root *root, u64 bytenr, u64 num_bytes,
1781 u64 orig_parent, u64 parent, 1822 u64 orig_parent, u64 parent,
1782 u64 root_objectid, u64 ref_generation, 1823 u64 root_objectid, u64 ref_generation,
1783 u64 owner_objectid); 1824 u64 owner_objectid);
@@ -1838,7 +1879,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
1838int btrfs_cow_block(struct btrfs_trans_handle *trans, 1879int btrfs_cow_block(struct btrfs_trans_handle *trans,
1839 struct btrfs_root *root, struct extent_buffer *buf, 1880 struct btrfs_root *root, struct extent_buffer *buf,
1840 struct extent_buffer *parent, int parent_slot, 1881 struct extent_buffer *parent, int parent_slot,
1841 struct extent_buffer **cow_ret, u64 prealloc_dest); 1882 struct extent_buffer **cow_ret);
1842int btrfs_copy_root(struct btrfs_trans_handle *trans, 1883int btrfs_copy_root(struct btrfs_trans_handle *trans,
1843 struct btrfs_root *root, 1884 struct btrfs_root *root,
1844 struct extent_buffer *buf, 1885 struct extent_buffer *buf,
@@ -2060,7 +2101,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
2060unsigned long btrfs_force_ra(struct address_space *mapping, 2101unsigned long btrfs_force_ra(struct address_space *mapping,
2061 struct file_ra_state *ra, struct file *file, 2102 struct file_ra_state *ra, struct file *file,
2062 pgoff_t offset, pgoff_t last_index); 2103 pgoff_t offset, pgoff_t last_index);
2063int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); 2104int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2064int btrfs_readpage(struct file *file, struct page *page); 2105int btrfs_readpage(struct file *file, struct page *page);
2065void btrfs_delete_inode(struct inode *inode); 2106void btrfs_delete_inode(struct inode *inode);
2066void btrfs_put_inode(struct inode *inode); 2107void btrfs_put_inode(struct inode *inode);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
new file mode 100644
index 00000000000..cbf7dc8ae3e
--- /dev/null
+++ b/fs/btrfs/delayed-ref.c
@@ -0,0 +1,669 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/sort.h>
21#include <linux/ftrace.h>
22#include "ctree.h"
23#include "delayed-ref.h"
24#include "transaction.h"
25
26/*
27 * delayed back reference update tracking. For subvolume trees
28 * we queue up extent allocations and backref maintenance for
29 * delayed processing. This avoids deep call chains where we
30 * add extents in the middle of btrfs_search_slot, and it allows
31 * us to buffer up frequently modified backrefs in an rb tree instead
32 * of hammering updates on the extent allocation tree.
33 *
34 * Right now this code is only used for reference counted trees, but
35 * the long term goal is to get rid of the similar code for delayed
36 * extent tree modifications.
37 */
38
39/*
40 * entries in the rb tree are ordered by the byte number of the extent
41 * and by the byte number of the parent block.
42 */
43static int comp_entry(struct btrfs_delayed_ref_node *ref,
44 u64 bytenr, u64 parent)
45{
46 if (bytenr < ref->bytenr)
47 return -1;
48 if (bytenr > ref->bytenr)
49 return 1;
50 if (parent < ref->parent)
51 return -1;
52 if (parent > ref->parent)
53 return 1;
54 return 0;
55}
56
57/*
58 * insert a new ref into the rbtree. This returns any existing refs
59 * for the same (bytenr,parent) tuple, or NULL if the new node was properly
60 * inserted.
61 */
62static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
63 u64 bytenr, u64 parent,
64 struct rb_node *node)
65{
66 struct rb_node **p = &root->rb_node;
67 struct rb_node *parent_node = NULL;
68 struct btrfs_delayed_ref_node *entry;
69 int cmp;
70
71 while (*p) {
72 parent_node = *p;
73 entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
74 rb_node);
75
76 cmp = comp_entry(entry, bytenr, parent);
77 if (cmp < 0)
78 p = &(*p)->rb_left;
79 else if (cmp > 0)
80 p = &(*p)->rb_right;
81 else
82 return entry;
83 }
84
85 entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
86 rb_link_node(node, parent_node, p);
87 rb_insert_color(node, root);
88 return NULL;
89}
90
91/*
92 * find an entry based on (bytenr,parent). This returns the delayed
93 * ref if it was able to find one, or NULL if nothing was in that spot
94 */
95static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
96 u64 bytenr, u64 parent,
97 struct btrfs_delayed_ref_node **last)
98{
99 struct rb_node *n = root->rb_node;
100 struct btrfs_delayed_ref_node *entry;
101 int cmp;
102
103 while (n) {
104 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
105 WARN_ON(!entry->in_tree);
106 if (last)
107 *last = entry;
108
109 cmp = comp_entry(entry, bytenr, parent);
110 if (cmp < 0)
111 n = n->rb_left;
112 else if (cmp > 0)
113 n = n->rb_right;
114 else
115 return entry;
116 }
117 return NULL;
118}
119
120int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
121 struct btrfs_delayed_ref_head *head)
122{
123 struct btrfs_delayed_ref_root *delayed_refs;
124
125 delayed_refs = &trans->transaction->delayed_refs;
126 assert_spin_locked(&delayed_refs->lock);
127 if (mutex_trylock(&head->mutex))
128 return 0;
129
130 atomic_inc(&head->node.refs);
131 spin_unlock(&delayed_refs->lock);
132
133 mutex_lock(&head->mutex);
134 spin_lock(&delayed_refs->lock);
135 if (!head->node.in_tree) {
136 mutex_unlock(&head->mutex);
137 btrfs_put_delayed_ref(&head->node);
138 return -EAGAIN;
139 }
140 btrfs_put_delayed_ref(&head->node);
141 return 0;
142}
143
144int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
145 struct list_head *cluster, u64 start)
146{
147 int count = 0;
148 struct btrfs_delayed_ref_root *delayed_refs;
149 struct rb_node *node;
150 struct btrfs_delayed_ref_node *ref;
151 struct btrfs_delayed_ref_head *head;
152
153 delayed_refs = &trans->transaction->delayed_refs;
154 if (start == 0) {
155 node = rb_first(&delayed_refs->root);
156 } else {
157 ref = NULL;
158 tree_search(&delayed_refs->root, start, (u64)-1, &ref);
159 if (ref) {
160 struct btrfs_delayed_ref_node *tmp;
161
162 node = rb_prev(&ref->rb_node);
163 while (node) {
164 tmp = rb_entry(node,
165 struct btrfs_delayed_ref_node,
166 rb_node);
167 if (tmp->bytenr < start)
168 break;
169 ref = tmp;
170 node = rb_prev(&ref->rb_node);
171 }
172 node = &ref->rb_node;
173 } else
174 node = rb_first(&delayed_refs->root);
175 }
176again:
177 while (node && count < 32) {
178 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
179 if (btrfs_delayed_ref_is_head(ref)) {
180 head = btrfs_delayed_node_to_head(ref);
181 if (list_empty(&head->cluster)) {
182 list_add_tail(&head->cluster, cluster);
183 delayed_refs->run_delayed_start =
184 head->node.bytenr;
185 count++;
186
187 WARN_ON(delayed_refs->num_heads_ready == 0);
188 delayed_refs->num_heads_ready--;
189 } else if (count) {
190 /* the goal of the clustering is to find extents
191 * that are likely to end up in the same extent
192 * leaf on disk. So, we don't want them spread
193 * all over the tree. Stop now if we've hit
194 * a head that was already in use
195 */
196 break;
197 }
198 }
199 node = rb_next(node);
200 }
201 if (count) {
202 return 0;
203 } else if (start) {
204 /*
205 * we've gone to the end of the rbtree without finding any
206 * clusters. start from the beginning and try again
207 */
208 start = 0;
209 node = rb_first(&delayed_refs->root);
210 goto again;
211 }
212 return 1;
213}
214
215/*
216 * This checks to see if there are any delayed refs in the
217 * btree for a given bytenr. It returns one if it finds any
218 * and zero otherwise.
219 *
220 * If it only finds a head node, it returns 0.
221 *
222 * The idea is to use this when deciding if you can safely delete an
223 * extent from the extent allocation tree. There may be a pending
224 * ref in the rbtree that adds or removes references, so as long as this
225 * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
226 * allocation tree.
227 */
228int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
229{
230 struct btrfs_delayed_ref_node *ref;
231 struct btrfs_delayed_ref_root *delayed_refs;
232 struct rb_node *prev_node;
233 int ret = 0;
234
235 delayed_refs = &trans->transaction->delayed_refs;
236 spin_lock(&delayed_refs->lock);
237
238 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
239 if (ref) {
240 prev_node = rb_prev(&ref->rb_node);
241 if (!prev_node)
242 goto out;
243 ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
244 rb_node);
245 if (ref->bytenr == bytenr)
246 ret = 1;
247 }
248out:
249 spin_unlock(&delayed_refs->lock);
250 return ret;
251}
252
253/*
254 * helper function to lookup reference count
255 *
256 * the head node for delayed ref is used to store the sum of all the
257 * reference count modifications queued up in the rbtree. This way you
258 * can check to see what the reference count would be if all of the
259 * delayed refs are processed.
260 */
261int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
262 struct btrfs_root *root, u64 bytenr,
263 u64 num_bytes, u32 *refs)
264{
265 struct btrfs_delayed_ref_node *ref;
266 struct btrfs_delayed_ref_head *head;
267 struct btrfs_delayed_ref_root *delayed_refs;
268 struct btrfs_path *path;
269 struct extent_buffer *leaf;
270 struct btrfs_extent_item *ei;
271 struct btrfs_key key;
272 u32 num_refs;
273 int ret;
274
275 path = btrfs_alloc_path();
276 if (!path)
277 return -ENOMEM;
278
279 key.objectid = bytenr;
280 key.type = BTRFS_EXTENT_ITEM_KEY;
281 key.offset = num_bytes;
282 delayed_refs = &trans->transaction->delayed_refs;
283again:
284 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
285 &key, path, 0, 0);
286 if (ret < 0)
287 goto out;
288
289 if (ret == 0) {
290 leaf = path->nodes[0];
291 ei = btrfs_item_ptr(leaf, path->slots[0],
292 struct btrfs_extent_item);
293 num_refs = btrfs_extent_refs(leaf, ei);
294 } else {
295 num_refs = 0;
296 ret = 0;
297 }
298
299 spin_lock(&delayed_refs->lock);
300 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
301 if (ref) {
302 head = btrfs_delayed_node_to_head(ref);
303 if (mutex_trylock(&head->mutex)) {
304 num_refs += ref->ref_mod;
305 mutex_unlock(&head->mutex);
306 *refs = num_refs;
307 goto out;
308 }
309
310 atomic_inc(&ref->refs);
311 spin_unlock(&delayed_refs->lock);
312
313 btrfs_release_path(root->fs_info->extent_root, path);
314
315 mutex_lock(&head->mutex);
316 mutex_unlock(&head->mutex);
317 btrfs_put_delayed_ref(ref);
318 goto again;
319 } else {
320 *refs = num_refs;
321 }
322out:
323 spin_unlock(&delayed_refs->lock);
324 btrfs_free_path(path);
325 return ret;
326}
327
328/*
329 * helper function to update an extent delayed ref in the
330 * rbtree. existing and update must both have the same
331 * bytenr and parent
332 *
333 * This may free existing if the update cancels out whatever
334 * operation it was doing.
335 */
336static noinline void
337update_existing_ref(struct btrfs_trans_handle *trans,
338 struct btrfs_delayed_ref_root *delayed_refs,
339 struct btrfs_delayed_ref_node *existing,
340 struct btrfs_delayed_ref_node *update)
341{
342 struct btrfs_delayed_ref *existing_ref;
343 struct btrfs_delayed_ref *ref;
344
345 existing_ref = btrfs_delayed_node_to_ref(existing);
346 ref = btrfs_delayed_node_to_ref(update);
347
348 if (ref->pin)
349 existing_ref->pin = 1;
350
351 if (ref->action != existing_ref->action) {
352 /*
353 * this is effectively undoing either an add or a
354 * drop. We decrement the ref_mod, and if it goes
355 * down to zero we just delete the entry without
356 * every changing the extent allocation tree.
357 */
358 existing->ref_mod--;
359 if (existing->ref_mod == 0) {
360 rb_erase(&existing->rb_node,
361 &delayed_refs->root);
362 existing->in_tree = 0;
363 btrfs_put_delayed_ref(existing);
364 delayed_refs->num_entries--;
365 if (trans->delayed_ref_updates)
366 trans->delayed_ref_updates--;
367 }
368 } else {
369 if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
370 /* if we're adding refs, make sure all the
371 * details match up. The extent could
372 * have been totally freed and reallocated
373 * by a different owner before the delayed
374 * ref entries were removed.
375 */
376 existing_ref->owner_objectid = ref->owner_objectid;
377 existing_ref->generation = ref->generation;
378 existing_ref->root = ref->root;
379 existing->num_bytes = update->num_bytes;
380 }
381 /*
382 * the action on the existing ref matches
383 * the action on the ref we're trying to add.
384 * Bump the ref_mod by one so the backref that
385 * is eventually added/removed has the correct
386 * reference count
387 */
388 existing->ref_mod += update->ref_mod;
389 }
390}
391
392/*
393 * helper function to update the accounting in the head ref
394 * existing and update must have the same bytenr
395 */
396static noinline void
397update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
398 struct btrfs_delayed_ref_node *update)
399{
400 struct btrfs_delayed_ref_head *existing_ref;
401 struct btrfs_delayed_ref_head *ref;
402
403 existing_ref = btrfs_delayed_node_to_head(existing);
404 ref = btrfs_delayed_node_to_head(update);
405
406 if (ref->must_insert_reserved) {
407 /* if the extent was freed and then
408 * reallocated before the delayed ref
409 * entries were processed, we can end up
410 * with an existing head ref without
411 * the must_insert_reserved flag set.
412 * Set it again here
413 */
414 existing_ref->must_insert_reserved = ref->must_insert_reserved;
415
416 /*
417 * update the num_bytes so we make sure the accounting
418 * is done correctly
419 */
420 existing->num_bytes = update->num_bytes;
421
422 }
423
424 /*
425 * update the reference mod on the head to reflect this new operation
426 */
427 existing->ref_mod += update->ref_mod;
428}
429
430/*
431 * helper function to actually insert a delayed ref into the rbtree.
432 * this does all the dirty work in terms of maintaining the correct
433 * overall modification count in the head node and properly dealing
434 * with updating existing nodes as new modifications are queued.
435 */
436static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
437 struct btrfs_delayed_ref_node *ref,
438 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
439 u64 ref_generation, u64 owner_objectid, int action,
440 int pin)
441{
442 struct btrfs_delayed_ref_node *existing;
443 struct btrfs_delayed_ref *full_ref;
444 struct btrfs_delayed_ref_head *head_ref = NULL;
445 struct btrfs_delayed_ref_root *delayed_refs;
446 int count_mod = 1;
447 int must_insert_reserved = 0;
448
449 /*
450 * the head node stores the sum of all the mods, so dropping a ref
451 * should drop the sum in the head node by one.
452 */
453 if (parent == (u64)-1) {
454 if (action == BTRFS_DROP_DELAYED_REF)
455 count_mod = -1;
456 else if (action == BTRFS_UPDATE_DELAYED_HEAD)
457 count_mod = 0;
458 }
459
460 /*
461 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
462 * the reserved accounting when the extent is finally added, or
463 * if a later modification deletes the delayed ref without ever
464 * inserting the extent into the extent allocation tree.
465 * ref->must_insert_reserved is the flag used to record
466 * that accounting mods are required.
467 *
468 * Once we record must_insert_reserved, switch the action to
469 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
470 */
471 if (action == BTRFS_ADD_DELAYED_EXTENT) {
472 must_insert_reserved = 1;
473 action = BTRFS_ADD_DELAYED_REF;
474 } else {
475 must_insert_reserved = 0;
476 }
477
478
479 delayed_refs = &trans->transaction->delayed_refs;
480
481 /* first set the basic ref node struct up */
482 atomic_set(&ref->refs, 1);
483 ref->bytenr = bytenr;
484 ref->parent = parent;
485 ref->ref_mod = count_mod;
486 ref->in_tree = 1;
487 ref->num_bytes = num_bytes;
488
489 if (btrfs_delayed_ref_is_head(ref)) {
490 head_ref = btrfs_delayed_node_to_head(ref);
491 head_ref->must_insert_reserved = must_insert_reserved;
492 INIT_LIST_HEAD(&head_ref->cluster);
493 mutex_init(&head_ref->mutex);
494 } else {
495 full_ref = btrfs_delayed_node_to_ref(ref);
496 full_ref->root = ref_root;
497 full_ref->generation = ref_generation;
498 full_ref->owner_objectid = owner_objectid;
499 full_ref->pin = pin;
500 full_ref->action = action;
501 }
502
503 existing = tree_insert(&delayed_refs->root, bytenr,
504 parent, &ref->rb_node);
505
506 if (existing) {
507 if (btrfs_delayed_ref_is_head(ref))
508 update_existing_head_ref(existing, ref);
509 else
510 update_existing_ref(trans, delayed_refs, existing, ref);
511
512 /*
513 * we've updated the existing ref, free the newly
514 * allocated ref
515 */
516 kfree(ref);
517 } else {
518 if (btrfs_delayed_ref_is_head(ref)) {
519 delayed_refs->num_heads++;
520 delayed_refs->num_heads_ready++;
521 }
522 delayed_refs->num_entries++;
523 trans->delayed_ref_updates++;
524 }
525 return 0;
526}
527
528/*
529 * add a delayed ref to the tree. This does all of the accounting required
530 * to make sure the delayed ref is eventually processed before this
531 * transaction commits.
532 */
533int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
534 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
535 u64 ref_generation, u64 owner_objectid, int action,
536 int pin)
537{
538 struct btrfs_delayed_ref *ref;
539 struct btrfs_delayed_ref_head *head_ref;
540 struct btrfs_delayed_ref_root *delayed_refs;
541 int ret;
542
543 ref = kmalloc(sizeof(*ref), GFP_NOFS);
544 if (!ref)
545 return -ENOMEM;
546
547 /*
548 * the parent = 0 case comes from cases where we don't actually
549 * know the parent yet. It will get updated later via a add/drop
550 * pair.
551 */
552 if (parent == 0)
553 parent = bytenr;
554
555 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
556 if (!head_ref) {
557 kfree(ref);
558 return -ENOMEM;
559 }
560 delayed_refs = &trans->transaction->delayed_refs;
561 spin_lock(&delayed_refs->lock);
562
563 /*
564 * insert both the head node and the new ref without dropping
565 * the spin lock
566 */
567 ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
568 (u64)-1, 0, 0, 0, action, pin);
569 BUG_ON(ret);
570
571 ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
572 parent, ref_root, ref_generation,
573 owner_objectid, action, pin);
574 BUG_ON(ret);
575 spin_unlock(&delayed_refs->lock);
576 return 0;
577}
578
579/*
580 * this does a simple search for the head node for a given extent.
581 * It must be called with the delayed ref spinlock held, and it returns
582 * the head node if any where found, or NULL if not.
583 */
584struct btrfs_delayed_ref_head *
585btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
586{
587 struct btrfs_delayed_ref_node *ref;
588 struct btrfs_delayed_ref_root *delayed_refs;
589
590 delayed_refs = &trans->transaction->delayed_refs;
591 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
592 if (ref)
593 return btrfs_delayed_node_to_head(ref);
594 return NULL;
595}
596
597/*
598 * add a delayed ref to the tree. This does all of the accounting required
599 * to make sure the delayed ref is eventually processed before this
600 * transaction commits.
601 *
602 * The main point of this call is to add and remove a backreference in a single
603 * shot, taking the lock only once, and only searching for the head node once.
604 *
605 * It is the same as doing a ref add and delete in two separate calls.
606 */
607int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
608 u64 bytenr, u64 num_bytes, u64 orig_parent,
609 u64 parent, u64 orig_ref_root, u64 ref_root,
610 u64 orig_ref_generation, u64 ref_generation,
611 u64 owner_objectid, int pin)
612{
613 struct btrfs_delayed_ref *ref;
614 struct btrfs_delayed_ref *old_ref;
615 struct btrfs_delayed_ref_head *head_ref;
616 struct btrfs_delayed_ref_root *delayed_refs;
617 int ret;
618
619 ref = kmalloc(sizeof(*ref), GFP_NOFS);
620 if (!ref)
621 return -ENOMEM;
622
623 old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
624 if (!old_ref) {
625 kfree(ref);
626 return -ENOMEM;
627 }
628
629 /*
630 * the parent = 0 case comes from cases where we don't actually
631 * know the parent yet. It will get updated later via a add/drop
632 * pair.
633 */
634 if (parent == 0)
635 parent = bytenr;
636 if (orig_parent == 0)
637 orig_parent = bytenr;
638
639 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
640 if (!head_ref) {
641 kfree(ref);
642 kfree(old_ref);
643 return -ENOMEM;
644 }
645 delayed_refs = &trans->transaction->delayed_refs;
646 spin_lock(&delayed_refs->lock);
647
648 /*
649 * insert both the head node and the new ref without dropping
650 * the spin lock
651 */
652 ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
653 (u64)-1, 0, 0, 0,
654 BTRFS_UPDATE_DELAYED_HEAD, 0);
655 BUG_ON(ret);
656
657 ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
658 parent, ref_root, ref_generation,
659 owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
660 BUG_ON(ret);
661
662 ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
663 orig_parent, orig_ref_root,
664 orig_ref_generation, owner_objectid,
665 BTRFS_DROP_DELAYED_REF, pin);
666 BUG_ON(ret);
667 spin_unlock(&delayed_refs->lock);
668 return 0;
669}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
new file mode 100644
index 00000000000..3bec2ff0b15
--- /dev/null
+++ b/fs/btrfs/delayed-ref.h
@@ -0,0 +1,193 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#ifndef __DELAYED_REF__
19#define __DELAYED_REF__
20
21/* these are the possible values of struct btrfs_delayed_ref->action */
22#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
23#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
24#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
25#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
26
27struct btrfs_delayed_ref_node {
28 struct rb_node rb_node;
29
30 /* the starting bytenr of the extent */
31 u64 bytenr;
32
33 /* the parent our backref will point to */
34 u64 parent;
35
36 /* the size of the extent */
37 u64 num_bytes;
38
39 /* ref count on this data structure */
40 atomic_t refs;
41
42 /*
43 * how many refs is this entry adding or deleting. For
44 * head refs, this may be a negative number because it is keeping
45 * track of the total mods done to the reference count.
46 * For individual refs, this will always be a positive number
47 *
48 * It may be more than one, since it is possible for a single
49 * parent to have more than one ref on an extent
50 */
51 int ref_mod;
52
53 /* is this node still in the rbtree? */
54 unsigned int in_tree:1;
55};
56
57/*
58 * the head refs are used to hold a lock on a given extent, which allows us
59 * to make sure that only one process is running the delayed refs
60 * at a time for a single extent. They also store the sum of all the
61 * reference count modifications we've queued up.
62 */
63struct btrfs_delayed_ref_head {
64 struct btrfs_delayed_ref_node node;
65
66 /*
67 * the mutex is held while running the refs, and it is also
68 * held when checking the sum of reference modifications.
69 */
70 struct mutex mutex;
71
72 struct list_head cluster;
73
74 /*
75 * when a new extent is allocated, it is just reserved in memory
76 * The actual extent isn't inserted into the extent allocation tree
77 * until the delayed ref is processed. must_insert_reserved is
78 * used to flag a delayed ref so the accounting can be updated
79 * when a full insert is done.
80 *
81 * It is possible the extent will be freed before it is ever
82 * inserted into the extent allocation tree. In this case
83 * we need to update the in ram accounting to properly reflect
84 * the free has happened.
85 */
86 unsigned int must_insert_reserved:1;
87};
88
89struct btrfs_delayed_ref {
90 struct btrfs_delayed_ref_node node;
91
92 /* the root objectid our ref will point to */
93 u64 root;
94
95 /* the generation for the backref */
96 u64 generation;
97
98 /* owner_objectid of the backref */
99 u64 owner_objectid;
100
101 /* operation done by this entry in the rbtree */
102 u8 action;
103
104 /* if pin == 1, when the extent is freed it will be pinned until
105 * transaction commit
106 */
107 unsigned int pin:1;
108};
109
110struct btrfs_delayed_ref_root {
111 struct rb_root root;
112
113 /* this spin lock protects the rbtree and the entries inside */
114 spinlock_t lock;
115
116 /* how many delayed ref updates we've queued, used by the
117 * throttling code
118 */
119 unsigned long num_entries;
120
121 /* total number of head nodes in tree */
122 unsigned long num_heads;
123
124 /* total number of head nodes ready for processing */
125 unsigned long num_heads_ready;
126
127 /*
128 * set when the tree is flushing before a transaction commit,
129 * used by the throttling code to decide if new updates need
130 * to be run right away
131 */
132 int flushing;
133
134 u64 run_delayed_start;
135};
136
137static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
138{
139 WARN_ON(atomic_read(&ref->refs) == 0);
140 if (atomic_dec_and_test(&ref->refs)) {
141 WARN_ON(ref->in_tree);
142 kfree(ref);
143 }
144}
145
146int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
147 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
148 u64 ref_generation, u64 owner_objectid, int action,
149 int pin);
150
151struct btrfs_delayed_ref_head *
152btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
153int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
154int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
155 struct btrfs_root *root, u64 bytenr,
156 u64 num_bytes, u32 *refs);
157int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
158 u64 bytenr, u64 num_bytes, u64 orig_parent,
159 u64 parent, u64 orig_ref_root, u64 ref_root,
160 u64 orig_ref_generation, u64 ref_generation,
161 u64 owner_objectid, int pin);
162int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
163 struct btrfs_delayed_ref_head *head);
164int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
165 struct list_head *cluster, u64 search_start);
166/*
167 * a node might live in a head or a regular ref, this lets you
168 * test for the proper type to use.
169 */
170static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
171{
172 return node->parent == (u64)-1;
173}
174
175/*
176 * helper functions to cast a node into its container
177 */
178static inline struct btrfs_delayed_ref *
179btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node)
180{
181 WARN_ON(btrfs_delayed_ref_is_head(node));
182 return container_of(node, struct btrfs_delayed_ref, node);
183
184}
185
186static inline struct btrfs_delayed_ref_head *
187btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
188{
189 WARN_ON(!btrfs_delayed_ref_is_head(node));
190 return container_of(node, struct btrfs_delayed_ref_head, node);
191
192}
193#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 926a0b287a7..1d70236ba00 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -145,7 +145,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
145 key.objectid = dir; 145 key.objectid = dir;
146 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 146 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
147 key.offset = btrfs_name_hash(name, name_len); 147 key.offset = btrfs_name_hash(name, name_len);
148
148 path = btrfs_alloc_path(); 149 path = btrfs_alloc_path();
150 path->leave_spinning = 1;
151
149 data_size = sizeof(*dir_item) + name_len; 152 data_size = sizeof(*dir_item) + name_len;
150 dir_item = insert_with_overflow(trans, root, path, &key, data_size, 153 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
151 name, name_len); 154 name, name_len);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6ec80c0fc86..92d73929d38 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -668,14 +668,31 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
668static int btree_writepage(struct page *page, struct writeback_control *wbc) 668static int btree_writepage(struct page *page, struct writeback_control *wbc)
669{ 669{
670 struct extent_io_tree *tree; 670 struct extent_io_tree *tree;
671 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
672 struct extent_buffer *eb;
673 int was_dirty;
674
671 tree = &BTRFS_I(page->mapping->host)->io_tree; 675 tree = &BTRFS_I(page->mapping->host)->io_tree;
676 if (!(current->flags & PF_MEMALLOC)) {
677 return extent_write_full_page(tree, page,
678 btree_get_extent, wbc);
679 }
672 680
673 if (current->flags & PF_MEMALLOC) { 681 redirty_page_for_writepage(wbc, page);
674 redirty_page_for_writepage(wbc, page); 682 eb = btrfs_find_tree_block(root, page_offset(page),
675 unlock_page(page); 683 PAGE_CACHE_SIZE);
676 return 0; 684 WARN_ON(!eb);
685
686 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
687 if (!was_dirty) {
688 spin_lock(&root->fs_info->delalloc_lock);
689 root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
690 spin_unlock(&root->fs_info->delalloc_lock);
677 } 691 }
678 return extent_write_full_page(tree, page, btree_get_extent, wbc); 692 free_extent_buffer(eb);
693
694 unlock_page(page);
695 return 0;
679} 696}
680 697
681static int btree_writepages(struct address_space *mapping, 698static int btree_writepages(struct address_space *mapping,
@@ -684,15 +701,15 @@ static int btree_writepages(struct address_space *mapping,
684 struct extent_io_tree *tree; 701 struct extent_io_tree *tree;
685 tree = &BTRFS_I(mapping->host)->io_tree; 702 tree = &BTRFS_I(mapping->host)->io_tree;
686 if (wbc->sync_mode == WB_SYNC_NONE) { 703 if (wbc->sync_mode == WB_SYNC_NONE) {
704 struct btrfs_root *root = BTRFS_I(mapping->host)->root;
687 u64 num_dirty; 705 u64 num_dirty;
688 u64 start = 0;
689 unsigned long thresh = 32 * 1024 * 1024; 706 unsigned long thresh = 32 * 1024 * 1024;
690 707
691 if (wbc->for_kupdate) 708 if (wbc->for_kupdate)
692 return 0; 709 return 0;
693 710
694 num_dirty = count_range_bits(tree, &start, (u64)-1, 711 /* this is a bit racy, but that's ok */
695 thresh, EXTENT_DIRTY); 712 num_dirty = root->fs_info->dirty_metadata_bytes;
696 if (num_dirty < thresh) 713 if (num_dirty < thresh)
697 return 0; 714 return 0;
698 } 715 }
@@ -859,9 +876,17 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
859 root->fs_info->running_transaction->transid) { 876 root->fs_info->running_transaction->transid) {
860 btrfs_assert_tree_locked(buf); 877 btrfs_assert_tree_locked(buf);
861 878
862 /* ugh, clear_extent_buffer_dirty can be expensive */ 879 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
863 btrfs_set_lock_blocking(buf); 880 spin_lock(&root->fs_info->delalloc_lock);
881 if (root->fs_info->dirty_metadata_bytes >= buf->len)
882 root->fs_info->dirty_metadata_bytes -= buf->len;
883 else
884 WARN_ON(1);
885 spin_unlock(&root->fs_info->delalloc_lock);
886 }
864 887
888 /* ugh, clear_extent_buffer_dirty needs to lock the page */
889 btrfs_set_lock_blocking(buf);
865 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, 890 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
866 buf); 891 buf);
867 } 892 }
@@ -1471,12 +1496,6 @@ static int transaction_kthread(void *arg)
1471 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1496 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1472 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1497 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1473 1498
1474 if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
1475 printk(KERN_INFO "btrfs: total reference cache "
1476 "size %llu\n",
1477 root->fs_info->total_ref_cache_size);
1478 }
1479
1480 mutex_lock(&root->fs_info->trans_mutex); 1499 mutex_lock(&root->fs_info->trans_mutex);
1481 cur = root->fs_info->running_transaction; 1500 cur = root->fs_info->running_transaction;
1482 if (!cur) { 1501 if (!cur) {
@@ -1493,6 +1512,7 @@ static int transaction_kthread(void *arg)
1493 mutex_unlock(&root->fs_info->trans_mutex); 1512 mutex_unlock(&root->fs_info->trans_mutex);
1494 trans = btrfs_start_transaction(root, 1); 1513 trans = btrfs_start_transaction(root, 1);
1495 ret = btrfs_commit_transaction(trans, root); 1514 ret = btrfs_commit_transaction(trans, root);
1515
1496sleep: 1516sleep:
1497 wake_up_process(root->fs_info->cleaner_kthread); 1517 wake_up_process(root->fs_info->cleaner_kthread);
1498 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 1518 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1552,6 +1572,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1552 INIT_LIST_HEAD(&fs_info->dead_roots); 1572 INIT_LIST_HEAD(&fs_info->dead_roots);
1553 INIT_LIST_HEAD(&fs_info->hashers); 1573 INIT_LIST_HEAD(&fs_info->hashers);
1554 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 1574 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1575 INIT_LIST_HEAD(&fs_info->ordered_operations);
1555 spin_lock_init(&fs_info->delalloc_lock); 1576 spin_lock_init(&fs_info->delalloc_lock);
1556 spin_lock_init(&fs_info->new_trans_lock); 1577 spin_lock_init(&fs_info->new_trans_lock);
1557 spin_lock_init(&fs_info->ref_cache_lock); 1578 spin_lock_init(&fs_info->ref_cache_lock);
@@ -1611,10 +1632,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1611 1632
1612 extent_io_tree_init(&fs_info->pinned_extents, 1633 extent_io_tree_init(&fs_info->pinned_extents,
1613 fs_info->btree_inode->i_mapping, GFP_NOFS); 1634 fs_info->btree_inode->i_mapping, GFP_NOFS);
1614 extent_io_tree_init(&fs_info->pending_del,
1615 fs_info->btree_inode->i_mapping, GFP_NOFS);
1616 extent_io_tree_init(&fs_info->extent_ins,
1617 fs_info->btree_inode->i_mapping, GFP_NOFS);
1618 fs_info->do_barriers = 1; 1635 fs_info->do_barriers = 1;
1619 1636
1620 INIT_LIST_HEAD(&fs_info->dead_reloc_roots); 1637 INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
@@ -1627,9 +1644,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1627 insert_inode_hash(fs_info->btree_inode); 1644 insert_inode_hash(fs_info->btree_inode);
1628 1645
1629 mutex_init(&fs_info->trans_mutex); 1646 mutex_init(&fs_info->trans_mutex);
1647 mutex_init(&fs_info->ordered_operations_mutex);
1630 mutex_init(&fs_info->tree_log_mutex); 1648 mutex_init(&fs_info->tree_log_mutex);
1631 mutex_init(&fs_info->drop_mutex); 1649 mutex_init(&fs_info->drop_mutex);
1632 mutex_init(&fs_info->extent_ins_mutex);
1633 mutex_init(&fs_info->pinned_mutex); 1650 mutex_init(&fs_info->pinned_mutex);
1634 mutex_init(&fs_info->chunk_mutex); 1651 mutex_init(&fs_info->chunk_mutex);
1635 mutex_init(&fs_info->transaction_kthread_mutex); 1652 mutex_init(&fs_info->transaction_kthread_mutex);
@@ -2358,8 +2375,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2358 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; 2375 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2359 u64 transid = btrfs_header_generation(buf); 2376 u64 transid = btrfs_header_generation(buf);
2360 struct inode *btree_inode = root->fs_info->btree_inode; 2377 struct inode *btree_inode = root->fs_info->btree_inode;
2361 2378 int was_dirty;
2362 btrfs_set_lock_blocking(buf);
2363 2379
2364 btrfs_assert_tree_locked(buf); 2380 btrfs_assert_tree_locked(buf);
2365 if (transid != root->fs_info->generation) { 2381 if (transid != root->fs_info->generation) {
@@ -2370,7 +2386,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2370 (unsigned long long)root->fs_info->generation); 2386 (unsigned long long)root->fs_info->generation);
2371 WARN_ON(1); 2387 WARN_ON(1);
2372 } 2388 }
2373 set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); 2389 was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
2390 buf);
2391 if (!was_dirty) {
2392 spin_lock(&root->fs_info->delalloc_lock);
2393 root->fs_info->dirty_metadata_bytes += buf->len;
2394 spin_unlock(&root->fs_info->delalloc_lock);
2395 }
2374} 2396}
2375 2397
2376void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 2398void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
@@ -2410,6 +2432,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2410int btree_lock_page_hook(struct page *page) 2432int btree_lock_page_hook(struct page *page)
2411{ 2433{
2412 struct inode *inode = page->mapping->host; 2434 struct inode *inode = page->mapping->host;
2435 struct btrfs_root *root = BTRFS_I(inode)->root;
2413 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2436 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2414 struct extent_buffer *eb; 2437 struct extent_buffer *eb;
2415 unsigned long len; 2438 unsigned long len;
@@ -2425,6 +2448,16 @@ int btree_lock_page_hook(struct page *page)
2425 2448
2426 btrfs_tree_lock(eb); 2449 btrfs_tree_lock(eb);
2427 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 2450 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2451
2452 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
2453 spin_lock(&root->fs_info->delalloc_lock);
2454 if (root->fs_info->dirty_metadata_bytes >= eb->len)
2455 root->fs_info->dirty_metadata_bytes -= eb->len;
2456 else
2457 WARN_ON(1);
2458 spin_unlock(&root->fs_info->delalloc_lock);
2459 }
2460
2428 btrfs_tree_unlock(eb); 2461 btrfs_tree_unlock(eb);
2429 free_extent_buffer(eb); 2462 free_extent_buffer(eb);
2430out: 2463out:
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 95029db227b..c958ecbc191 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -72,6 +72,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
72void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 72void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
73int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 73int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
74void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 74void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
75void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf);
75int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); 76int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
76int btrfs_set_buffer_uptodate(struct extent_buffer *buf); 77int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
77int wait_on_tree_block_writeback(struct btrfs_root *root, 78int wait_on_tree_block_writeback(struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fefe83ad205..f5e7cae63d8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -49,17 +49,23 @@ struct pending_extent_op {
49 int del; 49 int del;
50}; 50};
51 51
52static int finish_current_insert(struct btrfs_trans_handle *trans, 52static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
53 struct btrfs_root *extent_root, int all); 53 struct btrfs_root *root, u64 parent,
54static int del_pending_extents(struct btrfs_trans_handle *trans, 54 u64 root_objectid, u64 ref_generation,
55 struct btrfs_root *extent_root, int all); 55 u64 owner, struct btrfs_key *ins,
56static int pin_down_bytes(struct btrfs_trans_handle *trans, 56 int ref_mod);
57 struct btrfs_root *root, 57static int update_reserved_extents(struct btrfs_root *root,
58 u64 bytenr, u64 num_bytes, int is_data); 58 u64 bytenr, u64 num, int reserve);
59static int update_block_group(struct btrfs_trans_handle *trans, 59static int update_block_group(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root, 60 struct btrfs_root *root,
61 u64 bytenr, u64 num_bytes, int alloc, 61 u64 bytenr, u64 num_bytes, int alloc,
62 int mark_free); 62 int mark_free);
63static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
64 struct btrfs_root *root,
65 u64 bytenr, u64 num_bytes, u64 parent,
66 u64 root_objectid, u64 ref_generation,
67 u64 owner_objectid, int pin,
68 int ref_to_drop);
63 69
64static int do_chunk_alloc(struct btrfs_trans_handle *trans, 70static int do_chunk_alloc(struct btrfs_trans_handle *trans,
65 struct btrfs_root *extent_root, u64 alloc_bytes, 71 struct btrfs_root *extent_root, u64 alloc_bytes,
@@ -554,262 +560,13 @@ out:
554 return ret; 560 return ret;
555} 561}
556 562
557/*
558 * updates all the backrefs that are pending on update_list for the
559 * extent_root
560 */
561static noinline int update_backrefs(struct btrfs_trans_handle *trans,
562 struct btrfs_root *extent_root,
563 struct btrfs_path *path,
564 struct list_head *update_list)
565{
566 struct btrfs_key key;
567 struct btrfs_extent_ref *ref;
568 struct btrfs_fs_info *info = extent_root->fs_info;
569 struct pending_extent_op *op;
570 struct extent_buffer *leaf;
571 int ret = 0;
572 struct list_head *cur = update_list->next;
573 u64 ref_objectid;
574 u64 ref_root = extent_root->root_key.objectid;
575
576 op = list_entry(cur, struct pending_extent_op, list);
577
578search:
579 key.objectid = op->bytenr;
580 key.type = BTRFS_EXTENT_REF_KEY;
581 key.offset = op->orig_parent;
582
583 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
584 BUG_ON(ret);
585
586 leaf = path->nodes[0];
587
588loop:
589 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
590
591 ref_objectid = btrfs_ref_objectid(leaf, ref);
592
593 if (btrfs_ref_root(leaf, ref) != ref_root ||
594 btrfs_ref_generation(leaf, ref) != op->orig_generation ||
595 (ref_objectid != op->level &&
596 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
597 printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
598 "root %llu, owner %u\n",
599 (unsigned long long)op->bytenr,
600 (unsigned long long)op->orig_parent,
601 (unsigned long long)ref_root, op->level);
602 btrfs_print_leaf(extent_root, leaf);
603 BUG();
604 }
605
606 key.objectid = op->bytenr;
607 key.offset = op->parent;
608 key.type = BTRFS_EXTENT_REF_KEY;
609 ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
610 BUG_ON(ret);
611 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
612 btrfs_set_ref_generation(leaf, ref, op->generation);
613
614 cur = cur->next;
615
616 list_del_init(&op->list);
617 unlock_extent(&info->extent_ins, op->bytenr,
618 op->bytenr + op->num_bytes - 1, GFP_NOFS);
619 kfree(op);
620
621 if (cur == update_list) {
622 btrfs_mark_buffer_dirty(path->nodes[0]);
623 btrfs_release_path(extent_root, path);
624 goto out;
625 }
626
627 op = list_entry(cur, struct pending_extent_op, list);
628
629 path->slots[0]++;
630 while (path->slots[0] < btrfs_header_nritems(leaf)) {
631 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
632 if (key.objectid == op->bytenr &&
633 key.type == BTRFS_EXTENT_REF_KEY)
634 goto loop;
635 path->slots[0]++;
636 }
637
638 btrfs_mark_buffer_dirty(path->nodes[0]);
639 btrfs_release_path(extent_root, path);
640 goto search;
641
642out:
643 return 0;
644}
645
646static noinline int insert_extents(struct btrfs_trans_handle *trans,
647 struct btrfs_root *extent_root,
648 struct btrfs_path *path,
649 struct list_head *insert_list, int nr)
650{
651 struct btrfs_key *keys;
652 u32 *data_size;
653 struct pending_extent_op *op;
654 struct extent_buffer *leaf;
655 struct list_head *cur = insert_list->next;
656 struct btrfs_fs_info *info = extent_root->fs_info;
657 u64 ref_root = extent_root->root_key.objectid;
658 int i = 0, last = 0, ret;
659 int total = nr * 2;
660
661 if (!nr)
662 return 0;
663
664 keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
665 if (!keys)
666 return -ENOMEM;
667
668 data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
669 if (!data_size) {
670 kfree(keys);
671 return -ENOMEM;
672 }
673
674 list_for_each_entry(op, insert_list, list) {
675 keys[i].objectid = op->bytenr;
676 keys[i].offset = op->num_bytes;
677 keys[i].type = BTRFS_EXTENT_ITEM_KEY;
678 data_size[i] = sizeof(struct btrfs_extent_item);
679 i++;
680
681 keys[i].objectid = op->bytenr;
682 keys[i].offset = op->parent;
683 keys[i].type = BTRFS_EXTENT_REF_KEY;
684 data_size[i] = sizeof(struct btrfs_extent_ref);
685 i++;
686 }
687
688 op = list_entry(cur, struct pending_extent_op, list);
689 i = 0;
690 while (i < total) {
691 int c;
692 ret = btrfs_insert_some_items(trans, extent_root, path,
693 keys+i, data_size+i, total-i);
694 BUG_ON(ret < 0);
695
696 if (last && ret > 1)
697 BUG();
698
699 leaf = path->nodes[0];
700 for (c = 0; c < ret; c++) {
701 int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
702
703 /*
704 * if the first item we inserted was a backref, then
705 * the EXTENT_ITEM will be the odd c's, else it will
706 * be the even c's
707 */
708 if ((ref_first && (c % 2)) ||
709 (!ref_first && !(c % 2))) {
710 struct btrfs_extent_item *itm;
711
712 itm = btrfs_item_ptr(leaf, path->slots[0] + c,
713 struct btrfs_extent_item);
714 btrfs_set_extent_refs(path->nodes[0], itm, 1);
715 op->del++;
716 } else {
717 struct btrfs_extent_ref *ref;
718
719 ref = btrfs_item_ptr(leaf, path->slots[0] + c,
720 struct btrfs_extent_ref);
721 btrfs_set_ref_root(leaf, ref, ref_root);
722 btrfs_set_ref_generation(leaf, ref,
723 op->generation);
724 btrfs_set_ref_objectid(leaf, ref, op->level);
725 btrfs_set_ref_num_refs(leaf, ref, 1);
726 op->del++;
727 }
728
729 /*
730 * using del to see when its ok to free up the
731 * pending_extent_op. In the case where we insert the
732 * last item on the list in order to help do batching
733 * we need to not free the extent op until we actually
734 * insert the extent_item
735 */
736 if (op->del == 2) {
737 unlock_extent(&info->extent_ins, op->bytenr,
738 op->bytenr + op->num_bytes - 1,
739 GFP_NOFS);
740 cur = cur->next;
741 list_del_init(&op->list);
742 kfree(op);
743 if (cur != insert_list)
744 op = list_entry(cur,
745 struct pending_extent_op,
746 list);
747 }
748 }
749 btrfs_mark_buffer_dirty(leaf);
750 btrfs_release_path(extent_root, path);
751
752 /*
753 * Ok backref's and items usually go right next to eachother,
754 * but if we could only insert 1 item that means that we
755 * inserted on the end of a leaf, and we have no idea what may
756 * be on the next leaf so we just play it safe. In order to
757 * try and help this case we insert the last thing on our
758 * insert list so hopefully it will end up being the last
759 * thing on the leaf and everything else will be before it,
760 * which will let us insert a whole bunch of items at the same
761 * time.
762 */
763 if (ret == 1 && !last && (i + ret < total)) {
764 /*
765 * last: where we will pick up the next time around
766 * i: our current key to insert, will be total - 1
767 * cur: the current op we are screwing with
768 * op: duh
769 */
770 last = i + ret;
771 i = total - 1;
772 cur = insert_list->prev;
773 op = list_entry(cur, struct pending_extent_op, list);
774 } else if (last) {
775 /*
776 * ok we successfully inserted the last item on the
777 * list, lets reset everything
778 *
779 * i: our current key to insert, so where we left off
780 * last time
781 * last: done with this
782 * cur: the op we are messing with
783 * op: duh
784 * total: since we inserted the last key, we need to
785 * decrement total so we dont overflow
786 */
787 i = last;
788 last = 0;
789 total--;
790 if (i < total) {
791 cur = insert_list->next;
792 op = list_entry(cur, struct pending_extent_op,
793 list);
794 }
795 } else {
796 i += ret;
797 }
798
799 cond_resched();
800 }
801 ret = 0;
802 kfree(keys);
803 kfree(data_size);
804 return ret;
805}
806
807static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, 563static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
808 struct btrfs_root *root, 564 struct btrfs_root *root,
809 struct btrfs_path *path, 565 struct btrfs_path *path,
810 u64 bytenr, u64 parent, 566 u64 bytenr, u64 parent,
811 u64 ref_root, u64 ref_generation, 567 u64 ref_root, u64 ref_generation,
812 u64 owner_objectid) 568 u64 owner_objectid,
569 int refs_to_add)
813{ 570{
814 struct btrfs_key key; 571 struct btrfs_key key;
815 struct extent_buffer *leaf; 572 struct extent_buffer *leaf;
@@ -829,9 +586,10 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
829 btrfs_set_ref_root(leaf, ref, ref_root); 586 btrfs_set_ref_root(leaf, ref, ref_root);
830 btrfs_set_ref_generation(leaf, ref, ref_generation); 587 btrfs_set_ref_generation(leaf, ref, ref_generation);
831 btrfs_set_ref_objectid(leaf, ref, owner_objectid); 588 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
832 btrfs_set_ref_num_refs(leaf, ref, 1); 589 btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
833 } else if (ret == -EEXIST) { 590 } else if (ret == -EEXIST) {
834 u64 existing_owner; 591 u64 existing_owner;
592
835 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); 593 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
836 leaf = path->nodes[0]; 594 leaf = path->nodes[0];
837 ref = btrfs_item_ptr(leaf, path->slots[0], 595 ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -845,7 +603,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
845 603
846 num_refs = btrfs_ref_num_refs(leaf, ref); 604 num_refs = btrfs_ref_num_refs(leaf, ref);
847 BUG_ON(num_refs == 0); 605 BUG_ON(num_refs == 0);
848 btrfs_set_ref_num_refs(leaf, ref, num_refs + 1); 606 btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);
849 607
850 existing_owner = btrfs_ref_objectid(leaf, ref); 608 existing_owner = btrfs_ref_objectid(leaf, ref);
851 if (existing_owner != owner_objectid && 609 if (existing_owner != owner_objectid &&
@@ -857,6 +615,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
857 } else { 615 } else {
858 goto out; 616 goto out;
859 } 617 }
618 btrfs_unlock_up_safe(path, 1);
860 btrfs_mark_buffer_dirty(path->nodes[0]); 619 btrfs_mark_buffer_dirty(path->nodes[0]);
861out: 620out:
862 btrfs_release_path(root, path); 621 btrfs_release_path(root, path);
@@ -865,7 +624,8 @@ out:
865 624
866static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, 625static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
867 struct btrfs_root *root, 626 struct btrfs_root *root,
868 struct btrfs_path *path) 627 struct btrfs_path *path,
628 int refs_to_drop)
869{ 629{
870 struct extent_buffer *leaf; 630 struct extent_buffer *leaf;
871 struct btrfs_extent_ref *ref; 631 struct btrfs_extent_ref *ref;
@@ -875,8 +635,8 @@ static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
875 leaf = path->nodes[0]; 635 leaf = path->nodes[0];
876 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); 636 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
877 num_refs = btrfs_ref_num_refs(leaf, ref); 637 num_refs = btrfs_ref_num_refs(leaf, ref);
878 BUG_ON(num_refs == 0); 638 BUG_ON(num_refs < refs_to_drop);
879 num_refs -= 1; 639 num_refs -= refs_to_drop;
880 if (num_refs == 0) { 640 if (num_refs == 0) {
881 ret = btrfs_del_item(trans, root, path); 641 ret = btrfs_del_item(trans, root, path);
882 } else { 642 } else {
@@ -927,332 +687,28 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
927#endif 687#endif
928} 688}
929 689
930static noinline int free_extents(struct btrfs_trans_handle *trans,
931 struct btrfs_root *extent_root,
932 struct list_head *del_list)
933{
934 struct btrfs_fs_info *info = extent_root->fs_info;
935 struct btrfs_path *path;
936 struct btrfs_key key, found_key;
937 struct extent_buffer *leaf;
938 struct list_head *cur;
939 struct pending_extent_op *op;
940 struct btrfs_extent_item *ei;
941 int ret, num_to_del, extent_slot = 0, found_extent = 0;
942 u32 refs;
943 u64 bytes_freed = 0;
944
945 path = btrfs_alloc_path();
946 if (!path)
947 return -ENOMEM;
948 path->reada = 1;
949
950search:
951 /* search for the backref for the current ref we want to delete */
952 cur = del_list->next;
953 op = list_entry(cur, struct pending_extent_op, list);
954 ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
955 op->orig_parent,
956 extent_root->root_key.objectid,
957 op->orig_generation, op->level, 1);
958 if (ret) {
959 printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
960 "root %llu gen %llu owner %u\n",
961 (unsigned long long)op->bytenr,
962 (unsigned long long)extent_root->root_key.objectid,
963 (unsigned long long)op->orig_generation, op->level);
964 btrfs_print_leaf(extent_root, path->nodes[0]);
965 WARN_ON(1);
966 goto out;
967 }
968
969 extent_slot = path->slots[0];
970 num_to_del = 1;
971 found_extent = 0;
972
973 /*
974 * if we aren't the first item on the leaf we can move back one and see
975 * if our ref is right next to our extent item
976 */
977 if (likely(extent_slot)) {
978 extent_slot--;
979 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
980 extent_slot);
981 if (found_key.objectid == op->bytenr &&
982 found_key.type == BTRFS_EXTENT_ITEM_KEY &&
983 found_key.offset == op->num_bytes) {
984 num_to_del++;
985 found_extent = 1;
986 }
987 }
988
989 /*
990 * if we didn't find the extent we need to delete the backref and then
991 * search for the extent item key so we can update its ref count
992 */
993 if (!found_extent) {
994 key.objectid = op->bytenr;
995 key.type = BTRFS_EXTENT_ITEM_KEY;
996 key.offset = op->num_bytes;
997
998 ret = remove_extent_backref(trans, extent_root, path);
999 BUG_ON(ret);
1000 btrfs_release_path(extent_root, path);
1001 ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
1002 BUG_ON(ret);
1003 extent_slot = path->slots[0];
1004 }
1005
1006 /* this is where we update the ref count for the extent */
1007 leaf = path->nodes[0];
1008 ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
1009 refs = btrfs_extent_refs(leaf, ei);
1010 BUG_ON(refs == 0);
1011 refs--;
1012 btrfs_set_extent_refs(leaf, ei, refs);
1013
1014 btrfs_mark_buffer_dirty(leaf);
1015
1016 /*
1017 * This extent needs deleting. The reason cur_slot is extent_slot +
1018 * num_to_del is because extent_slot points to the slot where the extent
1019 * is, and if the backref was not right next to the extent we will be
1020 * deleting at least 1 item, and will want to start searching at the
1021 * slot directly next to extent_slot. However if we did find the
1022 * backref next to the extent item them we will be deleting at least 2
1023 * items and will want to start searching directly after the ref slot
1024 */
1025 if (!refs) {
1026 struct list_head *pos, *n, *end;
1027 int cur_slot = extent_slot+num_to_del;
1028 u64 super_used;
1029 u64 root_used;
1030
1031 path->slots[0] = extent_slot;
1032 bytes_freed = op->num_bytes;
1033
1034 mutex_lock(&info->pinned_mutex);
1035 ret = pin_down_bytes(trans, extent_root, op->bytenr,
1036 op->num_bytes, op->level >=
1037 BTRFS_FIRST_FREE_OBJECTID);
1038 mutex_unlock(&info->pinned_mutex);
1039 BUG_ON(ret < 0);
1040 op->del = ret;
1041
1042 /*
1043 * we need to see if we can delete multiple things at once, so
1044 * start looping through the list of extents we are wanting to
1045 * delete and see if their extent/backref's are right next to
1046 * eachother and the extents only have 1 ref
1047 */
1048 for (pos = cur->next; pos != del_list; pos = pos->next) {
1049 struct pending_extent_op *tmp;
1050
1051 tmp = list_entry(pos, struct pending_extent_op, list);
1052
1053 /* we only want to delete extent+ref at this stage */
1054 if (cur_slot >= btrfs_header_nritems(leaf) - 1)
1055 break;
1056
1057 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
1058 if (found_key.objectid != tmp->bytenr ||
1059 found_key.type != BTRFS_EXTENT_ITEM_KEY ||
1060 found_key.offset != tmp->num_bytes)
1061 break;
1062
1063 /* check to make sure this extent only has one ref */
1064 ei = btrfs_item_ptr(leaf, cur_slot,
1065 struct btrfs_extent_item);
1066 if (btrfs_extent_refs(leaf, ei) != 1)
1067 break;
1068
1069 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
1070 if (found_key.objectid != tmp->bytenr ||
1071 found_key.type != BTRFS_EXTENT_REF_KEY ||
1072 found_key.offset != tmp->orig_parent)
1073 break;
1074
1075 /*
1076 * the ref is right next to the extent, we can set the
1077 * ref count to 0 since we will delete them both now
1078 */
1079 btrfs_set_extent_refs(leaf, ei, 0);
1080
1081 /* pin down the bytes for this extent */
1082 mutex_lock(&info->pinned_mutex);
1083 ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
1084 tmp->num_bytes, tmp->level >=
1085 BTRFS_FIRST_FREE_OBJECTID);
1086 mutex_unlock(&info->pinned_mutex);
1087 BUG_ON(ret < 0);
1088
1089 /*
1090 * use the del field to tell if we need to go ahead and
1091 * free up the extent when we delete the item or not.
1092 */
1093 tmp->del = ret;
1094 bytes_freed += tmp->num_bytes;
1095
1096 num_to_del += 2;
1097 cur_slot += 2;
1098 }
1099 end = pos;
1100
1101 /* update the free space counters */
1102 spin_lock(&info->delalloc_lock);
1103 super_used = btrfs_super_bytes_used(&info->super_copy);
1104 btrfs_set_super_bytes_used(&info->super_copy,
1105 super_used - bytes_freed);
1106
1107 root_used = btrfs_root_used(&extent_root->root_item);
1108 btrfs_set_root_used(&extent_root->root_item,
1109 root_used - bytes_freed);
1110 spin_unlock(&info->delalloc_lock);
1111
1112 /* delete the items */
1113 ret = btrfs_del_items(trans, extent_root, path,
1114 path->slots[0], num_to_del);
1115 BUG_ON(ret);
1116
1117 /*
1118 * loop through the extents we deleted and do the cleanup work
1119 * on them
1120 */
1121 for (pos = cur, n = pos->next; pos != end;
1122 pos = n, n = pos->next) {
1123 struct pending_extent_op *tmp;
1124 tmp = list_entry(pos, struct pending_extent_op, list);
1125
1126 /*
1127 * remember tmp->del tells us wether or not we pinned
1128 * down the extent
1129 */
1130 ret = update_block_group(trans, extent_root,
1131 tmp->bytenr, tmp->num_bytes, 0,
1132 tmp->del);
1133 BUG_ON(ret);
1134
1135 list_del_init(&tmp->list);
1136 unlock_extent(&info->extent_ins, tmp->bytenr,
1137 tmp->bytenr + tmp->num_bytes - 1,
1138 GFP_NOFS);
1139 kfree(tmp);
1140 }
1141 } else if (refs && found_extent) {
1142 /*
1143 * the ref and extent were right next to eachother, but the
1144 * extent still has a ref, so just free the backref and keep
1145 * going
1146 */
1147 ret = remove_extent_backref(trans, extent_root, path);
1148 BUG_ON(ret);
1149
1150 list_del_init(&op->list);
1151 unlock_extent(&info->extent_ins, op->bytenr,
1152 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1153 kfree(op);
1154 } else {
1155 /*
1156 * the extent has multiple refs and the backref we were looking
1157 * for was not right next to it, so just unlock and go next,
1158 * we're good to go
1159 */
1160 list_del_init(&op->list);
1161 unlock_extent(&info->extent_ins, op->bytenr,
1162 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1163 kfree(op);
1164 }
1165
1166 btrfs_release_path(extent_root, path);
1167 if (!list_empty(del_list))
1168 goto search;
1169
1170out:
1171 btrfs_free_path(path);
1172 return ret;
1173}
1174
1175static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 690static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1176 struct btrfs_root *root, u64 bytenr, 691 struct btrfs_root *root, u64 bytenr,
692 u64 num_bytes,
1177 u64 orig_parent, u64 parent, 693 u64 orig_parent, u64 parent,
1178 u64 orig_root, u64 ref_root, 694 u64 orig_root, u64 ref_root,
1179 u64 orig_generation, u64 ref_generation, 695 u64 orig_generation, u64 ref_generation,
1180 u64 owner_objectid) 696 u64 owner_objectid)
1181{ 697{
1182 int ret; 698 int ret;
1183 struct btrfs_root *extent_root = root->fs_info->extent_root; 699 int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;
1184 struct btrfs_path *path;
1185
1186 if (root == root->fs_info->extent_root) {
1187 struct pending_extent_op *extent_op;
1188 u64 num_bytes;
1189
1190 BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
1191 num_bytes = btrfs_level_size(root, (int)owner_objectid);
1192 mutex_lock(&root->fs_info->extent_ins_mutex);
1193 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
1194 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
1195 u64 priv;
1196 ret = get_state_private(&root->fs_info->extent_ins,
1197 bytenr, &priv);
1198 BUG_ON(ret);
1199 extent_op = (struct pending_extent_op *)
1200 (unsigned long)priv;
1201 BUG_ON(extent_op->parent != orig_parent);
1202 BUG_ON(extent_op->generation != orig_generation);
1203 700
1204 extent_op->parent = parent; 701 ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
1205 extent_op->generation = ref_generation; 702 orig_parent, parent, orig_root,
1206 } else { 703 ref_root, orig_generation,
1207 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 704 ref_generation, owner_objectid, pin);
1208 BUG_ON(!extent_op);
1209
1210 extent_op->type = PENDING_BACKREF_UPDATE;
1211 extent_op->bytenr = bytenr;
1212 extent_op->num_bytes = num_bytes;
1213 extent_op->parent = parent;
1214 extent_op->orig_parent = orig_parent;
1215 extent_op->generation = ref_generation;
1216 extent_op->orig_generation = orig_generation;
1217 extent_op->level = (int)owner_objectid;
1218 INIT_LIST_HEAD(&extent_op->list);
1219 extent_op->del = 0;
1220
1221 set_extent_bits(&root->fs_info->extent_ins,
1222 bytenr, bytenr + num_bytes - 1,
1223 EXTENT_WRITEBACK, GFP_NOFS);
1224 set_state_private(&root->fs_info->extent_ins,
1225 bytenr, (unsigned long)extent_op);
1226 }
1227 mutex_unlock(&root->fs_info->extent_ins_mutex);
1228 return 0;
1229 }
1230
1231 path = btrfs_alloc_path();
1232 if (!path)
1233 return -ENOMEM;
1234 ret = lookup_extent_backref(trans, extent_root, path,
1235 bytenr, orig_parent, orig_root,
1236 orig_generation, owner_objectid, 1);
1237 if (ret)
1238 goto out;
1239 ret = remove_extent_backref(trans, extent_root, path);
1240 if (ret)
1241 goto out;
1242 ret = insert_extent_backref(trans, extent_root, path, bytenr,
1243 parent, ref_root, ref_generation,
1244 owner_objectid);
1245 BUG_ON(ret); 705 BUG_ON(ret);
1246 finish_current_insert(trans, extent_root, 0);
1247 del_pending_extents(trans, extent_root, 0);
1248out:
1249 btrfs_free_path(path);
1250 return ret; 706 return ret;
1251} 707}
1252 708
1253int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 709int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1254 struct btrfs_root *root, u64 bytenr, 710 struct btrfs_root *root, u64 bytenr,
1255 u64 orig_parent, u64 parent, 711 u64 num_bytes, u64 orig_parent, u64 parent,
1256 u64 ref_root, u64 ref_generation, 712 u64 ref_root, u64 ref_generation,
1257 u64 owner_objectid) 713 u64 owner_objectid)
1258{ 714{
@@ -1260,20 +716,36 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1260 if (ref_root == BTRFS_TREE_LOG_OBJECTID && 716 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1261 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) 717 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1262 return 0; 718 return 0;
1263 ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent, 719
1264 parent, ref_root, ref_root, 720 ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
1265 ref_generation, ref_generation, 721 orig_parent, parent, ref_root,
1266 owner_objectid); 722 ref_root, ref_generation,
723 ref_generation, owner_objectid);
1267 return ret; 724 return ret;
1268} 725}
1269
1270static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 726static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1271 struct btrfs_root *root, u64 bytenr, 727 struct btrfs_root *root, u64 bytenr,
728 u64 num_bytes,
1272 u64 orig_parent, u64 parent, 729 u64 orig_parent, u64 parent,
1273 u64 orig_root, u64 ref_root, 730 u64 orig_root, u64 ref_root,
1274 u64 orig_generation, u64 ref_generation, 731 u64 orig_generation, u64 ref_generation,
1275 u64 owner_objectid) 732 u64 owner_objectid)
1276{ 733{
734 int ret;
735
736 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
737 ref_generation, owner_objectid,
738 BTRFS_ADD_DELAYED_REF, 0);
739 BUG_ON(ret);
740 return ret;
741}
742
743static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
744 struct btrfs_root *root, u64 bytenr,
745 u64 num_bytes, u64 parent, u64 ref_root,
746 u64 ref_generation, u64 owner_objectid,
747 int refs_to_add)
748{
1277 struct btrfs_path *path; 749 struct btrfs_path *path;
1278 int ret; 750 int ret;
1279 struct btrfs_key key; 751 struct btrfs_key key;
@@ -1286,17 +758,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1286 return -ENOMEM; 758 return -ENOMEM;
1287 759
1288 path->reada = 1; 760 path->reada = 1;
761 path->leave_spinning = 1;
1289 key.objectid = bytenr; 762 key.objectid = bytenr;
1290 key.type = BTRFS_EXTENT_ITEM_KEY; 763 key.type = BTRFS_EXTENT_ITEM_KEY;
1291 key.offset = (u64)-1; 764 key.offset = num_bytes;
1292 765
1293 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 766 /* first find the extent item and update its reference count */
1294 0, 1); 767 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
1295 if (ret < 0) 768 path, 0, 1);
769 if (ret < 0) {
770 btrfs_set_path_blocking(path);
1296 return ret; 771 return ret;
1297 BUG_ON(ret == 0 || path->slots[0] == 0); 772 }
1298 773
1299 path->slots[0]--; 774 if (ret > 0) {
775 WARN_ON(1);
776 btrfs_free_path(path);
777 return -EIO;
778 }
1300 l = path->nodes[0]; 779 l = path->nodes[0];
1301 780
1302 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 781 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
@@ -1310,21 +789,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1310 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); 789 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
1311 790
1312 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); 791 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
792
1313 refs = btrfs_extent_refs(l, item); 793 refs = btrfs_extent_refs(l, item);
1314 btrfs_set_extent_refs(l, item, refs + 1); 794 btrfs_set_extent_refs(l, item, refs + refs_to_add);
795 btrfs_unlock_up_safe(path, 1);
796
1315 btrfs_mark_buffer_dirty(path->nodes[0]); 797 btrfs_mark_buffer_dirty(path->nodes[0]);
1316 798
1317 btrfs_release_path(root->fs_info->extent_root, path); 799 btrfs_release_path(root->fs_info->extent_root, path);
1318 800
1319 path->reada = 1; 801 path->reada = 1;
802 path->leave_spinning = 1;
803
804 /* now insert the actual backref */
1320 ret = insert_extent_backref(trans, root->fs_info->extent_root, 805 ret = insert_extent_backref(trans, root->fs_info->extent_root,
1321 path, bytenr, parent, 806 path, bytenr, parent,
1322 ref_root, ref_generation, 807 ref_root, ref_generation,
1323 owner_objectid); 808 owner_objectid, refs_to_add);
1324 BUG_ON(ret); 809 BUG_ON(ret);
1325 finish_current_insert(trans, root->fs_info->extent_root, 0);
1326 del_pending_extents(trans, root->fs_info->extent_root, 0);
1327
1328 btrfs_free_path(path); 810 btrfs_free_path(path);
1329 return 0; 811 return 0;
1330} 812}
@@ -1339,68 +821,278 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1339 if (ref_root == BTRFS_TREE_LOG_OBJECTID && 821 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1340 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) 822 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1341 return 0; 823 return 0;
1342 ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent, 824
825 ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
1343 0, ref_root, 0, ref_generation, 826 0, ref_root, 0, ref_generation,
1344 owner_objectid); 827 owner_objectid);
1345 return ret; 828 return ret;
1346} 829}
1347 830
1348int btrfs_extent_post_op(struct btrfs_trans_handle *trans, 831static int drop_delayed_ref(struct btrfs_trans_handle *trans,
1349 struct btrfs_root *root) 832 struct btrfs_root *root,
833 struct btrfs_delayed_ref_node *node)
834{
835 int ret = 0;
836 struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);
837
838 BUG_ON(node->ref_mod == 0);
839 ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
840 node->parent, ref->root, ref->generation,
841 ref->owner_objectid, ref->pin, node->ref_mod);
842
843 return ret;
844}
845
846/* helper function to actually process a single delayed ref entry */
847static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
848 struct btrfs_root *root,
849 struct btrfs_delayed_ref_node *node,
850 int insert_reserved)
1350{ 851{
1351 u64 start;
1352 u64 end;
1353 int ret; 852 int ret;
853 struct btrfs_delayed_ref *ref;
854
855 if (node->parent == (u64)-1) {
856 struct btrfs_delayed_ref_head *head;
857 /*
858 * we've hit the end of the chain and we were supposed
859 * to insert this extent into the tree. But, it got
860 * deleted before we ever needed to insert it, so all
861 * we have to do is clean up the accounting
862 */
863 if (insert_reserved) {
864 update_reserved_extents(root, node->bytenr,
865 node->num_bytes, 0);
866 }
867 head = btrfs_delayed_node_to_head(node);
868 mutex_unlock(&head->mutex);
869 return 0;
870 }
1354 871
1355 while(1) { 872 ref = btrfs_delayed_node_to_ref(node);
1356 finish_current_insert(trans, root->fs_info->extent_root, 1); 873 if (ref->action == BTRFS_ADD_DELAYED_REF) {
1357 del_pending_extents(trans, root->fs_info->extent_root, 1); 874 if (insert_reserved) {
875 struct btrfs_key ins;
1358 876
1359 /* is there more work to do? */ 877 ins.objectid = node->bytenr;
1360 ret = find_first_extent_bit(&root->fs_info->pending_del, 878 ins.offset = node->num_bytes;
1361 0, &start, &end, EXTENT_WRITEBACK); 879 ins.type = BTRFS_EXTENT_ITEM_KEY;
1362 if (!ret) 880
1363 continue; 881 /* record the full extent allocation */
1364 ret = find_first_extent_bit(&root->fs_info->extent_ins, 882 ret = __btrfs_alloc_reserved_extent(trans, root,
1365 0, &start, &end, EXTENT_WRITEBACK); 883 node->parent, ref->root,
1366 if (!ret) 884 ref->generation, ref->owner_objectid,
1367 continue; 885 &ins, node->ref_mod);
1368 break; 886 update_reserved_extents(root, node->bytenr,
887 node->num_bytes, 0);
888 } else {
889 /* just add one backref */
890 ret = add_extent_ref(trans, root, node->bytenr,
891 node->num_bytes,
892 node->parent, ref->root, ref->generation,
893 ref->owner_objectid, node->ref_mod);
894 }
895 BUG_ON(ret);
896 } else if (ref->action == BTRFS_DROP_DELAYED_REF) {
897 WARN_ON(insert_reserved);
898 ret = drop_delayed_ref(trans, root, node);
1369 } 899 }
1370 return 0; 900 return 0;
1371} 901}
1372 902
1373int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, 903static noinline struct btrfs_delayed_ref_node *
1374 struct btrfs_root *root, u64 bytenr, 904select_delayed_ref(struct btrfs_delayed_ref_head *head)
1375 u64 num_bytes, u32 *refs)
1376{ 905{
1377 struct btrfs_path *path; 906 struct rb_node *node;
907 struct btrfs_delayed_ref_node *ref;
908 int action = BTRFS_ADD_DELAYED_REF;
909again:
910 /*
911 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
912 * this prevents ref count from going down to zero when
913 * there still are pending delayed ref.
914 */
915 node = rb_prev(&head->node.rb_node);
916 while (1) {
917 if (!node)
918 break;
919 ref = rb_entry(node, struct btrfs_delayed_ref_node,
920 rb_node);
921 if (ref->bytenr != head->node.bytenr)
922 break;
923 if (btrfs_delayed_node_to_ref(ref)->action == action)
924 return ref;
925 node = rb_prev(node);
926 }
927 if (action == BTRFS_ADD_DELAYED_REF) {
928 action = BTRFS_DROP_DELAYED_REF;
929 goto again;
930 }
931 return NULL;
932}
933
934static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
935 struct btrfs_root *root,
936 struct list_head *cluster)
937{
938 struct btrfs_delayed_ref_root *delayed_refs;
939 struct btrfs_delayed_ref_node *ref;
940 struct btrfs_delayed_ref_head *locked_ref = NULL;
1378 int ret; 941 int ret;
1379 struct btrfs_key key; 942 int count = 0;
1380 struct extent_buffer *l; 943 int must_insert_reserved = 0;
1381 struct btrfs_extent_item *item;
1382 944
1383 WARN_ON(num_bytes < root->sectorsize); 945 delayed_refs = &trans->transaction->delayed_refs;
1384 path = btrfs_alloc_path(); 946 while (1) {
1385 path->reada = 1; 947 if (!locked_ref) {
1386 key.objectid = bytenr; 948 /* pick a new head ref from the cluster list */
1387 key.offset = num_bytes; 949 if (list_empty(cluster))
1388 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); 950 break;
1389 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 951
1390 0, 0); 952 locked_ref = list_entry(cluster->next,
1391 if (ret < 0) 953 struct btrfs_delayed_ref_head, cluster);
1392 goto out; 954
1393 if (ret != 0) { 955 /* grab the lock that says we are going to process
1394 btrfs_print_leaf(root, path->nodes[0]); 956 * all the refs for this head */
1395 printk(KERN_INFO "btrfs failed to find block number %llu\n", 957 ret = btrfs_delayed_ref_lock(trans, locked_ref);
1396 (unsigned long long)bytenr); 958
1397 BUG(); 959 /*
960 * we may have dropped the spin lock to get the head
961 * mutex lock, and that might have given someone else
962 * time to free the head. If that's true, it has been
963 * removed from our list and we can move on.
964 */
965 if (ret == -EAGAIN) {
966 locked_ref = NULL;
967 count++;
968 continue;
969 }
970 }
971
972 /*
973 * record the must insert reserved flag before we
974 * drop the spin lock.
975 */
976 must_insert_reserved = locked_ref->must_insert_reserved;
977 locked_ref->must_insert_reserved = 0;
978
979 /*
980 * locked_ref is the head node, so we have to go one
981 * node back for any delayed ref updates
982 */
983 ref = select_delayed_ref(locked_ref);
984 if (!ref) {
985 /* All delayed refs have been processed, Go ahead
986 * and send the head node to run_one_delayed_ref,
987 * so that any accounting fixes can happen
988 */
989 ref = &locked_ref->node;
990 list_del_init(&locked_ref->cluster);
991 locked_ref = NULL;
992 }
993
994 ref->in_tree = 0;
995 rb_erase(&ref->rb_node, &delayed_refs->root);
996 delayed_refs->num_entries--;
997 spin_unlock(&delayed_refs->lock);
998
999 ret = run_one_delayed_ref(trans, root, ref,
1000 must_insert_reserved);
1001 BUG_ON(ret);
1002 btrfs_put_delayed_ref(ref);
1003
1004 count++;
1005 cond_resched();
1006 spin_lock(&delayed_refs->lock);
1007 }
1008 return count;
1009}
1010
1011/*
1012 * this starts processing the delayed reference count updates and
1013 * extent insertions we have queued up so far. count can be
1014 * 0, which means to process everything in the tree at the start
1015 * of the run (but not newly added entries), or it can be some target
1016 * number you'd like to process.
1017 */
1018int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1019 struct btrfs_root *root, unsigned long count)
1020{
1021 struct rb_node *node;
1022 struct btrfs_delayed_ref_root *delayed_refs;
1023 struct btrfs_delayed_ref_node *ref;
1024 struct list_head cluster;
1025 int ret;
1026 int run_all = count == (unsigned long)-1;
1027 int run_most = 0;
1028
1029 if (root == root->fs_info->extent_root)
1030 root = root->fs_info->tree_root;
1031
1032 delayed_refs = &trans->transaction->delayed_refs;
1033 INIT_LIST_HEAD(&cluster);
1034again:
1035 spin_lock(&delayed_refs->lock);
1036 if (count == 0) {
1037 count = delayed_refs->num_entries * 2;
1038 run_most = 1;
1039 }
1040 while (1) {
1041 if (!(run_all || run_most) &&
1042 delayed_refs->num_heads_ready < 64)
1043 break;
1044
1045 /*
1046 * go find something we can process in the rbtree. We start at
1047 * the beginning of the tree, and then build a cluster
1048 * of refs to process starting at the first one we are able to
1049 * lock
1050 */
1051 ret = btrfs_find_ref_cluster(trans, &cluster,
1052 delayed_refs->run_delayed_start);
1053 if (ret)
1054 break;
1055
1056 ret = run_clustered_refs(trans, root, &cluster);
1057 BUG_ON(ret < 0);
1058
1059 count -= min_t(unsigned long, ret, count);
1060
1061 if (count == 0)
1062 break;
1063 }
1064
1065 if (run_all) {
1066 node = rb_first(&delayed_refs->root);
1067 if (!node)
1068 goto out;
1069 count = (unsigned long)-1;
1070
1071 while (node) {
1072 ref = rb_entry(node, struct btrfs_delayed_ref_node,
1073 rb_node);
1074 if (btrfs_delayed_ref_is_head(ref)) {
1075 struct btrfs_delayed_ref_head *head;
1076
1077 head = btrfs_delayed_node_to_head(ref);
1078 atomic_inc(&ref->refs);
1079
1080 spin_unlock(&delayed_refs->lock);
1081 mutex_lock(&head->mutex);
1082 mutex_unlock(&head->mutex);
1083
1084 btrfs_put_delayed_ref(ref);
1085 cond_resched();
1086 goto again;
1087 }
1088 node = rb_next(node);
1089 }
1090 spin_unlock(&delayed_refs->lock);
1091 schedule_timeout(1);
1092 goto again;
1398 } 1093 }
1399 l = path->nodes[0];
1400 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1401 *refs = btrfs_extent_refs(l, item);
1402out: 1094out:
1403 btrfs_free_path(path); 1095 spin_unlock(&delayed_refs->lock);
1404 return 0; 1096 return 0;
1405} 1097}
1406 1098
@@ -1624,7 +1316,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1624 int refi = 0; 1316 int refi = 0;
1625 int slot; 1317 int slot;
1626 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 1318 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1627 u64, u64, u64, u64, u64, u64, u64, u64); 1319 u64, u64, u64, u64, u64, u64, u64, u64, u64);
1628 1320
1629 ref_root = btrfs_header_owner(buf); 1321 ref_root = btrfs_header_owner(buf);
1630 ref_generation = btrfs_header_generation(buf); 1322 ref_generation = btrfs_header_generation(buf);
@@ -1696,12 +1388,19 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1696 1388
1697 if (level == 0) { 1389 if (level == 0) {
1698 btrfs_item_key_to_cpu(buf, &key, slot); 1390 btrfs_item_key_to_cpu(buf, &key, slot);
1391 fi = btrfs_item_ptr(buf, slot,
1392 struct btrfs_file_extent_item);
1393
1394 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1395 if (bytenr == 0)
1396 continue;
1699 1397
1700 ret = process_func(trans, root, bytenr, 1398 ret = process_func(trans, root, bytenr,
1701 orig_buf->start, buf->start, 1399 btrfs_file_extent_disk_num_bytes(buf, fi),
1702 orig_root, ref_root, 1400 orig_buf->start, buf->start,
1703 orig_generation, ref_generation, 1401 orig_root, ref_root,
1704 key.objectid); 1402 orig_generation, ref_generation,
1403 key.objectid);
1705 1404
1706 if (ret) { 1405 if (ret) {
1707 faili = slot; 1406 faili = slot;
@@ -1709,7 +1408,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1709 goto fail; 1408 goto fail;
1710 } 1409 }
1711 } else { 1410 } else {
1712 ret = process_func(trans, root, bytenr, 1411 ret = process_func(trans, root, bytenr, buf->len,
1713 orig_buf->start, buf->start, 1412 orig_buf->start, buf->start,
1714 orig_root, ref_root, 1413 orig_root, ref_root,
1715 orig_generation, ref_generation, 1414 orig_generation, ref_generation,
@@ -1786,17 +1485,17 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
1786 if (bytenr == 0) 1485 if (bytenr == 0)
1787 continue; 1486 continue;
1788 ret = __btrfs_update_extent_ref(trans, root, bytenr, 1487 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1789 orig_buf->start, buf->start, 1488 btrfs_file_extent_disk_num_bytes(buf, fi),
1790 orig_root, ref_root, 1489 orig_buf->start, buf->start,
1791 orig_generation, ref_generation, 1490 orig_root, ref_root, orig_generation,
1792 key.objectid); 1491 ref_generation, key.objectid);
1793 if (ret) 1492 if (ret)
1794 goto fail; 1493 goto fail;
1795 } else { 1494 } else {
1796 bytenr = btrfs_node_blockptr(buf, slot); 1495 bytenr = btrfs_node_blockptr(buf, slot);
1797 ret = __btrfs_update_extent_ref(trans, root, bytenr, 1496 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1798 orig_buf->start, buf->start, 1497 buf->len, orig_buf->start,
1799 orig_root, ref_root, 1498 buf->start, orig_root, ref_root,
1800 orig_generation, ref_generation, 1499 orig_generation, ref_generation,
1801 level - 1); 1500 level - 1);
1802 if (ret) 1501 if (ret)
@@ -1815,7 +1514,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
1815 struct btrfs_block_group_cache *cache) 1514 struct btrfs_block_group_cache *cache)
1816{ 1515{
1817 int ret; 1516 int ret;
1818 int pending_ret;
1819 struct btrfs_root *extent_root = root->fs_info->extent_root; 1517 struct btrfs_root *extent_root = root->fs_info->extent_root;
1820 unsigned long bi; 1518 unsigned long bi;
1821 struct extent_buffer *leaf; 1519 struct extent_buffer *leaf;
@@ -1831,12 +1529,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
1831 btrfs_mark_buffer_dirty(leaf); 1529 btrfs_mark_buffer_dirty(leaf);
1832 btrfs_release_path(extent_root, path); 1530 btrfs_release_path(extent_root, path);
1833fail: 1531fail:
1834 finish_current_insert(trans, extent_root, 0);
1835 pending_ret = del_pending_extents(trans, extent_root, 0);
1836 if (ret) 1532 if (ret)
1837 return ret; 1533 return ret;
1838 if (pending_ret)
1839 return pending_ret;
1840 return 0; 1534 return 0;
1841 1535
1842} 1536}
@@ -2361,6 +2055,8 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2361 clear_extent_dirty(&fs_info->pinned_extents, 2055 clear_extent_dirty(&fs_info->pinned_extents,
2362 bytenr, bytenr + num - 1, GFP_NOFS); 2056 bytenr, bytenr + num - 1, GFP_NOFS);
2363 } 2057 }
2058 mutex_unlock(&root->fs_info->pinned_mutex);
2059
2364 while (num > 0) { 2060 while (num > 0) {
2365 cache = btrfs_lookup_block_group(fs_info, bytenr); 2061 cache = btrfs_lookup_block_group(fs_info, bytenr);
2366 BUG_ON(!cache); 2062 BUG_ON(!cache);
@@ -2452,8 +2148,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2452 u64 end; 2148 u64 end;
2453 int ret; 2149 int ret;
2454 2150
2455 mutex_lock(&root->fs_info->pinned_mutex);
2456 while (1) { 2151 while (1) {
2152 mutex_lock(&root->fs_info->pinned_mutex);
2457 ret = find_first_extent_bit(unpin, 0, &start, &end, 2153 ret = find_first_extent_bit(unpin, 0, &start, &end,
2458 EXTENT_DIRTY); 2154 EXTENT_DIRTY);
2459 if (ret) 2155 if (ret)
@@ -2461,209 +2157,21 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2461 2157
2462 ret = btrfs_discard_extent(root, start, end + 1 - start); 2158 ret = btrfs_discard_extent(root, start, end + 1 - start);
2463 2159
2160 /* unlocks the pinned mutex */
2464 btrfs_update_pinned_extents(root, start, end + 1 - start, 0); 2161 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2465 clear_extent_dirty(unpin, start, end, GFP_NOFS); 2162 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2466 2163
2467 if (need_resched()) { 2164 cond_resched();
2468 mutex_unlock(&root->fs_info->pinned_mutex);
2469 cond_resched();
2470 mutex_lock(&root->fs_info->pinned_mutex);
2471 }
2472 } 2165 }
2473 mutex_unlock(&root->fs_info->pinned_mutex); 2166 mutex_unlock(&root->fs_info->pinned_mutex);
2474 return ret; 2167 return ret;
2475} 2168}
2476 2169
2477static int finish_current_insert(struct btrfs_trans_handle *trans,
2478 struct btrfs_root *extent_root, int all)
2479{
2480 u64 start;
2481 u64 end;
2482 u64 priv;
2483 u64 search = 0;
2484 struct btrfs_fs_info *info = extent_root->fs_info;
2485 struct btrfs_path *path;
2486 struct pending_extent_op *extent_op, *tmp;
2487 struct list_head insert_list, update_list;
2488 int ret;
2489 int num_inserts = 0, max_inserts, restart = 0;
2490
2491 path = btrfs_alloc_path();
2492 INIT_LIST_HEAD(&insert_list);
2493 INIT_LIST_HEAD(&update_list);
2494
2495 max_inserts = extent_root->leafsize /
2496 (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
2497 sizeof(struct btrfs_extent_ref) +
2498 sizeof(struct btrfs_extent_item));
2499again:
2500 mutex_lock(&info->extent_ins_mutex);
2501 while (1) {
2502 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2503 &end, EXTENT_WRITEBACK);
2504 if (ret) {
2505 if (restart && !num_inserts &&
2506 list_empty(&update_list)) {
2507 restart = 0;
2508 search = 0;
2509 continue;
2510 }
2511 break;
2512 }
2513
2514 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2515 if (!ret) {
2516 if (all)
2517 restart = 1;
2518 search = end + 1;
2519 if (need_resched()) {
2520 mutex_unlock(&info->extent_ins_mutex);
2521 cond_resched();
2522 mutex_lock(&info->extent_ins_mutex);
2523 }
2524 continue;
2525 }
2526
2527 ret = get_state_private(&info->extent_ins, start, &priv);
2528 BUG_ON(ret);
2529 extent_op = (struct pending_extent_op *)(unsigned long) priv;
2530
2531 if (extent_op->type == PENDING_EXTENT_INSERT) {
2532 num_inserts++;
2533 list_add_tail(&extent_op->list, &insert_list);
2534 search = end + 1;
2535 if (num_inserts == max_inserts) {
2536 restart = 1;
2537 break;
2538 }
2539 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
2540 list_add_tail(&extent_op->list, &update_list);
2541 search = end + 1;
2542 } else {
2543 BUG();
2544 }
2545 }
2546
2547 /*
2548 * process the update list, clear the writeback bit for it, and if
2549 * somebody marked this thing for deletion then just unlock it and be
2550 * done, the free_extents will handle it
2551 */
2552 list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2553 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2554 extent_op->bytenr + extent_op->num_bytes - 1,
2555 EXTENT_WRITEBACK, GFP_NOFS);
2556 if (extent_op->del) {
2557 list_del_init(&extent_op->list);
2558 unlock_extent(&info->extent_ins, extent_op->bytenr,
2559 extent_op->bytenr + extent_op->num_bytes
2560 - 1, GFP_NOFS);
2561 kfree(extent_op);
2562 }
2563 }
2564 mutex_unlock(&info->extent_ins_mutex);
2565
2566 /*
2567 * still have things left on the update list, go ahead an update
2568 * everything
2569 */
2570 if (!list_empty(&update_list)) {
2571 ret = update_backrefs(trans, extent_root, path, &update_list);
2572 BUG_ON(ret);
2573
2574 /* we may have COW'ed new blocks, so lets start over */
2575 if (all)
2576 restart = 1;
2577 }
2578
2579 /*
2580 * if no inserts need to be done, but we skipped some extents and we
2581 * need to make sure everything is cleaned then reset everything and
2582 * go back to the beginning
2583 */
2584 if (!num_inserts && restart) {
2585 search = 0;
2586 restart = 0;
2587 INIT_LIST_HEAD(&update_list);
2588 INIT_LIST_HEAD(&insert_list);
2589 goto again;
2590 } else if (!num_inserts) {
2591 goto out;
2592 }
2593
2594 /*
2595 * process the insert extents list. Again if we are deleting this
2596 * extent, then just unlock it, pin down the bytes if need be, and be
2597 * done with it. Saves us from having to actually insert the extent
2598 * into the tree and then subsequently come along and delete it
2599 */
2600 mutex_lock(&info->extent_ins_mutex);
2601 list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
2602 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2603 extent_op->bytenr + extent_op->num_bytes - 1,
2604 EXTENT_WRITEBACK, GFP_NOFS);
2605 if (extent_op->del) {
2606 u64 used;
2607 list_del_init(&extent_op->list);
2608 unlock_extent(&info->extent_ins, extent_op->bytenr,
2609 extent_op->bytenr + extent_op->num_bytes
2610 - 1, GFP_NOFS);
2611
2612 mutex_lock(&extent_root->fs_info->pinned_mutex);
2613 ret = pin_down_bytes(trans, extent_root,
2614 extent_op->bytenr,
2615 extent_op->num_bytes, 0);
2616 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2617
2618 spin_lock(&info->delalloc_lock);
2619 used = btrfs_super_bytes_used(&info->super_copy);
2620 btrfs_set_super_bytes_used(&info->super_copy,
2621 used - extent_op->num_bytes);
2622 used = btrfs_root_used(&extent_root->root_item);
2623 btrfs_set_root_used(&extent_root->root_item,
2624 used - extent_op->num_bytes);
2625 spin_unlock(&info->delalloc_lock);
2626
2627 ret = update_block_group(trans, extent_root,
2628 extent_op->bytenr,
2629 extent_op->num_bytes,
2630 0, ret > 0);
2631 BUG_ON(ret);
2632 kfree(extent_op);
2633 num_inserts--;
2634 }
2635 }
2636 mutex_unlock(&info->extent_ins_mutex);
2637
2638 ret = insert_extents(trans, extent_root, path, &insert_list,
2639 num_inserts);
2640 BUG_ON(ret);
2641
2642 /*
2643 * if restart is set for whatever reason we need to go back and start
2644 * searching through the pending list again.
2645 *
2646 * We just inserted some extents, which could have resulted in new
2647 * blocks being allocated, which would result in new blocks needing
2648 * updates, so if all is set we _must_ restart to get the updated
2649 * blocks.
2650 */
2651 if (restart || all) {
2652 INIT_LIST_HEAD(&insert_list);
2653 INIT_LIST_HEAD(&update_list);
2654 search = 0;
2655 restart = 0;
2656 num_inserts = 0;
2657 goto again;
2658 }
2659out:
2660 btrfs_free_path(path);
2661 return 0;
2662}
2663
2664static int pin_down_bytes(struct btrfs_trans_handle *trans, 2170static int pin_down_bytes(struct btrfs_trans_handle *trans,
2665 struct btrfs_root *root, 2171 struct btrfs_root *root,
2666 u64 bytenr, u64 num_bytes, int is_data) 2172 struct btrfs_path *path,
2173 u64 bytenr, u64 num_bytes, int is_data,
2174 struct extent_buffer **must_clean)
2667{ 2175{
2668 int err = 0; 2176 int err = 0;
2669 struct extent_buffer *buf; 2177 struct extent_buffer *buf;
@@ -2686,17 +2194,19 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
2686 u64 header_transid = btrfs_header_generation(buf); 2194 u64 header_transid = btrfs_header_generation(buf);
2687 if (header_owner != BTRFS_TREE_LOG_OBJECTID && 2195 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2688 header_owner != BTRFS_TREE_RELOC_OBJECTID && 2196 header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2197 header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
2689 header_transid == trans->transid && 2198 header_transid == trans->transid &&
2690 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 2199 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2691 clean_tree_block(NULL, root, buf); 2200 *must_clean = buf;
2692 btrfs_tree_unlock(buf);
2693 free_extent_buffer(buf);
2694 return 1; 2201 return 1;
2695 } 2202 }
2696 btrfs_tree_unlock(buf); 2203 btrfs_tree_unlock(buf);
2697 } 2204 }
2698 free_extent_buffer(buf); 2205 free_extent_buffer(buf);
2699pinit: 2206pinit:
2207 btrfs_set_path_blocking(path);
2208 mutex_lock(&root->fs_info->pinned_mutex);
2209 /* unlocks the pinned mutex */
2700 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 2210 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2701 2211
2702 BUG_ON(err < 0); 2212 BUG_ON(err < 0);
@@ -2710,7 +2220,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2710 struct btrfs_root *root, 2220 struct btrfs_root *root,
2711 u64 bytenr, u64 num_bytes, u64 parent, 2221 u64 bytenr, u64 num_bytes, u64 parent,
2712 u64 root_objectid, u64 ref_generation, 2222 u64 root_objectid, u64 ref_generation,
2713 u64 owner_objectid, int pin, int mark_free) 2223 u64 owner_objectid, int pin, int mark_free,
2224 int refs_to_drop)
2714{ 2225{
2715 struct btrfs_path *path; 2226 struct btrfs_path *path;
2716 struct btrfs_key key; 2227 struct btrfs_key key;
@@ -2732,6 +2243,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2732 return -ENOMEM; 2243 return -ENOMEM;
2733 2244
2734 path->reada = 1; 2245 path->reada = 1;
2246 path->leave_spinning = 1;
2735 ret = lookup_extent_backref(trans, extent_root, path, 2247 ret = lookup_extent_backref(trans, extent_root, path,
2736 bytenr, parent, root_objectid, 2248 bytenr, parent, root_objectid,
2737 ref_generation, owner_objectid, 1); 2249 ref_generation, owner_objectid, 1);
@@ -2753,9 +2265,11 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2753 break; 2265 break;
2754 } 2266 }
2755 if (!found_extent) { 2267 if (!found_extent) {
2756 ret = remove_extent_backref(trans, extent_root, path); 2268 ret = remove_extent_backref(trans, extent_root, path,
2269 refs_to_drop);
2757 BUG_ON(ret); 2270 BUG_ON(ret);
2758 btrfs_release_path(extent_root, path); 2271 btrfs_release_path(extent_root, path);
2272 path->leave_spinning = 1;
2759 ret = btrfs_search_slot(trans, extent_root, 2273 ret = btrfs_search_slot(trans, extent_root,
2760 &key, path, -1, 1); 2274 &key, path, -1, 1);
2761 if (ret) { 2275 if (ret) {
@@ -2771,8 +2285,9 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2771 btrfs_print_leaf(extent_root, path->nodes[0]); 2285 btrfs_print_leaf(extent_root, path->nodes[0]);
2772 WARN_ON(1); 2286 WARN_ON(1);
2773 printk(KERN_ERR "btrfs unable to find ref byte nr %llu " 2287 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
2774 "root %llu gen %llu owner %llu\n", 2288 "parent %llu root %llu gen %llu owner %llu\n",
2775 (unsigned long long)bytenr, 2289 (unsigned long long)bytenr,
2290 (unsigned long long)parent,
2776 (unsigned long long)root_objectid, 2291 (unsigned long long)root_objectid,
2777 (unsigned long long)ref_generation, 2292 (unsigned long long)ref_generation,
2778 (unsigned long long)owner_objectid); 2293 (unsigned long long)owner_objectid);
@@ -2782,17 +2297,23 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2782 ei = btrfs_item_ptr(leaf, extent_slot, 2297 ei = btrfs_item_ptr(leaf, extent_slot,
2783 struct btrfs_extent_item); 2298 struct btrfs_extent_item);
2784 refs = btrfs_extent_refs(leaf, ei); 2299 refs = btrfs_extent_refs(leaf, ei);
2785 BUG_ON(refs == 0);
2786 refs -= 1;
2787 btrfs_set_extent_refs(leaf, ei, refs);
2788 2300
2301 /*
2302 * we're not allowed to delete the extent item if there
2303 * are other delayed ref updates pending
2304 */
2305
2306 BUG_ON(refs < refs_to_drop);
2307 refs -= refs_to_drop;
2308 btrfs_set_extent_refs(leaf, ei, refs);
2789 btrfs_mark_buffer_dirty(leaf); 2309 btrfs_mark_buffer_dirty(leaf);
2790 2310
2791 if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) { 2311 if (refs == 0 && found_extent &&
2312 path->slots[0] == extent_slot + 1) {
2792 struct btrfs_extent_ref *ref; 2313 struct btrfs_extent_ref *ref;
2793 ref = btrfs_item_ptr(leaf, path->slots[0], 2314 ref = btrfs_item_ptr(leaf, path->slots[0],
2794 struct btrfs_extent_ref); 2315 struct btrfs_extent_ref);
2795 BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1); 2316 BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop);
2796 /* if the back ref and the extent are next to each other 2317 /* if the back ref and the extent are next to each other
2797 * they get deleted below in one shot 2318 * they get deleted below in one shot
2798 */ 2319 */
@@ -2800,11 +2321,13 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2800 num_to_del = 2; 2321 num_to_del = 2;
2801 } else if (found_extent) { 2322 } else if (found_extent) {
2802 /* otherwise delete the extent back ref */ 2323 /* otherwise delete the extent back ref */
2803 ret = remove_extent_backref(trans, extent_root, path); 2324 ret = remove_extent_backref(trans, extent_root, path,
2325 refs_to_drop);
2804 BUG_ON(ret); 2326 BUG_ON(ret);
2805 /* if refs are 0, we need to setup the path for deletion */ 2327 /* if refs are 0, we need to setup the path for deletion */
2806 if (refs == 0) { 2328 if (refs == 0) {
2807 btrfs_release_path(extent_root, path); 2329 btrfs_release_path(extent_root, path);
2330 path->leave_spinning = 1;
2808 ret = btrfs_search_slot(trans, extent_root, &key, path, 2331 ret = btrfs_search_slot(trans, extent_root, &key, path,
2809 -1, 1); 2332 -1, 1);
2810 BUG_ON(ret); 2333 BUG_ON(ret);
@@ -2814,16 +2337,18 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2814 if (refs == 0) { 2337 if (refs == 0) {
2815 u64 super_used; 2338 u64 super_used;
2816 u64 root_used; 2339 u64 root_used;
2340 struct extent_buffer *must_clean = NULL;
2817 2341
2818 if (pin) { 2342 if (pin) {
2819 mutex_lock(&root->fs_info->pinned_mutex); 2343 ret = pin_down_bytes(trans, root, path,
2820 ret = pin_down_bytes(trans, root, bytenr, num_bytes, 2344 bytenr, num_bytes,
2821 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID); 2345 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
2822 mutex_unlock(&root->fs_info->pinned_mutex); 2346 &must_clean);
2823 if (ret > 0) 2347 if (ret > 0)
2824 mark_free = 1; 2348 mark_free = 1;
2825 BUG_ON(ret < 0); 2349 BUG_ON(ret < 0);
2826 } 2350 }
2351
2827 /* block accounting for super block */ 2352 /* block accounting for super block */
2828 spin_lock(&info->delalloc_lock); 2353 spin_lock(&info->delalloc_lock);
2829 super_used = btrfs_super_bytes_used(&info->super_copy); 2354 super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -2835,14 +2360,34 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2835 btrfs_set_root_used(&root->root_item, 2360 btrfs_set_root_used(&root->root_item,
2836 root_used - num_bytes); 2361 root_used - num_bytes);
2837 spin_unlock(&info->delalloc_lock); 2362 spin_unlock(&info->delalloc_lock);
2363
2364 /*
2365 * it is going to be very rare for someone to be waiting
2366 * on the block we're freeing. del_items might need to
2367 * schedule, so rather than get fancy, just force it
2368 * to blocking here
2369 */
2370 if (must_clean)
2371 btrfs_set_lock_blocking(must_clean);
2372
2838 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 2373 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2839 num_to_del); 2374 num_to_del);
2840 BUG_ON(ret); 2375 BUG_ON(ret);
2841 btrfs_release_path(extent_root, path); 2376 btrfs_release_path(extent_root, path);
2842 2377
2378 if (must_clean) {
2379 clean_tree_block(NULL, root, must_clean);
2380 btrfs_tree_unlock(must_clean);
2381 free_extent_buffer(must_clean);
2382 }
2383
2843 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 2384 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2844 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 2385 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2845 BUG_ON(ret); 2386 BUG_ON(ret);
2387 } else {
2388 invalidate_mapping_pages(info->btree_inode->i_mapping,
2389 bytenr >> PAGE_CACHE_SHIFT,
2390 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
2846 } 2391 }
2847 2392
2848 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 2393 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
@@ -2850,218 +2395,103 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2850 BUG_ON(ret); 2395 BUG_ON(ret);
2851 } 2396 }
2852 btrfs_free_path(path); 2397 btrfs_free_path(path);
2853 finish_current_insert(trans, extent_root, 0);
2854 return ret; 2398 return ret;
2855} 2399}
2856 2400
2857/* 2401/*
2858 * find all the blocks marked as pending in the radix tree and remove 2402 * remove an extent from the root, returns 0 on success
2859 * them from the extent map
2860 */ 2403 */
2861static int del_pending_extents(struct btrfs_trans_handle *trans, 2404static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2862 struct btrfs_root *extent_root, int all) 2405 struct btrfs_root *root,
2406 u64 bytenr, u64 num_bytes, u64 parent,
2407 u64 root_objectid, u64 ref_generation,
2408 u64 owner_objectid, int pin,
2409 int refs_to_drop)
2863{ 2410{
2864 int ret; 2411 WARN_ON(num_bytes < root->sectorsize);
2865 int err = 0;
2866 u64 start;
2867 u64 end;
2868 u64 priv;
2869 u64 search = 0;
2870 int nr = 0, skipped = 0;
2871 struct extent_io_tree *pending_del;
2872 struct extent_io_tree *extent_ins;
2873 struct pending_extent_op *extent_op;
2874 struct btrfs_fs_info *info = extent_root->fs_info;
2875 struct list_head delete_list;
2876
2877 INIT_LIST_HEAD(&delete_list);
2878 extent_ins = &extent_root->fs_info->extent_ins;
2879 pending_del = &extent_root->fs_info->pending_del;
2880
2881again:
2882 mutex_lock(&info->extent_ins_mutex);
2883 while (1) {
2884 ret = find_first_extent_bit(pending_del, search, &start, &end,
2885 EXTENT_WRITEBACK);
2886 if (ret) {
2887 if (all && skipped && !nr) {
2888 search = 0;
2889 skipped = 0;
2890 continue;
2891 }
2892 mutex_unlock(&info->extent_ins_mutex);
2893 break;
2894 }
2895
2896 ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
2897 if (!ret) {
2898 search = end+1;
2899 skipped = 1;
2900
2901 if (need_resched()) {
2902 mutex_unlock(&info->extent_ins_mutex);
2903 cond_resched();
2904 mutex_lock(&info->extent_ins_mutex);
2905 }
2906
2907 continue;
2908 }
2909 BUG_ON(ret < 0);
2910
2911 ret = get_state_private(pending_del, start, &priv);
2912 BUG_ON(ret);
2913 extent_op = (struct pending_extent_op *)(unsigned long)priv;
2914
2915 clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
2916 GFP_NOFS);
2917 if (!test_range_bit(extent_ins, start, end,
2918 EXTENT_WRITEBACK, 0)) {
2919 list_add_tail(&extent_op->list, &delete_list);
2920 nr++;
2921 } else {
2922 kfree(extent_op);
2923
2924 ret = get_state_private(&info->extent_ins, start,
2925 &priv);
2926 BUG_ON(ret);
2927 extent_op = (struct pending_extent_op *)
2928 (unsigned long)priv;
2929
2930 clear_extent_bits(&info->extent_ins, start, end,
2931 EXTENT_WRITEBACK, GFP_NOFS);
2932
2933 if (extent_op->type == PENDING_BACKREF_UPDATE) {
2934 list_add_tail(&extent_op->list, &delete_list);
2935 search = end + 1;
2936 nr++;
2937 continue;
2938 }
2939
2940 mutex_lock(&extent_root->fs_info->pinned_mutex);
2941 ret = pin_down_bytes(trans, extent_root, start,
2942 end + 1 - start, 0);
2943 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2944
2945 ret = update_block_group(trans, extent_root, start,
2946 end + 1 - start, 0, ret > 0);
2947
2948 unlock_extent(extent_ins, start, end, GFP_NOFS);
2949 BUG_ON(ret);
2950 kfree(extent_op);
2951 }
2952 if (ret)
2953 err = ret;
2954
2955 search = end + 1;
2956
2957 if (need_resched()) {
2958 mutex_unlock(&info->extent_ins_mutex);
2959 cond_resched();
2960 mutex_lock(&info->extent_ins_mutex);
2961 }
2962 }
2963 2412
2964 if (nr) { 2413 /*
2965 ret = free_extents(trans, extent_root, &delete_list); 2414 * if metadata always pin
2966 BUG_ON(ret); 2415 * if data pin when any transaction has committed this
2967 } 2416 */
2417 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
2418 ref_generation != trans->transid)
2419 pin = 1;
2968 2420
2969 if (all && skipped) { 2421 if (ref_generation != trans->transid)
2970 INIT_LIST_HEAD(&delete_list); 2422 pin = 1;
2971 search = 0;
2972 nr = 0;
2973 goto again;
2974 }
2975 2423
2976 if (!err) 2424 return __free_extent(trans, root, bytenr, num_bytes, parent,
2977 finish_current_insert(trans, extent_root, 0); 2425 root_objectid, ref_generation,
2978 return err; 2426 owner_objectid, pin, pin == 0, refs_to_drop);
2979} 2427}
2980 2428
2981/* 2429/*
2982 * remove an extent from the root, returns 0 on success 2430 * when we free an extent, it is possible (and likely) that we free the last
2431 * delayed ref for that extent as well. This searches the delayed ref tree for
2432 * a given extent, and if there are no other delayed refs to be processed, it
2433 * removes it from the tree.
2983 */ 2434 */
2984static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 2435static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
2985 struct btrfs_root *root, 2436 struct btrfs_root *root, u64 bytenr)
2986 u64 bytenr, u64 num_bytes, u64 parent,
2987 u64 root_objectid, u64 ref_generation,
2988 u64 owner_objectid, int pin)
2989{ 2437{
2990 struct btrfs_root *extent_root = root->fs_info->extent_root; 2438 struct btrfs_delayed_ref_head *head;
2991 int pending_ret; 2439 struct btrfs_delayed_ref_root *delayed_refs;
2440 struct btrfs_delayed_ref_node *ref;
2441 struct rb_node *node;
2992 int ret; 2442 int ret;
2993 2443
2994 WARN_ON(num_bytes < root->sectorsize); 2444 delayed_refs = &trans->transaction->delayed_refs;
2995 if (root == extent_root) { 2445 spin_lock(&delayed_refs->lock);
2996 struct pending_extent_op *extent_op = NULL; 2446 head = btrfs_find_delayed_ref_head(trans, bytenr);
2997 2447 if (!head)
2998 mutex_lock(&root->fs_info->extent_ins_mutex); 2448 goto out;
2999 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
3000 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
3001 u64 priv;
3002 ret = get_state_private(&root->fs_info->extent_ins,
3003 bytenr, &priv);
3004 BUG_ON(ret);
3005 extent_op = (struct pending_extent_op *)
3006 (unsigned long)priv;
3007 2449
3008 extent_op->del = 1; 2450 node = rb_prev(&head->node.rb_node);
3009 if (extent_op->type == PENDING_EXTENT_INSERT) { 2451 if (!node)
3010 mutex_unlock(&root->fs_info->extent_ins_mutex); 2452 goto out;
3011 return 0;
3012 }
3013 }
3014 2453
3015 if (extent_op) { 2454 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
3016 ref_generation = extent_op->orig_generation;
3017 parent = extent_op->orig_parent;
3018 }
3019 2455
3020 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 2456 /* there are still entries for this ref, we can't drop it */
3021 BUG_ON(!extent_op); 2457 if (ref->bytenr == bytenr)
3022 2458 goto out;
3023 extent_op->type = PENDING_EXTENT_DELETE;
3024 extent_op->bytenr = bytenr;
3025 extent_op->num_bytes = num_bytes;
3026 extent_op->parent = parent;
3027 extent_op->orig_parent = parent;
3028 extent_op->generation = ref_generation;
3029 extent_op->orig_generation = ref_generation;
3030 extent_op->level = (int)owner_objectid;
3031 INIT_LIST_HEAD(&extent_op->list);
3032 extent_op->del = 0;
3033
3034 set_extent_bits(&root->fs_info->pending_del,
3035 bytenr, bytenr + num_bytes - 1,
3036 EXTENT_WRITEBACK, GFP_NOFS);
3037 set_state_private(&root->fs_info->pending_del,
3038 bytenr, (unsigned long)extent_op);
3039 mutex_unlock(&root->fs_info->extent_ins_mutex);
3040 return 0;
3041 }
3042 /* if metadata always pin */
3043 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
3044 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3045 mutex_lock(&root->fs_info->pinned_mutex);
3046 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
3047 mutex_unlock(&root->fs_info->pinned_mutex);
3048 update_reserved_extents(root, bytenr, num_bytes, 0);
3049 return 0;
3050 }
3051 pin = 1;
3052 }
3053 2459
3054 /* if data pin when any transaction has committed this */ 2460 /*
3055 if (ref_generation != trans->transid) 2461 * waiting for the lock here would deadlock. If someone else has it
3056 pin = 1; 2462 * locked they are already in the process of dropping it anyway
2463 */
2464 if (!mutex_trylock(&head->mutex))
2465 goto out;
3057 2466
3058 ret = __free_extent(trans, root, bytenr, num_bytes, parent, 2467 /*
3059 root_objectid, ref_generation, 2468 * at this point we have a head with no other entries. Go
3060 owner_objectid, pin, pin == 0); 2469 * ahead and process it.
2470 */
2471 head->node.in_tree = 0;
2472 rb_erase(&head->node.rb_node, &delayed_refs->root);
3061 2473
3062 finish_current_insert(trans, root->fs_info->extent_root, 0); 2474 delayed_refs->num_entries--;
3063 pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0); 2475
3064 return ret ? ret : pending_ret; 2476 /*
2477 * we don't take a ref on the node because we're removing it from the
2478 * tree, so we just steal the ref the tree was holding.
2479 */
2480 delayed_refs->num_heads--;
2481 if (list_empty(&head->cluster))
2482 delayed_refs->num_heads_ready--;
2483
2484 list_del_init(&head->cluster);
2485 spin_unlock(&delayed_refs->lock);
2486
2487 ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
2488 &head->node, head->must_insert_reserved);
2489 BUG_ON(ret);
2490 btrfs_put_delayed_ref(&head->node);
2491 return 0;
2492out:
2493 spin_unlock(&delayed_refs->lock);
2494 return 0;
3065} 2495}
3066 2496
3067int btrfs_free_extent(struct btrfs_trans_handle *trans, 2497int btrfs_free_extent(struct btrfs_trans_handle *trans,
@@ -3072,9 +2502,30 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
3072{ 2502{
3073 int ret; 2503 int ret;
3074 2504
3075 ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent, 2505 /*
3076 root_objectid, ref_generation, 2506 * tree log blocks never actually go into the extent allocation
3077 owner_objectid, pin); 2507 * tree, just update pinning info and exit early.
2508 *
2509 * data extents referenced by the tree log do need to have
2510 * their reference counts bumped.
2511 */
2512 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
2513 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2514 mutex_lock(&root->fs_info->pinned_mutex);
2515
2516 /* unlocks the pinned mutex */
2517 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2518 update_reserved_extents(root, bytenr, num_bytes, 0);
2519 ret = 0;
2520 } else {
2521 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
2522 root_objectid, ref_generation,
2523 owner_objectid,
2524 BTRFS_DROP_DELAYED_REF, 1);
2525 BUG_ON(ret);
2526 ret = check_ref_cleanup(trans, root, bytenr);
2527 BUG_ON(ret);
2528 }
3078 return ret; 2529 return ret;
3079} 2530}
3080 2531
@@ -3475,10 +2926,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3475static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, 2926static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3476 struct btrfs_root *root, u64 parent, 2927 struct btrfs_root *root, u64 parent,
3477 u64 root_objectid, u64 ref_generation, 2928 u64 root_objectid, u64 ref_generation,
3478 u64 owner, struct btrfs_key *ins) 2929 u64 owner, struct btrfs_key *ins,
2930 int ref_mod)
3479{ 2931{
3480 int ret; 2932 int ret;
3481 int pending_ret;
3482 u64 super_used; 2933 u64 super_used;
3483 u64 root_used; 2934 u64 root_used;
3484 u64 num_bytes = ins->offset; 2935 u64 num_bytes = ins->offset;
@@ -3503,33 +2954,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3503 btrfs_set_root_used(&root->root_item, root_used + num_bytes); 2954 btrfs_set_root_used(&root->root_item, root_used + num_bytes);
3504 spin_unlock(&info->delalloc_lock); 2955 spin_unlock(&info->delalloc_lock);
3505 2956
3506 if (root == extent_root) {
3507 struct pending_extent_op *extent_op;
3508
3509 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3510 BUG_ON(!extent_op);
3511
3512 extent_op->type = PENDING_EXTENT_INSERT;
3513 extent_op->bytenr = ins->objectid;
3514 extent_op->num_bytes = ins->offset;
3515 extent_op->parent = parent;
3516 extent_op->orig_parent = 0;
3517 extent_op->generation = ref_generation;
3518 extent_op->orig_generation = 0;
3519 extent_op->level = (int)owner;
3520 INIT_LIST_HEAD(&extent_op->list);
3521 extent_op->del = 0;
3522
3523 mutex_lock(&root->fs_info->extent_ins_mutex);
3524 set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
3525 ins->objectid + ins->offset - 1,
3526 EXTENT_WRITEBACK, GFP_NOFS);
3527 set_state_private(&root->fs_info->extent_ins,
3528 ins->objectid, (unsigned long)extent_op);
3529 mutex_unlock(&root->fs_info->extent_ins_mutex);
3530 goto update_block;
3531 }
3532
3533 memcpy(&keys[0], ins, sizeof(*ins)); 2957 memcpy(&keys[0], ins, sizeof(*ins));
3534 keys[1].objectid = ins->objectid; 2958 keys[1].objectid = ins->objectid;
3535 keys[1].type = BTRFS_EXTENT_REF_KEY; 2959 keys[1].type = BTRFS_EXTENT_REF_KEY;
@@ -3540,37 +2964,31 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3540 path = btrfs_alloc_path(); 2964 path = btrfs_alloc_path();
3541 BUG_ON(!path); 2965 BUG_ON(!path);
3542 2966
2967 path->leave_spinning = 1;
3543 ret = btrfs_insert_empty_items(trans, extent_root, path, keys, 2968 ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
3544 sizes, 2); 2969 sizes, 2);
3545 BUG_ON(ret); 2970 BUG_ON(ret);
3546 2971
3547 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2972 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3548 struct btrfs_extent_item); 2973 struct btrfs_extent_item);
3549 btrfs_set_extent_refs(path->nodes[0], extent_item, 1); 2974 btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod);
3550 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 2975 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3551 struct btrfs_extent_ref); 2976 struct btrfs_extent_ref);
3552 2977
3553 btrfs_set_ref_root(path->nodes[0], ref, root_objectid); 2978 btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
3554 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); 2979 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
3555 btrfs_set_ref_objectid(path->nodes[0], ref, owner); 2980 btrfs_set_ref_objectid(path->nodes[0], ref, owner);
3556 btrfs_set_ref_num_refs(path->nodes[0], ref, 1); 2981 btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod);
3557 2982
3558 btrfs_mark_buffer_dirty(path->nodes[0]); 2983 btrfs_mark_buffer_dirty(path->nodes[0]);
3559 2984
3560 trans->alloc_exclude_start = 0; 2985 trans->alloc_exclude_start = 0;
3561 trans->alloc_exclude_nr = 0; 2986 trans->alloc_exclude_nr = 0;
3562 btrfs_free_path(path); 2987 btrfs_free_path(path);
3563 finish_current_insert(trans, extent_root, 0);
3564 pending_ret = del_pending_extents(trans, extent_root, 0);
3565 2988
3566 if (ret) 2989 if (ret)
3567 goto out; 2990 goto out;
3568 if (pending_ret) {
3569 ret = pending_ret;
3570 goto out;
3571 }
3572 2991
3573update_block:
3574 ret = update_block_group(trans, root, ins->objectid, 2992 ret = update_block_group(trans, root, ins->objectid,
3575 ins->offset, 1, 0); 2993 ins->offset, 1, 0);
3576 if (ret) { 2994 if (ret) {
@@ -3592,9 +3010,12 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3592 3010
3593 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) 3011 if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
3594 return 0; 3012 return 0;
3595 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, 3013
3596 ref_generation, owner, ins); 3014 ret = btrfs_add_delayed_ref(trans, ins->objectid,
3597 update_reserved_extents(root, ins->objectid, ins->offset, 0); 3015 ins->offset, parent, root_objectid,
3016 ref_generation, owner,
3017 BTRFS_ADD_DELAYED_EXTENT, 0);
3018 BUG_ON(ret);
3598 return ret; 3019 return ret;
3599} 3020}
3600 3021
@@ -3621,7 +3042,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3621 BUG_ON(ret); 3042 BUG_ON(ret);
3622 put_block_group(block_group); 3043 put_block_group(block_group);
3623 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, 3044 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3624 ref_generation, owner, ins); 3045 ref_generation, owner, ins, 1);
3625 return ret; 3046 return ret;
3626} 3047}
3627 3048
@@ -3640,20 +3061,18 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3640 u64 search_end, struct btrfs_key *ins, u64 data) 3061 u64 search_end, struct btrfs_key *ins, u64 data)
3641{ 3062{
3642 int ret; 3063 int ret;
3643
3644 ret = __btrfs_reserve_extent(trans, root, num_bytes, 3064 ret = __btrfs_reserve_extent(trans, root, num_bytes,
3645 min_alloc_size, empty_size, hint_byte, 3065 min_alloc_size, empty_size, hint_byte,
3646 search_end, ins, data); 3066 search_end, ins, data);
3647 BUG_ON(ret); 3067 BUG_ON(ret);
3648 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 3068 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3649 ret = __btrfs_alloc_reserved_extent(trans, root, parent, 3069 ret = btrfs_add_delayed_ref(trans, ins->objectid,
3650 root_objectid, ref_generation, 3070 ins->offset, parent, root_objectid,
3651 owner_objectid, ins); 3071 ref_generation, owner_objectid,
3072 BTRFS_ADD_DELAYED_EXTENT, 0);
3652 BUG_ON(ret); 3073 BUG_ON(ret);
3653
3654 } else {
3655 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3656 } 3074 }
3075 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3657 return ret; 3076 return ret;
3658} 3077}
3659 3078
@@ -3789,7 +3208,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3789 3208
3790 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 3209 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3791 3210
3792 ret = __btrfs_free_extent(trans, root, disk_bytenr, 3211 ret = btrfs_free_extent(trans, root, disk_bytenr,
3793 btrfs_file_extent_disk_num_bytes(leaf, fi), 3212 btrfs_file_extent_disk_num_bytes(leaf, fi),
3794 leaf->start, leaf_owner, leaf_generation, 3213 leaf->start, leaf_owner, leaf_generation,
3795 key.objectid, 0); 3214 key.objectid, 0);
@@ -3829,7 +3248,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3829 */ 3248 */
3830 for (i = 0; i < ref->nritems; i++) { 3249 for (i = 0; i < ref->nritems; i++) {
3831 info = ref->extents + sorted[i].slot; 3250 info = ref->extents + sorted[i].slot;
3832 ret = __btrfs_free_extent(trans, root, info->bytenr, 3251 ret = btrfs_free_extent(trans, root, info->bytenr,
3833 info->num_bytes, ref->bytenr, 3252 info->num_bytes, ref->bytenr,
3834 ref->owner, ref->generation, 3253 ref->owner, ref->generation,
3835 info->objectid, 0); 3254 info->objectid, 0);
@@ -3846,12 +3265,13 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3846 return 0; 3265 return 0;
3847} 3266}
3848 3267
3849static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, 3268static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
3269 struct btrfs_root *root, u64 start,
3850 u64 len, u32 *refs) 3270 u64 len, u32 *refs)
3851{ 3271{
3852 int ret; 3272 int ret;
3853 3273
3854 ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs); 3274 ret = btrfs_lookup_extent_ref(trans, root, start, len, refs);
3855 BUG_ON(ret); 3275 BUG_ON(ret);
3856 3276
3857#if 0 /* some debugging code in case we see problems here */ 3277#if 0 /* some debugging code in case we see problems here */
@@ -3959,7 +3379,8 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3959 * we just decrement it below and don't update any 3379 * we just decrement it below and don't update any
3960 * of the refs the leaf points to. 3380 * of the refs the leaf points to.
3961 */ 3381 */
3962 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 3382 ret = drop_snap_lookup_refcount(trans, root, bytenr,
3383 blocksize, &refs);
3963 BUG_ON(ret); 3384 BUG_ON(ret);
3964 if (refs != 1) 3385 if (refs != 1)
3965 continue; 3386 continue;
@@ -4010,7 +3431,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
4010 */ 3431 */
4011 for (i = 0; i < refi; i++) { 3432 for (i = 0; i < refi; i++) {
4012 bytenr = sorted[i].bytenr; 3433 bytenr = sorted[i].bytenr;
4013 ret = __btrfs_free_extent(trans, root, bytenr, 3434 ret = btrfs_free_extent(trans, root, bytenr,
4014 blocksize, eb->start, 3435 blocksize, eb->start,
4015 root_owner, root_gen, 0, 1); 3436 root_owner, root_gen, 0, 1);
4016 BUG_ON(ret); 3437 BUG_ON(ret);
@@ -4053,7 +3474,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4053 3474
4054 WARN_ON(*level < 0); 3475 WARN_ON(*level < 0);
4055 WARN_ON(*level >= BTRFS_MAX_LEVEL); 3476 WARN_ON(*level >= BTRFS_MAX_LEVEL);
4056 ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start, 3477 ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
4057 path->nodes[*level]->len, &refs); 3478 path->nodes[*level]->len, &refs);
4058 BUG_ON(ret); 3479 BUG_ON(ret);
4059 if (refs > 1) 3480 if (refs > 1)
@@ -4104,7 +3525,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4104 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 3525 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
4105 blocksize = btrfs_level_size(root, *level - 1); 3526 blocksize = btrfs_level_size(root, *level - 1);
4106 3527
4107 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 3528 ret = drop_snap_lookup_refcount(trans, root, bytenr,
3529 blocksize, &refs);
4108 BUG_ON(ret); 3530 BUG_ON(ret);
4109 3531
4110 /* 3532 /*
@@ -4119,7 +3541,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4119 root_gen = btrfs_header_generation(parent); 3541 root_gen = btrfs_header_generation(parent);
4120 path->slots[*level]++; 3542 path->slots[*level]++;
4121 3543
4122 ret = __btrfs_free_extent(trans, root, bytenr, 3544 ret = btrfs_free_extent(trans, root, bytenr,
4123 blocksize, parent->start, 3545 blocksize, parent->start,
4124 root_owner, root_gen, 3546 root_owner, root_gen,
4125 *level - 1, 1); 3547 *level - 1, 1);
@@ -4165,7 +3587,7 @@ out:
4165 * cleanup and free the reference on the last node 3587 * cleanup and free the reference on the last node
4166 * we processed 3588 * we processed
4167 */ 3589 */
4168 ret = __btrfs_free_extent(trans, root, bytenr, blocksize, 3590 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
4169 parent->start, root_owner, root_gen, 3591 parent->start, root_owner, root_gen,
4170 *level, 1); 3592 *level, 1);
4171 free_extent_buffer(path->nodes[*level]); 3593 free_extent_buffer(path->nodes[*level]);
@@ -4354,6 +3776,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
4354 struct btrfs_path *path; 3776 struct btrfs_path *path;
4355 int i; 3777 int i;
4356 int orig_level; 3778 int orig_level;
3779 int update_count;
4357 struct btrfs_root_item *root_item = &root->root_item; 3780 struct btrfs_root_item *root_item = &root->root_item;
4358 3781
4359 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); 3782 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
@@ -4395,6 +3818,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
4395 } 3818 }
4396 } 3819 }
4397 while (1) { 3820 while (1) {
3821 unsigned long update;
4398 wret = walk_down_tree(trans, root, path, &level); 3822 wret = walk_down_tree(trans, root, path, &level);
4399 if (wret > 0) 3823 if (wret > 0)
4400 break; 3824 break;
@@ -4407,12 +3831,21 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
4407 break; 3831 break;
4408 if (wret < 0) 3832 if (wret < 0)
4409 ret = wret; 3833 ret = wret;
4410 if (trans->transaction->in_commit) { 3834 if (trans->transaction->in_commit ||
3835 trans->transaction->delayed_refs.flushing) {
4411 ret = -EAGAIN; 3836 ret = -EAGAIN;
4412 break; 3837 break;
4413 } 3838 }
4414 atomic_inc(&root->fs_info->throttle_gen); 3839 atomic_inc(&root->fs_info->throttle_gen);
4415 wake_up(&root->fs_info->transaction_throttle); 3840 wake_up(&root->fs_info->transaction_throttle);
3841 for (update_count = 0; update_count < 16; update_count++) {
3842 update = trans->delayed_ref_updates;
3843 trans->delayed_ref_updates = 0;
3844 if (update)
3845 btrfs_run_delayed_refs(trans, root, update);
3846 else
3847 break;
3848 }
4416 } 3849 }
4417 for (i = 0; i <= orig_level; i++) { 3850 for (i = 0; i <= orig_level; i++) {
4418 if (path->nodes[i]) { 3851 if (path->nodes[i]) {
@@ -5457,6 +4890,7 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
5457 root->root_key.objectid, 4890 root->root_key.objectid,
5458 trans->transid, key.objectid); 4891 trans->transid, key.objectid);
5459 BUG_ON(ret); 4892 BUG_ON(ret);
4893
5460 ret = btrfs_free_extent(trans, root, 4894 ret = btrfs_free_extent(trans, root,
5461 bytenr, num_bytes, leaf->start, 4895 bytenr, num_bytes, leaf->start,
5462 btrfs_header_owner(leaf), 4896 btrfs_header_owner(leaf),
@@ -5768,9 +5202,6 @@ static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
5768 ref_path, NULL, NULL); 5202 ref_path, NULL, NULL);
5769 BUG_ON(ret); 5203 BUG_ON(ret);
5770 5204
5771 if (root == root->fs_info->extent_root)
5772 btrfs_extent_post_op(trans, root);
5773
5774 return 0; 5205 return 0;
5775} 5206}
5776 5207
@@ -6038,6 +5469,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
6038 if (!path) 5469 if (!path)
6039 return -ENOMEM; 5470 return -ENOMEM;
6040 5471
5472 path->leave_spinning = 1;
6041 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 5473 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
6042 if (ret) 5474 if (ret)
6043 goto out; 5475 goto out;
@@ -6208,6 +5640,9 @@ again:
6208 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); 5640 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
6209 mutex_unlock(&root->fs_info->cleaner_mutex); 5641 mutex_unlock(&root->fs_info->cleaner_mutex);
6210 5642
5643 trans = btrfs_start_transaction(info->tree_root, 1);
5644 btrfs_commit_transaction(trans, info->tree_root);
5645
6211 while (1) { 5646 while (1) {
6212 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5647 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6213 if (ret < 0) 5648 if (ret < 0)
@@ -6466,7 +5901,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6466 5901
6467 extent_root = root->fs_info->extent_root; 5902 extent_root = root->fs_info->extent_root;
6468 5903
6469 root->fs_info->last_trans_new_blockgroup = trans->transid; 5904 root->fs_info->last_trans_log_full_commit = trans->transid;
6470 5905
6471 cache = kzalloc(sizeof(*cache), GFP_NOFS); 5906 cache = kzalloc(sizeof(*cache), GFP_NOFS);
6472 if (!cache) 5907 if (!cache)
@@ -6500,9 +5935,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6500 sizeof(cache->item)); 5935 sizeof(cache->item));
6501 BUG_ON(ret); 5936 BUG_ON(ret);
6502 5937
6503 finish_current_insert(trans, extent_root, 0);
6504 ret = del_pending_extents(trans, extent_root, 0);
6505 BUG_ON(ret);
6506 set_avail_alloc_bits(extent_root->fs_info, type); 5938 set_avail_alloc_bits(extent_root->fs_info, type);
6507 5939
6508 return 0; 5940 return 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ebe6b29e606..08085af089e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3124,20 +3124,15 @@ void free_extent_buffer(struct extent_buffer *eb)
3124int clear_extent_buffer_dirty(struct extent_io_tree *tree, 3124int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3125 struct extent_buffer *eb) 3125 struct extent_buffer *eb)
3126{ 3126{
3127 int set;
3128 unsigned long i; 3127 unsigned long i;
3129 unsigned long num_pages; 3128 unsigned long num_pages;
3130 struct page *page; 3129 struct page *page;
3131 3130
3132 u64 start = eb->start;
3133 u64 end = start + eb->len - 1;
3134
3135 set = clear_extent_dirty(tree, start, end, GFP_NOFS);
3136 num_pages = num_extent_pages(eb->start, eb->len); 3131 num_pages = num_extent_pages(eb->start, eb->len);
3137 3132
3138 for (i = 0; i < num_pages; i++) { 3133 for (i = 0; i < num_pages; i++) {
3139 page = extent_buffer_page(eb, i); 3134 page = extent_buffer_page(eb, i);
3140 if (!set && !PageDirty(page)) 3135 if (!PageDirty(page))
3141 continue; 3136 continue;
3142 3137
3143 lock_page(page); 3138 lock_page(page);
@@ -3146,22 +3141,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3146 else 3141 else
3147 set_page_private(page, EXTENT_PAGE_PRIVATE); 3142 set_page_private(page, EXTENT_PAGE_PRIVATE);
3148 3143
3149 /*
3150 * if we're on the last page or the first page and the
3151 * block isn't aligned on a page boundary, do extra checks
3152 * to make sure we don't clean page that is partially dirty
3153 */
3154 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3155 ((i == num_pages - 1) &&
3156 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3157 start = (u64)page->index << PAGE_CACHE_SHIFT;
3158 end = start + PAGE_CACHE_SIZE - 1;
3159 if (test_range_bit(tree, start, end,
3160 EXTENT_DIRTY, 0)) {
3161 unlock_page(page);
3162 continue;
3163 }
3164 }
3165 clear_page_dirty_for_io(page); 3144 clear_page_dirty_for_io(page);
3166 spin_lock_irq(&page->mapping->tree_lock); 3145 spin_lock_irq(&page->mapping->tree_lock);
3167 if (!PageDirty(page)) { 3146 if (!PageDirty(page)) {
@@ -3187,29 +3166,13 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
3187{ 3166{
3188 unsigned long i; 3167 unsigned long i;
3189 unsigned long num_pages; 3168 unsigned long num_pages;
3169 int was_dirty = 0;
3190 3170
3171 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3191 num_pages = num_extent_pages(eb->start, eb->len); 3172 num_pages = num_extent_pages(eb->start, eb->len);
3192 for (i = 0; i < num_pages; i++) { 3173 for (i = 0; i < num_pages; i++)
3193 struct page *page = extent_buffer_page(eb, i);
3194 /* writepage may need to do something special for the
3195 * first page, we have to make sure page->private is
3196 * properly set. releasepage may drop page->private
3197 * on us if the page isn't already dirty.
3198 */
3199 lock_page(page);
3200 if (i == 0) {
3201 set_page_extent_head(page, eb->len);
3202 } else if (PagePrivate(page) &&
3203 page->private != EXTENT_PAGE_PRIVATE) {
3204 set_page_extent_mapped(page);
3205 }
3206 __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); 3174 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3207 set_extent_dirty(tree, page_offset(page), 3175 return was_dirty;
3208 page_offset(page) + PAGE_CACHE_SIZE - 1,
3209 GFP_NOFS);
3210 unlock_page(page);
3211 }
3212 return 0;
3213} 3176}
3214 3177
3215int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3178int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
@@ -3789,6 +3752,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3789 ret = 0; 3752 ret = 0;
3790 goto out; 3753 goto out;
3791 } 3754 }
3755 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3756 ret = 0;
3757 goto out;
3758 }
3792 /* at this point we can safely release the extent buffer */ 3759 /* at this point we can safely release the extent buffer */
3793 num_pages = num_extent_pages(eb->start, eb->len); 3760 num_pages = num_extent_pages(eb->start, eb->len);
3794 for (i = 0; i < num_pages; i++) 3761 for (i = 0; i < num_pages; i++)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1f9df88afbf..5bc20abf3f3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -25,6 +25,7 @@
25/* these are bit numbers for test/set bit */ 25/* these are bit numbers for test/set bit */
26#define EXTENT_BUFFER_UPTODATE 0 26#define EXTENT_BUFFER_UPTODATE 0
27#define EXTENT_BUFFER_BLOCKING 1 27#define EXTENT_BUFFER_BLOCKING 1
28#define EXTENT_BUFFER_DIRTY 2
28 29
29/* 30/*
30 * page->private values. Every page that is controlled by the extent 31 * page->private values. Every page that is controlled by the extent
@@ -254,6 +255,8 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
254 struct extent_buffer *eb); 255 struct extent_buffer *eb);
255int set_extent_buffer_dirty(struct extent_io_tree *tree, 256int set_extent_buffer_dirty(struct extent_io_tree *tree,
256 struct extent_buffer *eb); 257 struct extent_buffer *eb);
258int test_extent_buffer_dirty(struct extent_io_tree *tree,
259 struct extent_buffer *eb);
257int set_extent_buffer_uptodate(struct extent_io_tree *tree, 260int set_extent_buffer_uptodate(struct extent_io_tree *tree,
258 struct extent_buffer *eb); 261 struct extent_buffer *eb);
259int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 262int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 964652435fd..9b99886562d 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -52,6 +52,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
52 file_key.offset = pos; 52 file_key.offset = pos;
53 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); 53 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
54 54
55 path->leave_spinning = 1;
55 ret = btrfs_insert_empty_item(trans, root, path, &file_key, 56 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
56 sizeof(*item)); 57 sizeof(*item));
57 if (ret < 0) 58 if (ret < 0)
@@ -523,6 +524,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
523 key.offset = end_byte - 1; 524 key.offset = end_byte - 1;
524 key.type = BTRFS_EXTENT_CSUM_KEY; 525 key.type = BTRFS_EXTENT_CSUM_KEY;
525 526
527 path->leave_spinning = 1;
526 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 528 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
527 if (ret > 0) { 529 if (ret > 0) {
528 if (path->slots[0] == 0) 530 if (path->slots[0] == 0)
@@ -757,8 +759,10 @@ insert:
757 } else { 759 } else {
758 ins_size = csum_size; 760 ins_size = csum_size;
759 } 761 }
762 path->leave_spinning = 1;
760 ret = btrfs_insert_empty_item(trans, root, path, &file_key, 763 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
761 ins_size); 764 ins_size);
765 path->leave_spinning = 0;
762 if (ret < 0) 766 if (ret < 0)
763 goto fail_unlock; 767 goto fail_unlock;
764 if (ret != 0) { 768 if (ret != 0) {
@@ -776,7 +780,6 @@ found:
776 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + 780 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
777 btrfs_item_size_nr(leaf, path->slots[0])); 781 btrfs_item_size_nr(leaf, path->slots[0]));
778 eb_token = NULL; 782 eb_token = NULL;
779 cond_resched();
780next_sector: 783next_sector:
781 784
782 if (!eb_token || 785 if (!eb_token ||
@@ -817,9 +820,9 @@ next_sector:
817 eb_token = NULL; 820 eb_token = NULL;
818 } 821 }
819 btrfs_mark_buffer_dirty(path->nodes[0]); 822 btrfs_mark_buffer_dirty(path->nodes[0]);
820 cond_resched();
821 if (total_bytes < sums->len) { 823 if (total_bytes < sums->len) {
822 btrfs_release_path(root, path); 824 btrfs_release_path(root, path);
825 cond_resched();
823 goto again; 826 goto again;
824 } 827 }
825out: 828out:
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index dc78954861b..9c9fb46ccd0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -606,6 +606,7 @@ next_slot:
606 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); 606 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
607 607
608 btrfs_release_path(root, path); 608 btrfs_release_path(root, path);
609 path->leave_spinning = 1;
609 ret = btrfs_insert_empty_item(trans, root, path, &ins, 610 ret = btrfs_insert_empty_item(trans, root, path, &ins,
610 sizeof(*extent)); 611 sizeof(*extent));
611 BUG_ON(ret); 612 BUG_ON(ret);
@@ -639,17 +640,22 @@ next_slot:
639 ram_bytes); 640 ram_bytes);
640 btrfs_set_file_extent_type(leaf, extent, found_type); 641 btrfs_set_file_extent_type(leaf, extent, found_type);
641 642
643 btrfs_unlock_up_safe(path, 1);
642 btrfs_mark_buffer_dirty(path->nodes[0]); 644 btrfs_mark_buffer_dirty(path->nodes[0]);
645 btrfs_set_lock_blocking(path->nodes[0]);
643 646
644 if (disk_bytenr != 0) { 647 if (disk_bytenr != 0) {
645 ret = btrfs_update_extent_ref(trans, root, 648 ret = btrfs_update_extent_ref(trans, root,
646 disk_bytenr, orig_parent, 649 disk_bytenr,
650 le64_to_cpu(old.disk_num_bytes),
651 orig_parent,
647 leaf->start, 652 leaf->start,
648 root->root_key.objectid, 653 root->root_key.objectid,
649 trans->transid, ins.objectid); 654 trans->transid, ins.objectid);
650 655
651 BUG_ON(ret); 656 BUG_ON(ret);
652 } 657 }
658 path->leave_spinning = 0;
653 btrfs_release_path(root, path); 659 btrfs_release_path(root, path);
654 if (disk_bytenr != 0) 660 if (disk_bytenr != 0)
655 inode_add_bytes(inode, extent_end - end); 661 inode_add_bytes(inode, extent_end - end);
@@ -912,7 +918,7 @@ again:
912 btrfs_set_file_extent_other_encoding(leaf, fi, 0); 918 btrfs_set_file_extent_other_encoding(leaf, fi, 0);
913 919
914 if (orig_parent != leaf->start) { 920 if (orig_parent != leaf->start) {
915 ret = btrfs_update_extent_ref(trans, root, bytenr, 921 ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
916 orig_parent, leaf->start, 922 orig_parent, leaf->start,
917 root->root_key.objectid, 923 root->root_key.objectid,
918 trans->transid, inode->i_ino); 924 trans->transid, inode->i_ino);
@@ -1155,6 +1161,20 @@ out_nolock:
1155 page_cache_release(pinned[1]); 1161 page_cache_release(pinned[1]);
1156 *ppos = pos; 1162 *ppos = pos;
1157 1163
1164 /*
1165 * we want to make sure fsync finds this change
1166 * but we haven't joined a transaction running right now.
1167 *
1168 * Later on, someone is sure to update the inode and get the
1169 * real transid recorded.
1170 *
1171 * We set last_trans now to the fs_info generation + 1,
1172 * this will either be one more than the running transaction
1173 * or the generation used for the next transaction if there isn't
1174 * one running right now.
1175 */
1176 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1177
1158 if (num_written > 0 && will_write) { 1178 if (num_written > 0 && will_write) {
1159 struct btrfs_trans_handle *trans; 1179 struct btrfs_trans_handle *trans;
1160 1180
@@ -1167,8 +1187,11 @@ out_nolock:
1167 ret = btrfs_log_dentry_safe(trans, root, 1187 ret = btrfs_log_dentry_safe(trans, root,
1168 file->f_dentry); 1188 file->f_dentry);
1169 if (ret == 0) { 1189 if (ret == 0) {
1170 btrfs_sync_log(trans, root); 1190 ret = btrfs_sync_log(trans, root);
1171 btrfs_end_transaction(trans, root); 1191 if (ret == 0)
1192 btrfs_end_transaction(trans, root);
1193 else
1194 btrfs_commit_transaction(trans, root);
1172 } else { 1195 } else {
1173 btrfs_commit_transaction(trans, root); 1196 btrfs_commit_transaction(trans, root);
1174 } 1197 }
@@ -1185,6 +1208,18 @@ out_nolock:
1185 1208
1186int btrfs_release_file(struct inode *inode, struct file *filp) 1209int btrfs_release_file(struct inode *inode, struct file *filp)
1187{ 1210{
1211 /*
1212 * ordered_data_close is set by settattr when we are about to truncate
1213 * a file from a non-zero size to a zero size. This tries to
1214 * flush down new bytes that may have been written if the
1215 * application were using truncate to replace a file in place.
1216 */
1217 if (BTRFS_I(inode)->ordered_data_close) {
1218 BTRFS_I(inode)->ordered_data_close = 0;
1219 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
1220 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1221 filemap_flush(inode->i_mapping);
1222 }
1188 if (filp->private_data) 1223 if (filp->private_data)
1189 btrfs_ioctl_trans_end(filp); 1224 btrfs_ioctl_trans_end(filp);
1190 return 0; 1225 return 0;
@@ -1260,8 +1295,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1260 if (ret > 0) { 1295 if (ret > 0) {
1261 ret = btrfs_commit_transaction(trans, root); 1296 ret = btrfs_commit_transaction(trans, root);
1262 } else { 1297 } else {
1263 btrfs_sync_log(trans, root); 1298 ret = btrfs_sync_log(trans, root);
1264 ret = btrfs_end_transaction(trans, root); 1299 if (ret == 0)
1300 ret = btrfs_end_transaction(trans, root);
1301 else
1302 ret = btrfs_commit_transaction(trans, root);
1265 } 1303 }
1266 mutex_lock(&dentry->d_inode->i_mutex); 1304 mutex_lock(&dentry->d_inode->i_mutex);
1267out: 1305out:
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 3d46fa1f29a..6b627c61180 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -73,6 +73,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
73 if (!path) 73 if (!path)
74 return -ENOMEM; 74 return -ENOMEM;
75 75
76 path->leave_spinning = 1;
77
76 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 78 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
77 if (ret > 0) { 79 if (ret > 0) {
78 ret = -ENOENT; 80 ret = -ENOENT;
@@ -127,6 +129,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
127 if (!path) 129 if (!path)
128 return -ENOMEM; 130 return -ENOMEM;
129 131
132 path->leave_spinning = 1;
130 ret = btrfs_insert_empty_item(trans, root, path, &key, 133 ret = btrfs_insert_empty_item(trans, root, path, &key,
131 ins_len); 134 ins_len);
132 if (ret == -EEXIST) { 135 if (ret == -EEXIST) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7d4f948bc22..06d8db5afb0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -134,6 +134,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
134 if (!path) 134 if (!path)
135 return -ENOMEM; 135 return -ENOMEM;
136 136
137 path->leave_spinning = 1;
137 btrfs_set_trans_block_group(trans, inode); 138 btrfs_set_trans_block_group(trans, inode);
138 139
139 key.objectid = inode->i_ino; 140 key.objectid = inode->i_ino;
@@ -167,9 +168,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
167 cur_size = min_t(unsigned long, compressed_size, 168 cur_size = min_t(unsigned long, compressed_size,
168 PAGE_CACHE_SIZE); 169 PAGE_CACHE_SIZE);
169 170
170 kaddr = kmap(cpage); 171 kaddr = kmap_atomic(cpage, KM_USER0);
171 write_extent_buffer(leaf, kaddr, ptr, cur_size); 172 write_extent_buffer(leaf, kaddr, ptr, cur_size);
172 kunmap(cpage); 173 kunmap_atomic(kaddr, KM_USER0);
173 174
174 i++; 175 i++;
175 ptr += cur_size; 176 ptr += cur_size;
@@ -204,7 +205,7 @@ fail:
204 * does the checks required to make sure the data is small enough 205 * does the checks required to make sure the data is small enough
205 * to fit as an inline extent. 206 * to fit as an inline extent.
206 */ 207 */
207static int cow_file_range_inline(struct btrfs_trans_handle *trans, 208static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
208 struct btrfs_root *root, 209 struct btrfs_root *root,
209 struct inode *inode, u64 start, u64 end, 210 struct inode *inode, u64 start, u64 end,
210 size_t compressed_size, 211 size_t compressed_size,
@@ -854,11 +855,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
854 u64 cur_end; 855 u64 cur_end;
855 int limit = 10 * 1024 * 1042; 856 int limit = 10 * 1024 * 1042;
856 857
857 if (!btrfs_test_opt(root, COMPRESS)) {
858 return cow_file_range(inode, locked_page, start, end,
859 page_started, nr_written, 1);
860 }
861
862 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | 858 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
863 EXTENT_DELALLOC, 1, 0, GFP_NOFS); 859 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
864 while (start < end) { 860 while (start < end) {
@@ -935,7 +931,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
935 * If no cow copies or snapshots exist, we write directly to the existing 931 * If no cow copies or snapshots exist, we write directly to the existing
936 * blocks on disk 932 * blocks on disk
937 */ 933 */
938static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, 934static noinline int run_delalloc_nocow(struct inode *inode,
935 struct page *locked_page,
939 u64 start, u64 end, int *page_started, int force, 936 u64 start, u64 end, int *page_started, int force,
940 unsigned long *nr_written) 937 unsigned long *nr_written)
941{ 938{
@@ -1133,6 +1130,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1133 unsigned long *nr_written) 1130 unsigned long *nr_written)
1134{ 1131{
1135 int ret; 1132 int ret;
1133 struct btrfs_root *root = BTRFS_I(inode)->root;
1136 1134
1137 if (btrfs_test_flag(inode, NODATACOW)) 1135 if (btrfs_test_flag(inode, NODATACOW))
1138 ret = run_delalloc_nocow(inode, locked_page, start, end, 1136 ret = run_delalloc_nocow(inode, locked_page, start, end,
@@ -1140,10 +1138,12 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1140 else if (btrfs_test_flag(inode, PREALLOC)) 1138 else if (btrfs_test_flag(inode, PREALLOC))
1141 ret = run_delalloc_nocow(inode, locked_page, start, end, 1139 ret = run_delalloc_nocow(inode, locked_page, start, end,
1142 page_started, 0, nr_written); 1140 page_started, 0, nr_written);
1141 else if (!btrfs_test_opt(root, COMPRESS))
1142 ret = cow_file_range(inode, locked_page, start, end,
1143 page_started, nr_written, 1);
1143 else 1144 else
1144 ret = cow_file_range_async(inode, locked_page, start, end, 1145 ret = cow_file_range_async(inode, locked_page, start, end,
1145 page_started, nr_written); 1146 page_started, nr_written);
1146
1147 return ret; 1147 return ret;
1148} 1148}
1149 1149
@@ -1453,6 +1453,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1453 path = btrfs_alloc_path(); 1453 path = btrfs_alloc_path();
1454 BUG_ON(!path); 1454 BUG_ON(!path);
1455 1455
1456 path->leave_spinning = 1;
1456 ret = btrfs_drop_extents(trans, root, inode, file_pos, 1457 ret = btrfs_drop_extents(trans, root, inode, file_pos,
1457 file_pos + num_bytes, file_pos, &hint); 1458 file_pos + num_bytes, file_pos, &hint);
1458 BUG_ON(ret); 1459 BUG_ON(ret);
@@ -1475,6 +1476,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1475 btrfs_set_file_extent_compression(leaf, fi, compression); 1476 btrfs_set_file_extent_compression(leaf, fi, compression);
1476 btrfs_set_file_extent_encryption(leaf, fi, encryption); 1477 btrfs_set_file_extent_encryption(leaf, fi, encryption);
1477 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); 1478 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1479
1480 btrfs_unlock_up_safe(path, 1);
1481 btrfs_set_lock_blocking(leaf);
1482
1478 btrfs_mark_buffer_dirty(leaf); 1483 btrfs_mark_buffer_dirty(leaf);
1479 1484
1480 inode_add_bytes(inode, num_bytes); 1485 inode_add_bytes(inode, num_bytes);
@@ -1487,11 +1492,35 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1487 root->root_key.objectid, 1492 root->root_key.objectid,
1488 trans->transid, inode->i_ino, &ins); 1493 trans->transid, inode->i_ino, &ins);
1489 BUG_ON(ret); 1494 BUG_ON(ret);
1490
1491 btrfs_free_path(path); 1495 btrfs_free_path(path);
1496
1492 return 0; 1497 return 0;
1493} 1498}
1494 1499
1500/*
1501 * helper function for btrfs_finish_ordered_io, this
1502 * just reads in some of the csum leaves to prime them into ram
1503 * before we start the transaction. It limits the amount of btree
1504 * reads required while inside the transaction.
1505 */
1506static noinline void reada_csum(struct btrfs_root *root,
1507 struct btrfs_path *path,
1508 struct btrfs_ordered_extent *ordered_extent)
1509{
1510 struct btrfs_ordered_sum *sum;
1511 u64 bytenr;
1512
1513 sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
1514 list);
1515 bytenr = sum->sums[0].bytenr;
1516
1517 /*
1518 * we don't care about the results, the point of this search is
1519 * just to get the btree leaves into ram
1520 */
1521 btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
1522}
1523
1495/* as ordered data IO finishes, this gets called so we can finish 1524/* as ordered data IO finishes, this gets called so we can finish
1496 * an ordered extent if the range of bytes in the file it covers are 1525 * an ordered extent if the range of bytes in the file it covers are
1497 * fully written. 1526 * fully written.
@@ -1500,8 +1529,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1500{ 1529{
1501 struct btrfs_root *root = BTRFS_I(inode)->root; 1530 struct btrfs_root *root = BTRFS_I(inode)->root;
1502 struct btrfs_trans_handle *trans; 1531 struct btrfs_trans_handle *trans;
1503 struct btrfs_ordered_extent *ordered_extent; 1532 struct btrfs_ordered_extent *ordered_extent = NULL;
1504 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1533 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1534 struct btrfs_path *path;
1505 int compressed = 0; 1535 int compressed = 0;
1506 int ret; 1536 int ret;
1507 1537
@@ -1509,9 +1539,33 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1509 if (!ret) 1539 if (!ret)
1510 return 0; 1540 return 0;
1511 1541
1542 /*
1543 * before we join the transaction, try to do some of our IO.
1544 * This will limit the amount of IO that we have to do with
1545 * the transaction running. We're unlikely to need to do any
1546 * IO if the file extents are new, the disk_i_size checks
1547 * covers the most common case.
1548 */
1549 if (start < BTRFS_I(inode)->disk_i_size) {
1550 path = btrfs_alloc_path();
1551 if (path) {
1552 ret = btrfs_lookup_file_extent(NULL, root, path,
1553 inode->i_ino,
1554 start, 0);
1555 ordered_extent = btrfs_lookup_ordered_extent(inode,
1556 start);
1557 if (!list_empty(&ordered_extent->list)) {
1558 btrfs_release_path(root, path);
1559 reada_csum(root, path, ordered_extent);
1560 }
1561 btrfs_free_path(path);
1562 }
1563 }
1564
1512 trans = btrfs_join_transaction(root, 1); 1565 trans = btrfs_join_transaction(root, 1);
1513 1566
1514 ordered_extent = btrfs_lookup_ordered_extent(inode, start); 1567 if (!ordered_extent)
1568 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1515 BUG_ON(!ordered_extent); 1569 BUG_ON(!ordered_extent);
1516 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) 1570 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
1517 goto nocow; 1571 goto nocow;
@@ -2101,6 +2155,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2101 2155
2102 path = btrfs_alloc_path(); 2156 path = btrfs_alloc_path();
2103 BUG_ON(!path); 2157 BUG_ON(!path);
2158 path->leave_spinning = 1;
2104 ret = btrfs_lookup_inode(trans, root, path, 2159 ret = btrfs_lookup_inode(trans, root, path,
2105 &BTRFS_I(inode)->location, 1); 2160 &BTRFS_I(inode)->location, 1);
2106 if (ret) { 2161 if (ret) {
@@ -2147,6 +2202,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2147 goto err; 2202 goto err;
2148 } 2203 }
2149 2204
2205 path->leave_spinning = 1;
2150 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 2206 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2151 name, name_len, -1); 2207 name, name_len, -1);
2152 if (IS_ERR(di)) { 2208 if (IS_ERR(di)) {
@@ -2190,8 +2246,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2190 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 2246 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2191 inode, dir->i_ino); 2247 inode, dir->i_ino);
2192 BUG_ON(ret != 0 && ret != -ENOENT); 2248 BUG_ON(ret != 0 && ret != -ENOENT);
2193 if (ret != -ENOENT)
2194 BTRFS_I(dir)->log_dirty_trans = trans->transid;
2195 2249
2196 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2250 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2197 dir, index); 2251 dir, index);
@@ -2224,6 +2278,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2224 trans = btrfs_start_transaction(root, 1); 2278 trans = btrfs_start_transaction(root, 1);
2225 2279
2226 btrfs_set_trans_block_group(trans, dir); 2280 btrfs_set_trans_block_group(trans, dir);
2281
2282 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
2283
2227 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2284 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2228 dentry->d_name.name, dentry->d_name.len); 2285 dentry->d_name.name, dentry->d_name.len);
2229 2286
@@ -2498,6 +2555,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2498 key.type = (u8)-1; 2555 key.type = (u8)-1;
2499 2556
2500search_again: 2557search_again:
2558 path->leave_spinning = 1;
2501 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2559 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2502 if (ret < 0) 2560 if (ret < 0)
2503 goto error; 2561 goto error;
@@ -2644,6 +2702,7 @@ delete:
2644 break; 2702 break;
2645 } 2703 }
2646 if (found_extent) { 2704 if (found_extent) {
2705 btrfs_set_path_blocking(path);
2647 ret = btrfs_free_extent(trans, root, extent_start, 2706 ret = btrfs_free_extent(trans, root, extent_start,
2648 extent_num_bytes, 2707 extent_num_bytes,
2649 leaf->start, root_owner, 2708 leaf->start, root_owner,
@@ -2848,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
2848 if (err) 2907 if (err)
2849 return err; 2908 return err;
2850 2909
2851 if (S_ISREG(inode->i_mode) && 2910 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
2852 attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { 2911 if (attr->ia_size > inode->i_size) {
2853 err = btrfs_cont_expand(inode, attr->ia_size); 2912 err = btrfs_cont_expand(inode, attr->ia_size);
2854 if (err) 2913 if (err)
2855 return err; 2914 return err;
2915 } else if (inode->i_size > 0 &&
2916 attr->ia_size == 0) {
2917
2918 /* we're truncating a file that used to have good
2919 * data down to zero. Make sure it gets into
2920 * the ordered flush list so that any new writes
2921 * get down to disk quickly.
2922 */
2923 BTRFS_I(inode)->ordered_data_close = 1;
2924 }
2856 } 2925 }
2857 2926
2858 err = inode_setattr(inode, attr); 2927 err = inode_setattr(inode, attr);
@@ -2984,13 +3053,14 @@ static noinline void init_btrfs_i(struct inode *inode)
2984 bi->disk_i_size = 0; 3053 bi->disk_i_size = 0;
2985 bi->flags = 0; 3054 bi->flags = 0;
2986 bi->index_cnt = (u64)-1; 3055 bi->index_cnt = (u64)-1;
2987 bi->log_dirty_trans = 0; 3056 bi->last_unlink_trans = 0;
2988 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); 3057 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
2989 extent_io_tree_init(&BTRFS_I(inode)->io_tree, 3058 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
2990 inode->i_mapping, GFP_NOFS); 3059 inode->i_mapping, GFP_NOFS);
2991 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, 3060 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
2992 inode->i_mapping, GFP_NOFS); 3061 inode->i_mapping, GFP_NOFS);
2993 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); 3062 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3063 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
2994 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); 3064 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
2995 mutex_init(&BTRFS_I(inode)->extent_mutex); 3065 mutex_init(&BTRFS_I(inode)->extent_mutex);
2996 mutex_init(&BTRFS_I(inode)->log_mutex); 3066 mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -3449,6 +3519,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3449 sizes[0] = sizeof(struct btrfs_inode_item); 3519 sizes[0] = sizeof(struct btrfs_inode_item);
3450 sizes[1] = name_len + sizeof(*ref); 3520 sizes[1] = name_len + sizeof(*ref);
3451 3521
3522 path->leave_spinning = 1;
3452 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); 3523 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
3453 if (ret != 0) 3524 if (ret != 0)
3454 goto fail; 3525 goto fail;
@@ -3727,6 +3798,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3727 drop_inode = 1; 3798 drop_inode = 1;
3728 3799
3729 nr = trans->blocks_used; 3800 nr = trans->blocks_used;
3801
3802 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
3730 btrfs_end_transaction_throttle(trans, root); 3803 btrfs_end_transaction_throttle(trans, root);
3731fail: 3804fail:
3732 if (drop_inode) { 3805 if (drop_inode) {
@@ -4292,8 +4365,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4292 * beyond EOF, then the page is guaranteed safe against truncation until we 4365 * beyond EOF, then the page is guaranteed safe against truncation until we
4293 * unlock the page. 4366 * unlock the page.
4294 */ 4367 */
4295int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) 4368int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4296{ 4369{
4370 struct page *page = vmf->page;
4297 struct inode *inode = fdentry(vma->vm_file)->d_inode; 4371 struct inode *inode = fdentry(vma->vm_file)->d_inode;
4298 struct btrfs_root *root = BTRFS_I(inode)->root; 4372 struct btrfs_root *root = BTRFS_I(inode)->root;
4299 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4373 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -4306,10 +4380,15 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4306 u64 page_end; 4380 u64 page_end;
4307 4381
4308 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 4382 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
4309 if (ret) 4383 if (ret) {
4384 if (ret == -ENOMEM)
4385 ret = VM_FAULT_OOM;
4386 else /* -ENOSPC, -EIO, etc */
4387 ret = VM_FAULT_SIGBUS;
4310 goto out; 4388 goto out;
4389 }
4311 4390
4312 ret = -EINVAL; 4391 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
4313again: 4392again:
4314 lock_page(page); 4393 lock_page(page);
4315 size = i_size_read(inode); 4394 size = i_size_read(inode);
@@ -4357,6 +4436,8 @@ again:
4357 } 4436 }
4358 ClearPageChecked(page); 4437 ClearPageChecked(page);
4359 set_page_dirty(page); 4438 set_page_dirty(page);
4439
4440 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
4360 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 4441 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4361 4442
4362out_unlock: 4443out_unlock:
@@ -4382,6 +4463,27 @@ static void btrfs_truncate(struct inode *inode)
4382 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 4463 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4383 4464
4384 trans = btrfs_start_transaction(root, 1); 4465 trans = btrfs_start_transaction(root, 1);
4466
4467 /*
4468 * setattr is responsible for setting the ordered_data_close flag,
4469 * but that is only tested during the last file release. That
4470 * could happen well after the next commit, leaving a great big
4471 * window where new writes may get lost if someone chooses to write
4472 * to this file after truncating to zero
4473 *
4474 * The inode doesn't have any dirty data here, and so if we commit
4475 * this is a noop. If someone immediately starts writing to the inode
4476 * it is very likely we'll catch some of their writes in this
4477 * transaction, and the commit will find this file on the ordered
4478 * data list with good things to send down.
4479 *
4480 * This is a best effort solution, there is still a window where
4481 * using truncate to replace the contents of the file will
4482 * end up with a zero length file after a crash.
4483 */
4484 if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
4485 btrfs_add_ordered_operation(trans, root, inode);
4486
4385 btrfs_set_trans_block_group(trans, inode); 4487 btrfs_set_trans_block_group(trans, inode);
4386 btrfs_i_size_write(inode, inode->i_size); 4488 btrfs_i_size_write(inode, inode->i_size);
4387 4489
@@ -4458,12 +4560,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
4458 ei->i_acl = BTRFS_ACL_NOT_CACHED; 4560 ei->i_acl = BTRFS_ACL_NOT_CACHED;
4459 ei->i_default_acl = BTRFS_ACL_NOT_CACHED; 4561 ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4460 INIT_LIST_HEAD(&ei->i_orphan); 4562 INIT_LIST_HEAD(&ei->i_orphan);
4563 INIT_LIST_HEAD(&ei->ordered_operations);
4461 return &ei->vfs_inode; 4564 return &ei->vfs_inode;
4462} 4565}
4463 4566
4464void btrfs_destroy_inode(struct inode *inode) 4567void btrfs_destroy_inode(struct inode *inode)
4465{ 4568{
4466 struct btrfs_ordered_extent *ordered; 4569 struct btrfs_ordered_extent *ordered;
4570 struct btrfs_root *root = BTRFS_I(inode)->root;
4571
4467 WARN_ON(!list_empty(&inode->i_dentry)); 4572 WARN_ON(!list_empty(&inode->i_dentry));
4468 WARN_ON(inode->i_data.nrpages); 4573 WARN_ON(inode->i_data.nrpages);
4469 4574
@@ -4474,13 +4579,24 @@ void btrfs_destroy_inode(struct inode *inode)
4474 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) 4579 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4475 posix_acl_release(BTRFS_I(inode)->i_default_acl); 4580 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4476 4581
4477 spin_lock(&BTRFS_I(inode)->root->list_lock); 4582 /*
4583 * Make sure we're properly removed from the ordered operation
4584 * lists.
4585 */
4586 smp_mb();
4587 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
4588 spin_lock(&root->fs_info->ordered_extent_lock);
4589 list_del_init(&BTRFS_I(inode)->ordered_operations);
4590 spin_unlock(&root->fs_info->ordered_extent_lock);
4591 }
4592
4593 spin_lock(&root->list_lock);
4478 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 4594 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4479 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" 4595 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
4480 " list\n", inode->i_ino); 4596 " list\n", inode->i_ino);
4481 dump_stack(); 4597 dump_stack();
4482 } 4598 }
4483 spin_unlock(&BTRFS_I(inode)->root->list_lock); 4599 spin_unlock(&root->list_lock);
4484 4600
4485 while (1) { 4601 while (1) {
4486 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 4602 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4605,8 +4721,36 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4605 if (ret) 4721 if (ret)
4606 goto out_unlock; 4722 goto out_unlock;
4607 4723
4724 /*
4725 * we're using rename to replace one file with another.
4726 * and the replacement file is large. Start IO on it now so
4727 * we don't add too much work to the end of the transaction
4728 */
4729 if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
4730 new_inode->i_size &&
4731 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
4732 filemap_flush(old_inode->i_mapping);
4733
4608 trans = btrfs_start_transaction(root, 1); 4734 trans = btrfs_start_transaction(root, 1);
4609 4735
4736 /*
4737 * make sure the inode gets flushed if it is replacing
4738 * something.
4739 */
4740 if (new_inode && new_inode->i_size &&
4741 old_inode && S_ISREG(old_inode->i_mode)) {
4742 btrfs_add_ordered_operation(trans, root, old_inode);
4743 }
4744
4745 /*
4746 * this is an ugly little race, but the rename is required to make
4747 * sure that if we crash, the inode is either at the old name
4748 * or the new one. pinning the log transaction lets us make sure
4749 * we don't allow a log commit to come in after we unlink the
4750 * name but before we add the new name back in.
4751 */
4752 btrfs_pin_log_trans(root);
4753
4610 btrfs_set_trans_block_group(trans, new_dir); 4754 btrfs_set_trans_block_group(trans, new_dir);
4611 4755
4612 btrfs_inc_nlink(old_dentry->d_inode); 4756 btrfs_inc_nlink(old_dentry->d_inode);
@@ -4614,6 +4758,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4614 new_dir->i_ctime = new_dir->i_mtime = ctime; 4758 new_dir->i_ctime = new_dir->i_mtime = ctime;
4615 old_inode->i_ctime = ctime; 4759 old_inode->i_ctime = ctime;
4616 4760
4761 if (old_dentry->d_parent != new_dentry->d_parent)
4762 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
4763
4617 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, 4764 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
4618 old_dentry->d_name.name, 4765 old_dentry->d_name.name,
4619 old_dentry->d_name.len); 4766 old_dentry->d_name.len);
@@ -4645,7 +4792,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4645 if (ret) 4792 if (ret)
4646 goto out_fail; 4793 goto out_fail;
4647 4794
4795 btrfs_log_new_name(trans, old_inode, old_dir,
4796 new_dentry->d_parent);
4648out_fail: 4797out_fail:
4798
4799 /* this btrfs_end_log_trans just allows the current
4800 * log-sub transaction to complete
4801 */
4802 btrfs_end_log_trans(root);
4649 btrfs_end_transaction_throttle(trans, root); 4803 btrfs_end_transaction_throttle(trans, root);
4650out_unlock: 4804out_unlock:
4651 return ret; 4805 return ret;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 47b0a88c12a..a5310c0f41e 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -71,12 +71,13 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
71static int btrfs_spin_on_block(struct extent_buffer *eb) 71static int btrfs_spin_on_block(struct extent_buffer *eb)
72{ 72{
73 int i; 73 int i;
74
74 for (i = 0; i < 512; i++) { 75 for (i = 0; i < 512; i++) {
75 cpu_relax();
76 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 76 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
77 return 1; 77 return 1;
78 if (need_resched()) 78 if (need_resched())
79 break; 79 break;
80 cpu_relax();
80 } 81 }
81 return 0; 82 return 0;
82} 83}
@@ -95,13 +96,15 @@ int btrfs_try_spin_lock(struct extent_buffer *eb)
95{ 96{
96 int i; 97 int i;
97 98
98 spin_nested(eb); 99 if (btrfs_spin_on_block(eb)) {
99 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 100 spin_nested(eb);
100 return 1; 101 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
101 spin_unlock(&eb->lock); 102 return 1;
102 103 spin_unlock(&eb->lock);
104 }
103 /* spin for a bit on the BLOCKING flag */ 105 /* spin for a bit on the BLOCKING flag */
104 for (i = 0; i < 2; i++) { 106 for (i = 0; i < 2; i++) {
107 cpu_relax();
105 if (!btrfs_spin_on_block(eb)) 108 if (!btrfs_spin_on_block(eb))
106 break; 109 break;
107 110
@@ -148,6 +151,9 @@ int btrfs_tree_lock(struct extent_buffer *eb)
148 DEFINE_WAIT(wait); 151 DEFINE_WAIT(wait);
149 wait.func = btrfs_wake_function; 152 wait.func = btrfs_wake_function;
150 153
154 if (!btrfs_spin_on_block(eb))
155 goto sleep;
156
151 while(1) { 157 while(1) {
152 spin_nested(eb); 158 spin_nested(eb);
153 159
@@ -165,9 +171,10 @@ int btrfs_tree_lock(struct extent_buffer *eb)
165 * spin for a bit, and if the blocking flag goes away, 171 * spin for a bit, and if the blocking flag goes away,
166 * loop around 172 * loop around
167 */ 173 */
174 cpu_relax();
168 if (btrfs_spin_on_block(eb)) 175 if (btrfs_spin_on_block(eb))
169 continue; 176 continue;
170 177sleep:
171 prepare_to_wait_exclusive(&eb->lock_wq, &wait, 178 prepare_to_wait_exclusive(&eb->lock_wq, &wait,
172 TASK_UNINTERRUPTIBLE); 179 TASK_UNINTERRUPTIBLE);
173 180
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 77c2411a5f0..53c87b197d7 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
310 310
311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
312 list_del_init(&entry->root_extent_list); 312 list_del_init(&entry->root_extent_list);
313
314 /*
315 * we have no more ordered extents for this inode and
316 * no dirty pages. We can safely remove it from the
317 * list of ordered extents
318 */
319 if (RB_EMPTY_ROOT(&tree->tree) &&
320 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
321 list_del_init(&BTRFS_I(inode)->ordered_operations);
322 }
313 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 323 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
314 324
315 mutex_unlock(&tree->mutex); 325 mutex_unlock(&tree->mutex);
@@ -370,6 +380,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
370} 380}
371 381
372/* 382/*
383 * this is used during transaction commit to write all the inodes
384 * added to the ordered operation list. These files must be fully on
385 * disk before the transaction commits.
386 *
387 * we have two modes here, one is to just start the IO via filemap_flush
388 * and the other is to wait for all the io. When we wait, we have an
389 * extra check to make sure the ordered operation list really is empty
390 * before we return
391 */
392int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
393{
394 struct btrfs_inode *btrfs_inode;
395 struct inode *inode;
396 struct list_head splice;
397
398 INIT_LIST_HEAD(&splice);
399
400 mutex_lock(&root->fs_info->ordered_operations_mutex);
401 spin_lock(&root->fs_info->ordered_extent_lock);
402again:
403 list_splice_init(&root->fs_info->ordered_operations, &splice);
404
405 while (!list_empty(&splice)) {
406 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
407 ordered_operations);
408
409 inode = &btrfs_inode->vfs_inode;
410
411 list_del_init(&btrfs_inode->ordered_operations);
412
413 /*
414 * the inode may be getting freed (in sys_unlink path).
415 */
416 inode = igrab(inode);
417
418 if (!wait && inode) {
419 list_add_tail(&BTRFS_I(inode)->ordered_operations,
420 &root->fs_info->ordered_operations);
421 }
422 spin_unlock(&root->fs_info->ordered_extent_lock);
423
424 if (inode) {
425 if (wait)
426 btrfs_wait_ordered_range(inode, 0, (u64)-1);
427 else
428 filemap_flush(inode->i_mapping);
429 iput(inode);
430 }
431
432 cond_resched();
433 spin_lock(&root->fs_info->ordered_extent_lock);
434 }
435 if (wait && !list_empty(&root->fs_info->ordered_operations))
436 goto again;
437
438 spin_unlock(&root->fs_info->ordered_extent_lock);
439 mutex_unlock(&root->fs_info->ordered_operations_mutex);
440
441 return 0;
442}
443
444/*
373 * Used to start IO or wait for a given ordered extent to finish. 445 * Used to start IO or wait for a given ordered extent to finish.
374 * 446 *
375 * If wait is one, this effectively waits on page writeback for all the pages 447 * If wait is one, this effectively waits on page writeback for all the pages
@@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
726 798
727 return ret; 799 return ret;
728} 800}
801
802/*
803 * add a given inode to the list of inodes that must be fully on
804 * disk before a transaction commit finishes.
805 *
806 * This basically gives us the ext3 style data=ordered mode, and it is mostly
807 * used to make sure renamed files are fully on disk.
808 *
809 * It is a noop if the inode is already fully on disk.
810 *
811 * If trans is not null, we'll do a friendly check for a transaction that
812 * is already flushing things and force the IO down ourselves.
813 */
814int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
815 struct btrfs_root *root,
816 struct inode *inode)
817{
818 u64 last_mod;
819
820 last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
821
822 /*
823 * if this file hasn't been changed since the last transaction
824 * commit, we can safely return without doing anything
825 */
826 if (last_mod < root->fs_info->last_trans_committed)
827 return 0;
828
829 /*
830 * the transaction is already committing. Just start the IO and
831 * don't bother with all of this list nonsense
832 */
833 if (trans && root->fs_info->running_transaction->blocked) {
834 btrfs_wait_ordered_range(inode, 0, (u64)-1);
835 return 0;
836 }
837
838 spin_lock(&root->fs_info->ordered_extent_lock);
839 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
840 list_add_tail(&BTRFS_I(inode)->ordered_operations,
841 &root->fs_info->ordered_operations);
842 }
843 spin_unlock(&root->fs_info->ordered_extent_lock);
844
845 return 0;
846}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ab66d5e8d6d..3d31c8827b0 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, 155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
156 loff_t end, int sync_mode); 156 loff_t end, int sync_mode);
157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); 157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
158int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
159int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
160 struct btrfs_root *root,
161 struct inode *inode);
158#endif 162#endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4112d53d4f4..664782c6a2d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -65,6 +65,15 @@ static noinline int join_transaction(struct btrfs_root *root)
65 cur_trans->use_count = 1; 65 cur_trans->use_count = 1;
66 cur_trans->commit_done = 0; 66 cur_trans->commit_done = 0;
67 cur_trans->start_time = get_seconds(); 67 cur_trans->start_time = get_seconds();
68
69 cur_trans->delayed_refs.root.rb_node = NULL;
70 cur_trans->delayed_refs.num_entries = 0;
71 cur_trans->delayed_refs.num_heads_ready = 0;
72 cur_trans->delayed_refs.num_heads = 0;
73 cur_trans->delayed_refs.flushing = 0;
74 cur_trans->delayed_refs.run_delayed_start = 0;
75 spin_lock_init(&cur_trans->delayed_refs.lock);
76
68 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 77 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
69 list_add_tail(&cur_trans->list, &root->fs_info->trans_list); 78 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
70 extent_io_tree_init(&cur_trans->dirty_pages, 79 extent_io_tree_init(&cur_trans->dirty_pages,
@@ -182,6 +191,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
182 h->block_group = 0; 191 h->block_group = 0;
183 h->alloc_exclude_nr = 0; 192 h->alloc_exclude_nr = 0;
184 h->alloc_exclude_start = 0; 193 h->alloc_exclude_start = 0;
194 h->delayed_ref_updates = 0;
195
185 root->fs_info->running_transaction->use_count++; 196 root->fs_info->running_transaction->use_count++;
186 mutex_unlock(&root->fs_info->trans_mutex); 197 mutex_unlock(&root->fs_info->trans_mutex);
187 return h; 198 return h;
@@ -271,7 +282,6 @@ void btrfs_throttle(struct btrfs_root *root)
271 if (!root->fs_info->open_ioctl_trans) 282 if (!root->fs_info->open_ioctl_trans)
272 wait_current_trans(root); 283 wait_current_trans(root);
273 mutex_unlock(&root->fs_info->trans_mutex); 284 mutex_unlock(&root->fs_info->trans_mutex);
274
275 throttle_on_drops(root); 285 throttle_on_drops(root);
276} 286}
277 287
@@ -280,6 +290,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
280{ 290{
281 struct btrfs_transaction *cur_trans; 291 struct btrfs_transaction *cur_trans;
282 struct btrfs_fs_info *info = root->fs_info; 292 struct btrfs_fs_info *info = root->fs_info;
293 int count = 0;
294
295 while (count < 4) {
296 unsigned long cur = trans->delayed_ref_updates;
297 trans->delayed_ref_updates = 0;
298 if (cur &&
299 trans->transaction->delayed_refs.num_heads_ready > 64) {
300 trans->delayed_ref_updates = 0;
301
302 /*
303 * do a full flush if the transaction is trying
304 * to close
305 */
306 if (trans->transaction->delayed_refs.flushing)
307 cur = 0;
308 btrfs_run_delayed_refs(trans, root, cur);
309 } else {
310 break;
311 }
312 count++;
313 }
283 314
284 mutex_lock(&info->trans_mutex); 315 mutex_lock(&info->trans_mutex);
285 cur_trans = info->running_transaction; 316 cur_trans = info->running_transaction;
@@ -424,9 +455,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
424 u64 old_root_bytenr; 455 u64 old_root_bytenr;
425 struct btrfs_root *tree_root = root->fs_info->tree_root; 456 struct btrfs_root *tree_root = root->fs_info->tree_root;
426 457
427 btrfs_extent_post_op(trans, root);
428 btrfs_write_dirty_block_groups(trans, root); 458 btrfs_write_dirty_block_groups(trans, root);
429 btrfs_extent_post_op(trans, root); 459
460 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
461 BUG_ON(ret);
430 462
431 while (1) { 463 while (1) {
432 old_root_bytenr = btrfs_root_bytenr(&root->root_item); 464 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
@@ -438,14 +470,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
438 btrfs_header_level(root->node)); 470 btrfs_header_level(root->node));
439 btrfs_set_root_generation(&root->root_item, trans->transid); 471 btrfs_set_root_generation(&root->root_item, trans->transid);
440 472
441 btrfs_extent_post_op(trans, root);
442
443 ret = btrfs_update_root(trans, tree_root, 473 ret = btrfs_update_root(trans, tree_root,
444 &root->root_key, 474 &root->root_key,
445 &root->root_item); 475 &root->root_item);
446 BUG_ON(ret); 476 BUG_ON(ret);
447 btrfs_write_dirty_block_groups(trans, root); 477 btrfs_write_dirty_block_groups(trans, root);
448 btrfs_extent_post_op(trans, root); 478
479 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
480 BUG_ON(ret);
449 } 481 }
450 return 0; 482 return 0;
451} 483}
@@ -459,15 +491,18 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
459 struct btrfs_fs_info *fs_info = root->fs_info; 491 struct btrfs_fs_info *fs_info = root->fs_info;
460 struct list_head *next; 492 struct list_head *next;
461 struct extent_buffer *eb; 493 struct extent_buffer *eb;
494 int ret;
462 495
463 btrfs_extent_post_op(trans, fs_info->tree_root); 496 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
497 BUG_ON(ret);
464 498
465 eb = btrfs_lock_root_node(fs_info->tree_root); 499 eb = btrfs_lock_root_node(fs_info->tree_root);
466 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0); 500 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
467 btrfs_tree_unlock(eb); 501 btrfs_tree_unlock(eb);
468 free_extent_buffer(eb); 502 free_extent_buffer(eb);
469 503
470 btrfs_extent_post_op(trans, fs_info->tree_root); 504 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
505 BUG_ON(ret);
471 506
472 while (!list_empty(&fs_info->dirty_cowonly_roots)) { 507 while (!list_empty(&fs_info->dirty_cowonly_roots)) {
473 next = fs_info->dirty_cowonly_roots.next; 508 next = fs_info->dirty_cowonly_roots.next;
@@ -475,6 +510,9 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
475 root = list_entry(next, struct btrfs_root, dirty_list); 510 root = list_entry(next, struct btrfs_root, dirty_list);
476 511
477 update_cowonly_root(trans, root); 512 update_cowonly_root(trans, root);
513
514 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
515 BUG_ON(ret);
478 } 516 }
479 return 0; 517 return 0;
480} 518}
@@ -635,6 +673,31 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
635} 673}
636 674
637/* 675/*
676 * when dropping snapshots, we generate a ton of delayed refs, and it makes
677 * sense not to join the transaction while it is trying to flush the current
678 * queue of delayed refs out.
679 *
680 * This is used by the drop snapshot code only
681 */
682static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
683{
684 DEFINE_WAIT(wait);
685
686 mutex_lock(&info->trans_mutex);
687 while (info->running_transaction &&
688 info->running_transaction->delayed_refs.flushing) {
689 prepare_to_wait(&info->transaction_wait, &wait,
690 TASK_UNINTERRUPTIBLE);
691 mutex_unlock(&info->trans_mutex);
692 schedule();
693 mutex_lock(&info->trans_mutex);
694 finish_wait(&info->transaction_wait, &wait);
695 }
696 mutex_unlock(&info->trans_mutex);
697 return 0;
698}
699
700/*
638 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on 701 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
639 * all of them 702 * all of them
640 */ 703 */
@@ -661,7 +724,22 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
661 atomic_inc(&root->fs_info->throttles); 724 atomic_inc(&root->fs_info->throttles);
662 725
663 while (1) { 726 while (1) {
727 /*
728 * we don't want to jump in and create a bunch of
729 * delayed refs if the transaction is starting to close
730 */
731 wait_transaction_pre_flush(tree_root->fs_info);
664 trans = btrfs_start_transaction(tree_root, 1); 732 trans = btrfs_start_transaction(tree_root, 1);
733
734 /*
735 * we've joined a transaction, make sure it isn't
736 * closing right now
737 */
738 if (trans->transaction->delayed_refs.flushing) {
739 btrfs_end_transaction(trans, tree_root);
740 continue;
741 }
742
665 mutex_lock(&root->fs_info->drop_mutex); 743 mutex_lock(&root->fs_info->drop_mutex);
666 ret = btrfs_drop_snapshot(trans, dirty->root); 744 ret = btrfs_drop_snapshot(trans, dirty->root);
667 if (ret != -EAGAIN) 745 if (ret != -EAGAIN)
@@ -766,7 +844,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
766 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 844 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
767 845
768 old = btrfs_lock_root_node(root); 846 old = btrfs_lock_root_node(root);
769 btrfs_cow_block(trans, root, old, NULL, 0, &old, 0); 847 btrfs_cow_block(trans, root, old, NULL, 0, &old);
770 848
771 btrfs_copy_root(trans, root, old, &tmp, objectid); 849 btrfs_copy_root(trans, root, old, &tmp, objectid);
772 btrfs_tree_unlock(old); 850 btrfs_tree_unlock(old);
@@ -894,12 +972,31 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
894 struct extent_io_tree *pinned_copy; 972 struct extent_io_tree *pinned_copy;
895 DEFINE_WAIT(wait); 973 DEFINE_WAIT(wait);
896 int ret; 974 int ret;
975 int should_grow = 0;
976 unsigned long now = get_seconds();
977
978 btrfs_run_ordered_operations(root, 0);
979
980 /* make a pass through all the delayed refs we have so far
981 * any runnings procs may add more while we are here
982 */
983 ret = btrfs_run_delayed_refs(trans, root, 0);
984 BUG_ON(ret);
985
986 cur_trans = trans->transaction;
987 /*
988 * set the flushing flag so procs in this transaction have to
989 * start sending their work down.
990 */
991 cur_trans->delayed_refs.flushing = 1;
992
993 ret = btrfs_run_delayed_refs(trans, root, 0);
994 BUG_ON(ret);
897 995
898 INIT_LIST_HEAD(&dirty_fs_roots);
899 mutex_lock(&root->fs_info->trans_mutex); 996 mutex_lock(&root->fs_info->trans_mutex);
900 if (trans->transaction->in_commit) { 997 INIT_LIST_HEAD(&dirty_fs_roots);
901 cur_trans = trans->transaction; 998 if (cur_trans->in_commit) {
902 trans->transaction->use_count++; 999 cur_trans->use_count++;
903 mutex_unlock(&root->fs_info->trans_mutex); 1000 mutex_unlock(&root->fs_info->trans_mutex);
904 btrfs_end_transaction(trans, root); 1001 btrfs_end_transaction(trans, root);
905 1002
@@ -922,7 +1019,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
922 1019
923 trans->transaction->in_commit = 1; 1020 trans->transaction->in_commit = 1;
924 trans->transaction->blocked = 1; 1021 trans->transaction->blocked = 1;
925 cur_trans = trans->transaction;
926 if (cur_trans->list.prev != &root->fs_info->trans_list) { 1022 if (cur_trans->list.prev != &root->fs_info->trans_list) {
927 prev_trans = list_entry(cur_trans->list.prev, 1023 prev_trans = list_entry(cur_trans->list.prev,
928 struct btrfs_transaction, list); 1024 struct btrfs_transaction, list);
@@ -937,6 +1033,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
937 } 1033 }
938 } 1034 }
939 1035
1036 if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
1037 should_grow = 1;
1038
940 do { 1039 do {
941 int snap_pending = 0; 1040 int snap_pending = 0;
942 joined = cur_trans->num_joined; 1041 joined = cur_trans->num_joined;
@@ -949,7 +1048,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
949 1048
950 if (cur_trans->num_writers > 1) 1049 if (cur_trans->num_writers > 1)
951 timeout = MAX_SCHEDULE_TIMEOUT; 1050 timeout = MAX_SCHEDULE_TIMEOUT;
952 else 1051 else if (should_grow)
953 timeout = 1; 1052 timeout = 1;
954 1053
955 mutex_unlock(&root->fs_info->trans_mutex); 1054 mutex_unlock(&root->fs_info->trans_mutex);
@@ -959,16 +1058,30 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
959 BUG_ON(ret); 1058 BUG_ON(ret);
960 } 1059 }
961 1060
962 schedule_timeout(timeout); 1061 /*
1062 * rename don't use btrfs_join_transaction, so, once we
1063 * set the transaction to blocked above, we aren't going
1064 * to get any new ordered operations. We can safely run
1065 * it here and no for sure that nothing new will be added
1066 * to the list
1067 */
1068 btrfs_run_ordered_operations(root, 1);
1069
1070 smp_mb();
1071 if (cur_trans->num_writers > 1 || should_grow)
1072 schedule_timeout(timeout);
963 1073
964 mutex_lock(&root->fs_info->trans_mutex); 1074 mutex_lock(&root->fs_info->trans_mutex);
965 finish_wait(&cur_trans->writer_wait, &wait); 1075 finish_wait(&cur_trans->writer_wait, &wait);
966 } while (cur_trans->num_writers > 1 || 1076 } while (cur_trans->num_writers > 1 ||
967 (cur_trans->num_joined != joined)); 1077 (should_grow && cur_trans->num_joined != joined));
968 1078
969 ret = create_pending_snapshots(trans, root->fs_info); 1079 ret = create_pending_snapshots(trans, root->fs_info);
970 BUG_ON(ret); 1080 BUG_ON(ret);
971 1081
1082 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1083 BUG_ON(ret);
1084
972 WARN_ON(cur_trans != trans->transaction); 1085 WARN_ON(cur_trans != trans->transaction);
973 1086
974 /* btrfs_commit_tree_roots is responsible for getting the 1087 /* btrfs_commit_tree_roots is responsible for getting the
@@ -1032,6 +1145,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1032 btrfs_copy_pinned(root, pinned_copy); 1145 btrfs_copy_pinned(root, pinned_copy);
1033 1146
1034 trans->transaction->blocked = 0; 1147 trans->transaction->blocked = 0;
1148
1035 wake_up(&root->fs_info->transaction_throttle); 1149 wake_up(&root->fs_info->transaction_throttle);
1036 wake_up(&root->fs_info->transaction_wait); 1150 wake_up(&root->fs_info->transaction_wait);
1037 1151
@@ -1058,6 +1172,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1058 mutex_lock(&root->fs_info->trans_mutex); 1172 mutex_lock(&root->fs_info->trans_mutex);
1059 1173
1060 cur_trans->commit_done = 1; 1174 cur_trans->commit_done = 1;
1175
1061 root->fs_info->last_trans_committed = cur_trans->transid; 1176 root->fs_info->last_trans_committed = cur_trans->transid;
1062 wake_up(&cur_trans->commit_wait); 1177 wake_up(&cur_trans->commit_wait);
1063 1178
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ea292117f88..94f5bde2b58 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -19,10 +19,16 @@
19#ifndef __BTRFS_TRANSACTION__ 19#ifndef __BTRFS_TRANSACTION__
20#define __BTRFS_TRANSACTION__ 20#define __BTRFS_TRANSACTION__
21#include "btrfs_inode.h" 21#include "btrfs_inode.h"
22#include "delayed-ref.h"
22 23
23struct btrfs_transaction { 24struct btrfs_transaction {
24 u64 transid; 25 u64 transid;
26 /*
27 * total writers in this transaction, it must be zero before the
28 * transaction can end
29 */
25 unsigned long num_writers; 30 unsigned long num_writers;
31
26 unsigned long num_joined; 32 unsigned long num_joined;
27 int in_commit; 33 int in_commit;
28 int use_count; 34 int use_count;
@@ -34,6 +40,7 @@ struct btrfs_transaction {
34 wait_queue_head_t writer_wait; 40 wait_queue_head_t writer_wait;
35 wait_queue_head_t commit_wait; 41 wait_queue_head_t commit_wait;
36 struct list_head pending_snapshots; 42 struct list_head pending_snapshots;
43 struct btrfs_delayed_ref_root delayed_refs;
37}; 44};
38 45
39struct btrfs_trans_handle { 46struct btrfs_trans_handle {
@@ -44,6 +51,7 @@ struct btrfs_trans_handle {
44 u64 block_group; 51 u64 block_group;
45 u64 alloc_exclude_start; 52 u64 alloc_exclude_start;
46 u64 alloc_exclude_nr; 53 u64 alloc_exclude_nr;
54 unsigned long delayed_ref_updates;
47}; 55};
48 56
49struct btrfs_pending_snapshot { 57struct btrfs_pending_snapshot {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 98d25fa4570..b10eacdb162 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -124,8 +124,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
124 } 124 }
125 125
126 btrfs_release_path(root, path); 126 btrfs_release_path(root, path);
127 if (is_extent)
128 btrfs_extent_post_op(trans, root);
129out: 127out:
130 if (path) 128 if (path)
131 btrfs_free_path(path); 129 btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9c462fbd60f..fc9b87a7975 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -35,6 +35,49 @@
35#define LOG_INODE_EXISTS 1 35#define LOG_INODE_EXISTS 1
36 36
37/* 37/*
38 * directory trouble cases
39 *
40 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
41 * log, we must force a full commit before doing an fsync of the directory
42 * where the unlink was done.
43 * ---> record transid of last unlink/rename per directory
44 *
45 * mkdir foo/some_dir
46 * normal commit
47 * rename foo/some_dir foo2/some_dir
48 * mkdir foo/some_dir
49 * fsync foo/some_dir/some_file
50 *
51 * The fsync above will unlink the original some_dir without recording
52 * it in its new location (foo2). After a crash, some_dir will be gone
53 * unless the fsync of some_file forces a full commit
54 *
55 * 2) we must log any new names for any file or dir that is in the fsync
56 * log. ---> check inode while renaming/linking.
57 *
58 * 2a) we must log any new names for any file or dir during rename
59 * when the directory they are being removed from was logged.
60 * ---> check inode and old parent dir during rename
61 *
62 * 2a is actually the more important variant. With the extra logging
63 * a crash might unlink the old name without recreating the new one
64 *
65 * 3) after a crash, we must go through any directories with a link count
66 * of zero and redo the rm -rf
67 *
68 * mkdir f1/foo
69 * normal commit
70 * rm -rf f1/foo
71 * fsync(f1)
72 *
73 * The directory f1 was fully removed from the FS, but fsync was never
74 * called on f1, only its parent dir. After a crash the rm -rf must
75 * be replayed. This must be able to recurse down the entire
76 * directory tree. The inode link count fixup code takes care of the
77 * ugly details.
78 */
79
80/*
38 * stages for the tree walking. The first 81 * stages for the tree walking. The first
39 * stage (0) is to only pin down the blocks we find 82 * stage (0) is to only pin down the blocks we find
40 * the second stage (1) is to make sure that all the inodes 83 * the second stage (1) is to make sure that all the inodes
@@ -47,12 +90,17 @@
47#define LOG_WALK_REPLAY_INODES 1 90#define LOG_WALK_REPLAY_INODES 1
48#define LOG_WALK_REPLAY_ALL 2 91#define LOG_WALK_REPLAY_ALL 2
49 92
50static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 93static int btrfs_log_inode(struct btrfs_trans_handle *trans,
51 struct btrfs_root *root, struct inode *inode, 94 struct btrfs_root *root, struct inode *inode,
52 int inode_only); 95 int inode_only);
53static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 96static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root, 97 struct btrfs_root *root,
55 struct btrfs_path *path, u64 objectid); 98 struct btrfs_path *path, u64 objectid);
99static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
100 struct btrfs_root *root,
101 struct btrfs_root *log,
102 struct btrfs_path *path,
103 u64 dirid, int del_all);
56 104
57/* 105/*
58 * tree logging is a special write ahead log used to make sure that 106 * tree logging is a special write ahead log used to make sure that
@@ -133,10 +181,25 @@ static int join_running_log_trans(struct btrfs_root *root)
133} 181}
134 182
135/* 183/*
184 * This either makes the current running log transaction wait
185 * until you call btrfs_end_log_trans() or it makes any future
186 * log transactions wait until you call btrfs_end_log_trans()
187 */
188int btrfs_pin_log_trans(struct btrfs_root *root)
189{
190 int ret = -ENOENT;
191
192 mutex_lock(&root->log_mutex);
193 atomic_inc(&root->log_writers);
194 mutex_unlock(&root->log_mutex);
195 return ret;
196}
197
198/*
136 * indicate we're done making changes to the log tree 199 * indicate we're done making changes to the log tree
137 * and wake up anyone waiting to do a sync 200 * and wake up anyone waiting to do a sync
138 */ 201 */
139static int end_log_trans(struct btrfs_root *root) 202int btrfs_end_log_trans(struct btrfs_root *root)
140{ 203{
141 if (atomic_dec_and_test(&root->log_writers)) { 204 if (atomic_dec_and_test(&root->log_writers)) {
142 smp_mb(); 205 smp_mb();
@@ -203,7 +266,6 @@ static int process_one_buffer(struct btrfs_root *log,
203 mutex_lock(&log->fs_info->pinned_mutex); 266 mutex_lock(&log->fs_info->pinned_mutex);
204 btrfs_update_pinned_extents(log->fs_info->extent_root, 267 btrfs_update_pinned_extents(log->fs_info->extent_root,
205 eb->start, eb->len, 1); 268 eb->start, eb->len, 1);
206 mutex_unlock(&log->fs_info->pinned_mutex);
207 } 269 }
208 270
209 if (btrfs_buffer_uptodate(eb, gen)) { 271 if (btrfs_buffer_uptodate(eb, gen)) {
@@ -603,6 +665,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
603 665
604 ret = link_to_fixup_dir(trans, root, path, location.objectid); 666 ret = link_to_fixup_dir(trans, root, path, location.objectid);
605 BUG_ON(ret); 667 BUG_ON(ret);
668
606 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 669 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
607 BUG_ON(ret); 670 BUG_ON(ret);
608 kfree(name); 671 kfree(name);
@@ -804,6 +867,7 @@ conflict_again:
804 victim_name_len)) { 867 victim_name_len)) {
805 btrfs_inc_nlink(inode); 868 btrfs_inc_nlink(inode);
806 btrfs_release_path(root, path); 869 btrfs_release_path(root, path);
870
807 ret = btrfs_unlink_inode(trans, root, dir, 871 ret = btrfs_unlink_inode(trans, root, dir,
808 inode, victim_name, 872 inode, victim_name,
809 victim_name_len); 873 victim_name_len);
@@ -922,13 +986,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
922 key.offset--; 986 key.offset--;
923 btrfs_release_path(root, path); 987 btrfs_release_path(root, path);
924 } 988 }
925 btrfs_free_path(path); 989 btrfs_release_path(root, path);
926 if (nlink != inode->i_nlink) { 990 if (nlink != inode->i_nlink) {
927 inode->i_nlink = nlink; 991 inode->i_nlink = nlink;
928 btrfs_update_inode(trans, root, inode); 992 btrfs_update_inode(trans, root, inode);
929 } 993 }
930 BTRFS_I(inode)->index_cnt = (u64)-1; 994 BTRFS_I(inode)->index_cnt = (u64)-1;
931 995
996 if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
997 ret = replay_dir_deletes(trans, root, NULL, path,
998 inode->i_ino, 1);
999 BUG_ON(ret);
1000 }
1001 btrfs_free_path(path);
1002
932 return 0; 1003 return 0;
933} 1004}
934 1005
@@ -971,9 +1042,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
971 1042
972 iput(inode); 1043 iput(inode);
973 1044
974 if (key.offset == 0) 1045 /*
975 break; 1046 * fixup on a directory may create new entries,
976 key.offset--; 1047 * make sure we always look for the highset possible
1048 * offset
1049 */
1050 key.offset = (u64)-1;
977 } 1051 }
978 btrfs_release_path(root, path); 1052 btrfs_release_path(root, path);
979 return 0; 1053 return 0;
@@ -1313,11 +1387,11 @@ again:
1313 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1387 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1314 name_len); 1388 name_len);
1315 log_di = NULL; 1389 log_di = NULL;
1316 if (dir_key->type == BTRFS_DIR_ITEM_KEY) { 1390 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1317 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1391 log_di = btrfs_lookup_dir_item(trans, log, log_path,
1318 dir_key->objectid, 1392 dir_key->objectid,
1319 name, name_len, 0); 1393 name, name_len, 0);
1320 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { 1394 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1321 log_di = btrfs_lookup_dir_index_item(trans, log, 1395 log_di = btrfs_lookup_dir_index_item(trans, log,
1322 log_path, 1396 log_path,
1323 dir_key->objectid, 1397 dir_key->objectid,
@@ -1378,7 +1452,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1378 struct btrfs_root *root, 1452 struct btrfs_root *root,
1379 struct btrfs_root *log, 1453 struct btrfs_root *log,
1380 struct btrfs_path *path, 1454 struct btrfs_path *path,
1381 u64 dirid) 1455 u64 dirid, int del_all)
1382{ 1456{
1383 u64 range_start; 1457 u64 range_start;
1384 u64 range_end; 1458 u64 range_end;
@@ -1408,10 +1482,14 @@ again:
1408 range_start = 0; 1482 range_start = 0;
1409 range_end = 0; 1483 range_end = 0;
1410 while (1) { 1484 while (1) {
1411 ret = find_dir_range(log, path, dirid, key_type, 1485 if (del_all)
1412 &range_start, &range_end); 1486 range_end = (u64)-1;
1413 if (ret != 0) 1487 else {
1414 break; 1488 ret = find_dir_range(log, path, dirid, key_type,
1489 &range_start, &range_end);
1490 if (ret != 0)
1491 break;
1492 }
1415 1493
1416 dir_key.offset = range_start; 1494 dir_key.offset = range_start;
1417 while (1) { 1495 while (1) {
@@ -1437,7 +1515,8 @@ again:
1437 break; 1515 break;
1438 1516
1439 ret = check_item_in_log(trans, root, log, path, 1517 ret = check_item_in_log(trans, root, log, path,
1440 log_path, dir, &found_key); 1518 log_path, dir,
1519 &found_key);
1441 BUG_ON(ret); 1520 BUG_ON(ret);
1442 if (found_key.offset == (u64)-1) 1521 if (found_key.offset == (u64)-1)
1443 break; 1522 break;
@@ -1514,7 +1593,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1514 mode = btrfs_inode_mode(eb, inode_item); 1593 mode = btrfs_inode_mode(eb, inode_item);
1515 if (S_ISDIR(mode)) { 1594 if (S_ISDIR(mode)) {
1516 ret = replay_dir_deletes(wc->trans, 1595 ret = replay_dir_deletes(wc->trans,
1517 root, log, path, key.objectid); 1596 root, log, path, key.objectid, 0);
1518 BUG_ON(ret); 1597 BUG_ON(ret);
1519 } 1598 }
1520 ret = overwrite_item(wc->trans, root, path, 1599 ret = overwrite_item(wc->trans, root, path,
@@ -1533,6 +1612,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1533 root, inode, inode->i_size, 1612 root, inode, inode->i_size,
1534 BTRFS_EXTENT_DATA_KEY); 1613 BTRFS_EXTENT_DATA_KEY);
1535 BUG_ON(ret); 1614 BUG_ON(ret);
1615
1616 /* if the nlink count is zero here, the iput
1617 * will free the inode. We bump it to make
1618 * sure it doesn't get freed until the link
1619 * count fixup is done
1620 */
1621 if (inode->i_nlink == 0) {
1622 btrfs_inc_nlink(inode);
1623 btrfs_update_inode(wc->trans,
1624 root, inode);
1625 }
1536 iput(inode); 1626 iput(inode);
1537 } 1627 }
1538 ret = link_to_fixup_dir(wc->trans, root, 1628 ret = link_to_fixup_dir(wc->trans, root,
@@ -1840,7 +1930,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
1840 return ret; 1930 return ret;
1841} 1931}
1842 1932
1843static int wait_log_commit(struct btrfs_root *root, unsigned long transid) 1933static int wait_log_commit(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *root, unsigned long transid)
1844{ 1935{
1845 DEFINE_WAIT(wait); 1936 DEFINE_WAIT(wait);
1846 int index = transid % 2; 1937 int index = transid % 2;
@@ -1854,9 +1945,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1854 prepare_to_wait(&root->log_commit_wait[index], 1945 prepare_to_wait(&root->log_commit_wait[index],
1855 &wait, TASK_UNINTERRUPTIBLE); 1946 &wait, TASK_UNINTERRUPTIBLE);
1856 mutex_unlock(&root->log_mutex); 1947 mutex_unlock(&root->log_mutex);
1857 if (root->log_transid < transid + 2 && 1948
1949 if (root->fs_info->last_trans_log_full_commit !=
1950 trans->transid && root->log_transid < transid + 2 &&
1858 atomic_read(&root->log_commit[index])) 1951 atomic_read(&root->log_commit[index]))
1859 schedule(); 1952 schedule();
1953
1860 finish_wait(&root->log_commit_wait[index], &wait); 1954 finish_wait(&root->log_commit_wait[index], &wait);
1861 mutex_lock(&root->log_mutex); 1955 mutex_lock(&root->log_mutex);
1862 } while (root->log_transid < transid + 2 && 1956 } while (root->log_transid < transid + 2 &&
@@ -1864,14 +1958,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1864 return 0; 1958 return 0;
1865} 1959}
1866 1960
1867static int wait_for_writer(struct btrfs_root *root) 1961static int wait_for_writer(struct btrfs_trans_handle *trans,
1962 struct btrfs_root *root)
1868{ 1963{
1869 DEFINE_WAIT(wait); 1964 DEFINE_WAIT(wait);
1870 while (atomic_read(&root->log_writers)) { 1965 while (atomic_read(&root->log_writers)) {
1871 prepare_to_wait(&root->log_writer_wait, 1966 prepare_to_wait(&root->log_writer_wait,
1872 &wait, TASK_UNINTERRUPTIBLE); 1967 &wait, TASK_UNINTERRUPTIBLE);
1873 mutex_unlock(&root->log_mutex); 1968 mutex_unlock(&root->log_mutex);
1874 if (atomic_read(&root->log_writers)) 1969 if (root->fs_info->last_trans_log_full_commit !=
1970 trans->transid && atomic_read(&root->log_writers))
1875 schedule(); 1971 schedule();
1876 mutex_lock(&root->log_mutex); 1972 mutex_lock(&root->log_mutex);
1877 finish_wait(&root->log_writer_wait, &wait); 1973 finish_wait(&root->log_writer_wait, &wait);
@@ -1882,7 +1978,14 @@ static int wait_for_writer(struct btrfs_root *root)
1882/* 1978/*
1883 * btrfs_sync_log does sends a given tree log down to the disk and 1979 * btrfs_sync_log does sends a given tree log down to the disk and
1884 * updates the super blocks to record it. When this call is done, 1980 * updates the super blocks to record it. When this call is done,
1885 * you know that any inodes previously logged are safely on disk 1981 * you know that any inodes previously logged are safely on disk only
1982 * if it returns 0.
1983 *
1984 * Any other return value means you need to call btrfs_commit_transaction.
1985 * Some of the edge cases for fsyncing directories that have had unlinks
1986 * or renames done in the past mean that sometimes the only safe
1987 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
1988 * that has happened.
1886 */ 1989 */
1887int btrfs_sync_log(struct btrfs_trans_handle *trans, 1990int btrfs_sync_log(struct btrfs_trans_handle *trans,
1888 struct btrfs_root *root) 1991 struct btrfs_root *root)
@@ -1896,7 +1999,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1896 mutex_lock(&root->log_mutex); 1999 mutex_lock(&root->log_mutex);
1897 index1 = root->log_transid % 2; 2000 index1 = root->log_transid % 2;
1898 if (atomic_read(&root->log_commit[index1])) { 2001 if (atomic_read(&root->log_commit[index1])) {
1899 wait_log_commit(root, root->log_transid); 2002 wait_log_commit(trans, root, root->log_transid);
1900 mutex_unlock(&root->log_mutex); 2003 mutex_unlock(&root->log_mutex);
1901 return 0; 2004 return 0;
1902 } 2005 }
@@ -1904,18 +2007,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1904 2007
1905 /* wait for previous tree log sync to complete */ 2008 /* wait for previous tree log sync to complete */
1906 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2009 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
1907 wait_log_commit(root, root->log_transid - 1); 2010 wait_log_commit(trans, root, root->log_transid - 1);
1908 2011
1909 while (1) { 2012 while (1) {
1910 unsigned long batch = root->log_batch; 2013 unsigned long batch = root->log_batch;
1911 mutex_unlock(&root->log_mutex); 2014 mutex_unlock(&root->log_mutex);
1912 schedule_timeout_uninterruptible(1); 2015 schedule_timeout_uninterruptible(1);
1913 mutex_lock(&root->log_mutex); 2016 mutex_lock(&root->log_mutex);
1914 wait_for_writer(root); 2017
2018 wait_for_writer(trans, root);
1915 if (batch == root->log_batch) 2019 if (batch == root->log_batch)
1916 break; 2020 break;
1917 } 2021 }
1918 2022
2023 /* bail out if we need to do a full commit */
2024 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2025 ret = -EAGAIN;
2026 mutex_unlock(&root->log_mutex);
2027 goto out;
2028 }
2029
1919 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2030 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1920 BUG_ON(ret); 2031 BUG_ON(ret);
1921 2032
@@ -1951,16 +2062,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1951 2062
1952 index2 = log_root_tree->log_transid % 2; 2063 index2 = log_root_tree->log_transid % 2;
1953 if (atomic_read(&log_root_tree->log_commit[index2])) { 2064 if (atomic_read(&log_root_tree->log_commit[index2])) {
1954 wait_log_commit(log_root_tree, log_root_tree->log_transid); 2065 wait_log_commit(trans, log_root_tree,
2066 log_root_tree->log_transid);
1955 mutex_unlock(&log_root_tree->log_mutex); 2067 mutex_unlock(&log_root_tree->log_mutex);
1956 goto out; 2068 goto out;
1957 } 2069 }
1958 atomic_set(&log_root_tree->log_commit[index2], 1); 2070 atomic_set(&log_root_tree->log_commit[index2], 1);
1959 2071
1960 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) 2072 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
1961 wait_log_commit(log_root_tree, log_root_tree->log_transid - 1); 2073 wait_log_commit(trans, log_root_tree,
2074 log_root_tree->log_transid - 1);
2075 }
2076
2077 wait_for_writer(trans, log_root_tree);
1962 2078
1963 wait_for_writer(log_root_tree); 2079 /*
2080 * now that we've moved on to the tree of log tree roots,
2081 * check the full commit flag again
2082 */
2083 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2084 mutex_unlock(&log_root_tree->log_mutex);
2085 ret = -EAGAIN;
2086 goto out_wake_log_root;
2087 }
1964 2088
1965 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2089 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
1966 &log_root_tree->dirty_log_pages); 2090 &log_root_tree->dirty_log_pages);
@@ -1985,7 +2109,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1985 * in and cause problems either. 2109 * in and cause problems either.
1986 */ 2110 */
1987 write_ctree_super(trans, root->fs_info->tree_root, 2); 2111 write_ctree_super(trans, root->fs_info->tree_root, 2);
2112 ret = 0;
1988 2113
2114out_wake_log_root:
1989 atomic_set(&log_root_tree->log_commit[index2], 0); 2115 atomic_set(&log_root_tree->log_commit[index2], 0);
1990 smp_mb(); 2116 smp_mb();
1991 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2117 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
@@ -1998,7 +2124,8 @@ out:
1998 return 0; 2124 return 0;
1999} 2125}
2000 2126
2001/* * free all the extents used by the tree log. This should be called 2127/*
2128 * free all the extents used by the tree log. This should be called
2002 * at commit time of the full transaction 2129 * at commit time of the full transaction
2003 */ 2130 */
2004int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2131int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
@@ -2132,7 +2259,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2132 2259
2133 btrfs_free_path(path); 2260 btrfs_free_path(path);
2134 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2261 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2135 end_log_trans(root); 2262 btrfs_end_log_trans(root);
2136 2263
2137 return 0; 2264 return 0;
2138} 2265}
@@ -2159,7 +2286,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2159 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2286 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2160 dirid, &index); 2287 dirid, &index);
2161 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2288 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2162 end_log_trans(root); 2289 btrfs_end_log_trans(root);
2163 2290
2164 return ret; 2291 return ret;
2165} 2292}
@@ -2559,7 +2686,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2559 * 2686 *
2560 * This handles both files and directories. 2687 * This handles both files and directories.
2561 */ 2688 */
2562static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 2689static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2563 struct btrfs_root *root, struct inode *inode, 2690 struct btrfs_root *root, struct inode *inode,
2564 int inode_only) 2691 int inode_only)
2565{ 2692{
@@ -2585,28 +2712,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2585 min_key.offset = 0; 2712 min_key.offset = 0;
2586 2713
2587 max_key.objectid = inode->i_ino; 2714 max_key.objectid = inode->i_ino;
2715
2716 /* today the code can only do partial logging of directories */
2717 if (!S_ISDIR(inode->i_mode))
2718 inode_only = LOG_INODE_ALL;
2719
2588 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2720 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2589 max_key.type = BTRFS_XATTR_ITEM_KEY; 2721 max_key.type = BTRFS_XATTR_ITEM_KEY;
2590 else 2722 else
2591 max_key.type = (u8)-1; 2723 max_key.type = (u8)-1;
2592 max_key.offset = (u64)-1; 2724 max_key.offset = (u64)-1;
2593 2725
2594 /*
2595 * if this inode has already been logged and we're in inode_only
2596 * mode, we don't want to delete the things that have already
2597 * been written to the log.
2598 *
2599 * But, if the inode has been through an inode_only log,
2600 * the logged_trans field is not set. This allows us to catch
2601 * any new names for this inode in the backrefs by logging it
2602 * again
2603 */
2604 if (inode_only == LOG_INODE_EXISTS &&
2605 BTRFS_I(inode)->logged_trans == trans->transid) {
2606 btrfs_free_path(path);
2607 btrfs_free_path(dst_path);
2608 goto out;
2609 }
2610 mutex_lock(&BTRFS_I(inode)->log_mutex); 2726 mutex_lock(&BTRFS_I(inode)->log_mutex);
2611 2727
2612 /* 2728 /*
@@ -2693,7 +2809,6 @@ next_slot:
2693 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2809 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2694 btrfs_release_path(root, path); 2810 btrfs_release_path(root, path);
2695 btrfs_release_path(log, dst_path); 2811 btrfs_release_path(log, dst_path);
2696 BTRFS_I(inode)->log_dirty_trans = 0;
2697 ret = log_directory_changes(trans, root, inode, path, dst_path); 2812 ret = log_directory_changes(trans, root, inode, path, dst_path);
2698 BUG_ON(ret); 2813 BUG_ON(ret);
2699 } 2814 }
@@ -2702,19 +2817,69 @@ next_slot:
2702 2817
2703 btrfs_free_path(path); 2818 btrfs_free_path(path);
2704 btrfs_free_path(dst_path); 2819 btrfs_free_path(dst_path);
2705out:
2706 return 0; 2820 return 0;
2707} 2821}
2708 2822
2709int btrfs_log_inode(struct btrfs_trans_handle *trans, 2823/*
2710 struct btrfs_root *root, struct inode *inode, 2824 * follow the dentry parent pointers up the chain and see if any
2711 int inode_only) 2825 * of the directories in it require a full commit before they can
2826 * be logged. Returns zero if nothing special needs to be done or 1 if
2827 * a full commit is required.
2828 */
2829static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2830 struct inode *inode,
2831 struct dentry *parent,
2832 struct super_block *sb,
2833 u64 last_committed)
2712{ 2834{
2713 int ret; 2835 int ret = 0;
2836 struct btrfs_root *root;
2714 2837
2715 start_log_trans(trans, root); 2838 /*
2716 ret = __btrfs_log_inode(trans, root, inode, inode_only); 2839 * for regular files, if its inode is already on disk, we don't
2717 end_log_trans(root); 2840 * have to worry about the parents at all. This is because
2841 * we can use the last_unlink_trans field to record renames
2842 * and other fun in this file.
2843 */
2844 if (S_ISREG(inode->i_mode) &&
2845 BTRFS_I(inode)->generation <= last_committed &&
2846 BTRFS_I(inode)->last_unlink_trans <= last_committed)
2847 goto out;
2848
2849 if (!S_ISDIR(inode->i_mode)) {
2850 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2851 goto out;
2852 inode = parent->d_inode;
2853 }
2854
2855 while (1) {
2856 BTRFS_I(inode)->logged_trans = trans->transid;
2857 smp_mb();
2858
2859 if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
2860 root = BTRFS_I(inode)->root;
2861
2862 /*
2863 * make sure any commits to the log are forced
2864 * to be full commits
2865 */
2866 root->fs_info->last_trans_log_full_commit =
2867 trans->transid;
2868 ret = 1;
2869 break;
2870 }
2871
2872 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2873 break;
2874
2875 if (parent == sb->s_root)
2876 break;
2877
2878 parent = parent->d_parent;
2879 inode = parent->d_inode;
2880
2881 }
2882out:
2718 return ret; 2883 return ret;
2719} 2884}
2720 2885
@@ -2724,31 +2889,65 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans,
2724 * only logging is done of any parent directories that are older than 2889 * only logging is done of any parent directories that are older than
2725 * the last committed transaction 2890 * the last committed transaction
2726 */ 2891 */
2727int btrfs_log_dentry(struct btrfs_trans_handle *trans, 2892int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2728 struct btrfs_root *root, struct dentry *dentry) 2893 struct btrfs_root *root, struct inode *inode,
2894 struct dentry *parent, int exists_only)
2729{ 2895{
2730 int inode_only = LOG_INODE_ALL; 2896 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
2731 struct super_block *sb; 2897 struct super_block *sb;
2732 int ret; 2898 int ret = 0;
2899 u64 last_committed = root->fs_info->last_trans_committed;
2900
2901 sb = inode->i_sb;
2902
2903 if (root->fs_info->last_trans_log_full_commit >
2904 root->fs_info->last_trans_committed) {
2905 ret = 1;
2906 goto end_no_trans;
2907 }
2908
2909 ret = check_parent_dirs_for_sync(trans, inode, parent,
2910 sb, last_committed);
2911 if (ret)
2912 goto end_no_trans;
2733 2913
2734 start_log_trans(trans, root); 2914 start_log_trans(trans, root);
2735 sb = dentry->d_inode->i_sb;
2736 while (1) {
2737 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2738 inode_only);
2739 BUG_ON(ret);
2740 inode_only = LOG_INODE_EXISTS;
2741 2915
2742 dentry = dentry->d_parent; 2916 ret = btrfs_log_inode(trans, root, inode, inode_only);
2743 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) 2917 BUG_ON(ret);
2918
2919 /*
2920 * for regular files, if its inode is already on disk, we don't
2921 * have to worry about the parents at all. This is because
2922 * we can use the last_unlink_trans field to record renames
2923 * and other fun in this file.
2924 */
2925 if (S_ISREG(inode->i_mode) &&
2926 BTRFS_I(inode)->generation <= last_committed &&
2927 BTRFS_I(inode)->last_unlink_trans <= last_committed)
2928 goto no_parent;
2929
2930 inode_only = LOG_INODE_EXISTS;
2931 while (1) {
2932 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2744 break; 2933 break;
2745 2934
2746 if (BTRFS_I(dentry->d_inode)->generation <= 2935 inode = parent->d_inode;
2747 root->fs_info->last_trans_committed) 2936 if (BTRFS_I(inode)->generation >
2937 root->fs_info->last_trans_committed) {
2938 ret = btrfs_log_inode(trans, root, inode, inode_only);
2939 BUG_ON(ret);
2940 }
2941 if (parent == sb->s_root)
2748 break; 2942 break;
2943
2944 parent = parent->d_parent;
2749 } 2945 }
2750 end_log_trans(root); 2946no_parent:
2751 return 0; 2947 ret = 0;
2948 btrfs_end_log_trans(root);
2949end_no_trans:
2950 return ret;
2752} 2951}
2753 2952
2754/* 2953/*
@@ -2760,12 +2959,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2760int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 2959int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2761 struct btrfs_root *root, struct dentry *dentry) 2960 struct btrfs_root *root, struct dentry *dentry)
2762{ 2961{
2763 u64 gen; 2962 return btrfs_log_inode_parent(trans, root, dentry->d_inode,
2764 gen = root->fs_info->last_trans_new_blockgroup; 2963 dentry->d_parent, 0);
2765 if (gen > root->fs_info->last_trans_committed)
2766 return 1;
2767 else
2768 return btrfs_log_dentry(trans, root, dentry);
2769} 2964}
2770 2965
2771/* 2966/*
@@ -2884,3 +3079,94 @@ again:
2884 kfree(log_root_tree); 3079 kfree(log_root_tree);
2885 return 0; 3080 return 0;
2886} 3081}
3082
3083/*
3084 * there are some corner cases where we want to force a full
3085 * commit instead of allowing a directory to be logged.
3086 *
3087 * They revolve around files there were unlinked from the directory, and
3088 * this function updates the parent directory so that a full commit is
3089 * properly done if it is fsync'd later after the unlinks are done.
3090 */
3091void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
3092 struct inode *dir, struct inode *inode,
3093 int for_rename)
3094{
3095 /*
3096 * when we're logging a file, if it hasn't been renamed
3097 * or unlinked, and its inode is fully committed on disk,
3098 * we don't have to worry about walking up the directory chain
3099 * to log its parents.
3100 *
3101 * So, we use the last_unlink_trans field to put this transid
3102 * into the file. When the file is logged we check it and
3103 * don't log the parents if the file is fully on disk.
3104 */
3105 if (S_ISREG(inode->i_mode))
3106 BTRFS_I(inode)->last_unlink_trans = trans->transid;
3107
3108 /*
3109 * if this directory was already logged any new
3110 * names for this file/dir will get recorded
3111 */
3112 smp_mb();
3113 if (BTRFS_I(dir)->logged_trans == trans->transid)
3114 return;
3115
3116 /*
3117 * if the inode we're about to unlink was logged,
3118 * the log will be properly updated for any new names
3119 */
3120 if (BTRFS_I(inode)->logged_trans == trans->transid)
3121 return;
3122
3123 /*
3124 * when renaming files across directories, if the directory
3125 * there we're unlinking from gets fsync'd later on, there's
3126 * no way to find the destination directory later and fsync it
3127 * properly. So, we have to be conservative and force commits
3128 * so the new name gets discovered.
3129 */
3130 if (for_rename)
3131 goto record;
3132
3133 /* we can safely do the unlink without any special recording */
3134 return;
3135
3136record:
3137 BTRFS_I(dir)->last_unlink_trans = trans->transid;
3138}
3139
3140/*
3141 * Call this after adding a new name for a file and it will properly
3142 * update the log to reflect the new name.
3143 *
3144 * It will return zero if all goes well, and it will return 1 if a
3145 * full transaction commit is required.
3146 */
3147int btrfs_log_new_name(struct btrfs_trans_handle *trans,
3148 struct inode *inode, struct inode *old_dir,
3149 struct dentry *parent)
3150{
3151 struct btrfs_root * root = BTRFS_I(inode)->root;
3152
3153 /*
3154 * this will force the logging code to walk the dentry chain
3155 * up for the file
3156 */
3157 if (S_ISREG(inode->i_mode))
3158 BTRFS_I(inode)->last_unlink_trans = trans->transid;
3159
3160 /*
3161 * if this inode hasn't been logged and directory we're renaming it
3162 * from hasn't been logged, we don't need to log it
3163 */
3164 if (BTRFS_I(inode)->logged_trans <=
3165 root->fs_info->last_trans_committed &&
3166 (!old_dir || BTRFS_I(old_dir)->logged_trans <=
3167 root->fs_info->last_trans_committed))
3168 return 0;
3169
3170 return btrfs_log_inode_parent(trans, root, inode, parent, 1);
3171}
3172
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index b9409b32ed0..d09c7609e16 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,9 @@
22int btrfs_sync_log(struct btrfs_trans_handle *trans, 22int btrfs_sync_log(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root); 23 struct btrfs_root *root);
24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
25int btrfs_log_dentry(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root, struct dentry *dentry);
27int btrfs_recover_log_trees(struct btrfs_root *tree_root); 25int btrfs_recover_log_trees(struct btrfs_root *tree_root);
28int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 26int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
29 struct btrfs_root *root, struct dentry *dentry); 27 struct btrfs_root *root, struct dentry *dentry);
30int btrfs_log_inode(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, struct inode *inode,
32 int inode_only);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 28int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root, 29 struct btrfs_root *root,
35 const char *name, int name_len, 30 const char *name, int name_len,
@@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root, 33 struct btrfs_root *root,
39 const char *name, int name_len, 34 const char *name, int name_len,
40 struct inode *inode, u64 dirid); 35 struct inode *inode, u64 dirid);
36int btrfs_join_running_log_trans(struct btrfs_root *root);
37int btrfs_end_log_trans(struct btrfs_root *root);
38int btrfs_pin_log_trans(struct btrfs_root *root);
39int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
40 struct btrfs_root *root, struct inode *inode,
41 struct dentry *parent, int exists_only);
42void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
43 struct inode *dir, struct inode *inode,
44 int for_rename);
45int btrfs_log_new_name(struct btrfs_trans_handle *trans,
46 struct inode *inode, struct inode *old_dir,
47 struct dentry *parent);
41#endif 48#endif
diff --git a/fs/buffer.c b/fs/buffer.c
index a2fd743d97c..f5f8b15a6e4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -290,7 +290,7 @@ static void free_more_memory(void)
290 &zone); 290 &zone);
291 if (zone) 291 if (zone)
292 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, 292 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
293 GFP_NOFS); 293 GFP_NOFS, NULL);
294 } 294 }
295} 295}
296 296
@@ -547,6 +547,39 @@ repeat:
547 return err; 547 return err;
548} 548}
549 549
550void do_thaw_all(unsigned long unused)
551{
552 struct super_block *sb;
553 char b[BDEVNAME_SIZE];
554
555 spin_lock(&sb_lock);
556restart:
557 list_for_each_entry(sb, &super_blocks, s_list) {
558 sb->s_count++;
559 spin_unlock(&sb_lock);
560 down_read(&sb->s_umount);
561 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
562 printk(KERN_WARNING "Emergency Thaw on %s\n",
563 bdevname(sb->s_bdev, b));
564 up_read(&sb->s_umount);
565 spin_lock(&sb_lock);
566 if (__put_super_and_need_restart(sb))
567 goto restart;
568 }
569 spin_unlock(&sb_lock);
570 printk(KERN_WARNING "Emergency Thaw complete\n");
571}
572
573/**
574 * emergency_thaw_all -- forcibly thaw every frozen filesystem
575 *
576 * Used for emergency unfreeze of all filesystems via SysRq
577 */
578void emergency_thaw_all(void)
579{
580 pdflush_operation(do_thaw_all, 0);
581}
582
550/** 583/**
551 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers 584 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
552 * @mapping: the mapping which wants those buffers written 585 * @mapping: the mapping which wants those buffers written
@@ -621,14 +654,7 @@ static void __set_page_dirty(struct page *page,
621 spin_lock_irq(&mapping->tree_lock); 654 spin_lock_irq(&mapping->tree_lock);
622 if (page->mapping) { /* Race with truncate? */ 655 if (page->mapping) { /* Race with truncate? */
623 WARN_ON_ONCE(warn && !PageUptodate(page)); 656 WARN_ON_ONCE(warn && !PageUptodate(page));
624 657 account_page_dirtied(page, mapping);
625 if (mapping_cap_account_dirty(mapping)) {
626 __inc_zone_page_state(page, NR_FILE_DIRTY);
627 __inc_bdi_stat(mapping->backing_dev_info,
628 BDI_RECLAIMABLE);
629 task_dirty_inc(current);
630 task_io_account_write(PAGE_CACHE_SIZE);
631 }
632 radix_tree_tag_set(&mapping->page_tree, 658 radix_tree_tag_set(&mapping->page_tree,
633 page_index(page), PAGECACHE_TAG_DIRTY); 659 page_index(page), PAGECACHE_TAG_DIRTY);
634 } 660 }
@@ -2320,13 +2346,14 @@ int block_commit_write(struct page *page, unsigned from, unsigned to)
2320 * unlock the page. 2346 * unlock the page.
2321 */ 2347 */
2322int 2348int
2323block_page_mkwrite(struct vm_area_struct *vma, struct page *page, 2349block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2324 get_block_t get_block) 2350 get_block_t get_block)
2325{ 2351{
2352 struct page *page = vmf->page;
2326 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 2353 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2327 unsigned long end; 2354 unsigned long end;
2328 loff_t size; 2355 loff_t size;
2329 int ret = -EINVAL; 2356 int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
2330 2357
2331 lock_page(page); 2358 lock_page(page);
2332 size = i_size_read(inode); 2359 size = i_size_read(inode);
@@ -2346,6 +2373,13 @@ block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2346 if (!ret) 2373 if (!ret)
2347 ret = block_commit_write(page, 0, end); 2374 ret = block_commit_write(page, 0, end);
2348 2375
2376 if (unlikely(ret)) {
2377 if (ret == -ENOMEM)
2378 ret = VM_FAULT_OOM;
2379 else /* -ENOSPC, -EIO, etc */
2380 ret = VM_FAULT_SIGBUS;
2381 }
2382
2349out_unlock: 2383out_unlock:
2350 unlock_page(page); 2384 unlock_page(page);
2351 return ret; 2385 return ret;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index e4a6223c314..af737bb56cb 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -740,8 +740,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
740out_release_free_unlock: 740out_release_free_unlock:
741 crypto_free_hash(s->hash_desc.tfm); 741 crypto_free_hash(s->hash_desc.tfm);
742out_free_unlock: 742out_free_unlock:
743 memset(s->block_aligned_filename, 0, s->block_aligned_filename_size); 743 kzfree(s->block_aligned_filename);
744 kfree(s->block_aligned_filename);
745out_unlock: 744out_unlock:
746 mutex_unlock(s->tfm_mutex); 745 mutex_unlock(s->tfm_mutex);
747out: 746out:
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 96ef51489e0..295e7fa5675 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -291,8 +291,7 @@ int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon)
291 if (daemon->user_ns) 291 if (daemon->user_ns)
292 put_user_ns(daemon->user_ns); 292 put_user_ns(daemon->user_ns);
293 mutex_unlock(&daemon->mux); 293 mutex_unlock(&daemon->mux);
294 memset(daemon, 0, sizeof(*daemon)); 294 kzfree(daemon);
295 kfree(daemon);
296out: 295out:
297 return rc; 296 return rc;
298} 297}
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 5de2c2db3aa..2a701d593d3 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -28,6 +28,7 @@ struct eventfd_ctx {
28 * issue a wakeup. 28 * issue a wakeup.
29 */ 29 */
30 __u64 count; 30 __u64 count;
31 unsigned int flags;
31}; 32};
32 33
33/* 34/*
@@ -50,7 +51,7 @@ int eventfd_signal(struct file *file, int n)
50 n = (int) (ULLONG_MAX - ctx->count); 51 n = (int) (ULLONG_MAX - ctx->count);
51 ctx->count += n; 52 ctx->count += n;
52 if (waitqueue_active(&ctx->wqh)) 53 if (waitqueue_active(&ctx->wqh))
53 wake_up_locked(&ctx->wqh); 54 wake_up_locked_poll(&ctx->wqh, POLLIN);
54 spin_unlock_irqrestore(&ctx->wqh.lock, flags); 55 spin_unlock_irqrestore(&ctx->wqh.lock, flags);
55 56
56 return n; 57 return n;
@@ -87,22 +88,20 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
87{ 88{
88 struct eventfd_ctx *ctx = file->private_data; 89 struct eventfd_ctx *ctx = file->private_data;
89 ssize_t res; 90 ssize_t res;
90 __u64 ucnt; 91 __u64 ucnt = 0;
91 DECLARE_WAITQUEUE(wait, current); 92 DECLARE_WAITQUEUE(wait, current);
92 93
93 if (count < sizeof(ucnt)) 94 if (count < sizeof(ucnt))
94 return -EINVAL; 95 return -EINVAL;
95 spin_lock_irq(&ctx->wqh.lock); 96 spin_lock_irq(&ctx->wqh.lock);
96 res = -EAGAIN; 97 res = -EAGAIN;
97 ucnt = ctx->count; 98 if (ctx->count > 0)
98 if (ucnt > 0)
99 res = sizeof(ucnt); 99 res = sizeof(ucnt);
100 else if (!(file->f_flags & O_NONBLOCK)) { 100 else if (!(file->f_flags & O_NONBLOCK)) {
101 __add_wait_queue(&ctx->wqh, &wait); 101 __add_wait_queue(&ctx->wqh, &wait);
102 for (res = 0;;) { 102 for (res = 0;;) {
103 set_current_state(TASK_INTERRUPTIBLE); 103 set_current_state(TASK_INTERRUPTIBLE);
104 if (ctx->count > 0) { 104 if (ctx->count > 0) {
105 ucnt = ctx->count;
106 res = sizeof(ucnt); 105 res = sizeof(ucnt);
107 break; 106 break;
108 } 107 }
@@ -117,10 +116,11 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
117 __remove_wait_queue(&ctx->wqh, &wait); 116 __remove_wait_queue(&ctx->wqh, &wait);
118 __set_current_state(TASK_RUNNING); 117 __set_current_state(TASK_RUNNING);
119 } 118 }
120 if (res > 0) { 119 if (likely(res > 0)) {
121 ctx->count = 0; 120 ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
121 ctx->count -= ucnt;
122 if (waitqueue_active(&ctx->wqh)) 122 if (waitqueue_active(&ctx->wqh))
123 wake_up_locked(&ctx->wqh); 123 wake_up_locked_poll(&ctx->wqh, POLLOUT);
124 } 124 }
125 spin_unlock_irq(&ctx->wqh.lock); 125 spin_unlock_irq(&ctx->wqh.lock);
126 if (res > 0 && put_user(ucnt, (__u64 __user *) buf)) 126 if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
@@ -166,10 +166,10 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
166 __remove_wait_queue(&ctx->wqh, &wait); 166 __remove_wait_queue(&ctx->wqh, &wait);
167 __set_current_state(TASK_RUNNING); 167 __set_current_state(TASK_RUNNING);
168 } 168 }
169 if (res > 0) { 169 if (likely(res > 0)) {
170 ctx->count += ucnt; 170 ctx->count += ucnt;
171 if (waitqueue_active(&ctx->wqh)) 171 if (waitqueue_active(&ctx->wqh))
172 wake_up_locked(&ctx->wqh); 172 wake_up_locked_poll(&ctx->wqh, POLLIN);
173 } 173 }
174 spin_unlock_irq(&ctx->wqh.lock); 174 spin_unlock_irq(&ctx->wqh.lock);
175 175
@@ -207,7 +207,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
207 BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); 207 BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
208 BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); 208 BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
209 209
210 if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK)) 210 if (flags & ~EFD_FLAGS_SET)
211 return -EINVAL; 211 return -EINVAL;
212 212
213 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 213 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
@@ -216,13 +216,14 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
216 216
217 init_waitqueue_head(&ctx->wqh); 217 init_waitqueue_head(&ctx->wqh);
218 ctx->count = count; 218 ctx->count = count;
219 ctx->flags = flags;
219 220
220 /* 221 /*
221 * When we call this, the initialization must be complete, since 222 * When we call this, the initialization must be complete, since
222 * anon_inode_getfd() will install the fd. 223 * anon_inode_getfd() will install the fd.
223 */ 224 */
224 fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, 225 fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
225 flags & (O_CLOEXEC | O_NONBLOCK)); 226 flags & EFD_SHARED_FCNTL_FLAGS);
226 if (fd < 0) 227 if (fd < 0)
227 kfree(ctx); 228 kfree(ctx);
228 return fd; 229 return fd;
@@ -232,3 +233,4 @@ SYSCALL_DEFINE1(eventfd, unsigned int, count)
232{ 233{
233 return sys_eventfd2(count, 0); 234 return sys_eventfd2(count, 0);
234} 235}
236
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index c5c424f23fd..a89f370fadb 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * fs/eventpoll.c (Efficent event polling implementation) 2 * fs/eventpoll.c (Efficient event retrieval implementation)
3 * Copyright (C) 2001,...,2007 Davide Libenzi 3 * Copyright (C) 2001,...,2009 Davide Libenzi
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -71,29 +71,11 @@
71 * a better scalability. 71 * a better scalability.
72 */ 72 */
73 73
74#define DEBUG_EPOLL 0
75
76#if DEBUG_EPOLL > 0
77#define DPRINTK(x) printk x
78#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0)
79#else /* #if DEBUG_EPOLL > 0 */
80#define DPRINTK(x) (void) 0
81#define DNPRINTK(n, x) (void) 0
82#endif /* #if DEBUG_EPOLL > 0 */
83
84#define DEBUG_EPI 0
85
86#if DEBUG_EPI != 0
87#define EPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */)
88#else /* #if DEBUG_EPI != 0 */
89#define EPI_SLAB_DEBUG 0
90#endif /* #if DEBUG_EPI != 0 */
91
92/* Epoll private bits inside the event mask */ 74/* Epoll private bits inside the event mask */
93#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET) 75#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)
94 76
95/* Maximum number of poll wake up nests we are allowing */ 77/* Maximum number of nesting allowed inside epoll sets */
96#define EP_MAX_POLLWAKE_NESTS 4 78#define EP_MAX_NESTS 4
97 79
98/* Maximum msec timeout value storeable in a long int */ 80/* Maximum msec timeout value storeable in a long int */
99#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ) 81#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
@@ -110,24 +92,21 @@ struct epoll_filefd {
110}; 92};
111 93
112/* 94/*
113 * Node that is linked into the "wake_task_list" member of the "struct poll_safewake". 95 * Structure used to track possible nested calls, for too deep recursions
114 * It is used to keep track on all tasks that are currently inside the wake_up() code 96 * and loop cycles.
115 * to 1) short-circuit the one coming from the same task and same wait queue head
116 * (loop) 2) allow a maximum number of epoll descriptors inclusion nesting
117 * 3) let go the ones coming from other tasks.
118 */ 97 */
119struct wake_task_node { 98struct nested_call_node {
120 struct list_head llink; 99 struct list_head llink;
121 struct task_struct *task; 100 void *cookie;
122 wait_queue_head_t *wq; 101 int cpu;
123}; 102};
124 103
125/* 104/*
126 * This is used to implement the safe poll wake up avoiding to reenter 105 * This structure is used as collector for nested calls, to check for
127 * the poll callback from inside wake_up(). 106 * maximum recursion dept and loop cycles.
128 */ 107 */
129struct poll_safewake { 108struct nested_calls {
130 struct list_head wake_task_list; 109 struct list_head tasks_call_list;
131 spinlock_t lock; 110 spinlock_t lock;
132}; 111};
133 112
@@ -213,7 +192,7 @@ struct eppoll_entry {
213 struct list_head llink; 192 struct list_head llink;
214 193
215 /* The "base" pointer is set to the container "struct epitem" */ 194 /* The "base" pointer is set to the container "struct epitem" */
216 void *base; 195 struct epitem *base;
217 196
218 /* 197 /*
219 * Wait queue item that will be linked to the target file wait 198 * Wait queue item that will be linked to the target file wait
@@ -231,6 +210,12 @@ struct ep_pqueue {
231 struct epitem *epi; 210 struct epitem *epi;
232}; 211};
233 212
213/* Used by the ep_send_events() function as callback private data */
214struct ep_send_events_data {
215 int maxevents;
216 struct epoll_event __user *events;
217};
218
234/* 219/*
235 * Configuration options available inside /proc/sys/fs/epoll/ 220 * Configuration options available inside /proc/sys/fs/epoll/
236 */ 221 */
@@ -242,8 +227,11 @@ static int max_user_watches __read_mostly;
242 */ 227 */
243static DEFINE_MUTEX(epmutex); 228static DEFINE_MUTEX(epmutex);
244 229
245/* Safe wake up implementation */ 230/* Used for safe wake up implementation */
246static struct poll_safewake psw; 231static struct nested_calls poll_safewake_ncalls;
232
233/* Used to call file's f_op->poll() under the nested calls boundaries */
234static struct nested_calls poll_readywalk_ncalls;
247 235
248/* Slab cache used to allocate "struct epitem" */ 236/* Slab cache used to allocate "struct epitem" */
249static struct kmem_cache *epi_cache __read_mostly; 237static struct kmem_cache *epi_cache __read_mostly;
@@ -312,89 +300,230 @@ static inline int ep_op_has_event(int op)
312} 300}
313 301
314/* Initialize the poll safe wake up structure */ 302/* Initialize the poll safe wake up structure */
315static void ep_poll_safewake_init(struct poll_safewake *psw) 303static void ep_nested_calls_init(struct nested_calls *ncalls)
316{ 304{
317 305 INIT_LIST_HEAD(&ncalls->tasks_call_list);
318 INIT_LIST_HEAD(&psw->wake_task_list); 306 spin_lock_init(&ncalls->lock);
319 spin_lock_init(&psw->lock);
320} 307}
321 308
322/* 309/**
323 * Perform a safe wake up of the poll wait list. The problem is that 310 * ep_call_nested - Perform a bound (possibly) nested call, by checking
324 * with the new callback'd wake up system, it is possible that the 311 * that the recursion limit is not exceeded, and that
325 * poll callback is reentered from inside the call to wake_up() done 312 * the same nested call (by the meaning of same cookie) is
326 * on the poll wait queue head. The rule is that we cannot reenter the 313 * no re-entered.
327 * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times, 314 *
328 * and we cannot reenter the same wait queue head at all. This will 315 * @ncalls: Pointer to the nested_calls structure to be used for this call.
329 * enable to have a hierarchy of epoll file descriptor of no more than 316 * @max_nests: Maximum number of allowed nesting calls.
330 * EP_MAX_POLLWAKE_NESTS deep. We need the irq version of the spin lock 317 * @nproc: Nested call core function pointer.
331 * because this one gets called by the poll callback, that in turn is called 318 * @priv: Opaque data to be passed to the @nproc callback.
332 * from inside a wake_up(), that might be called from irq context. 319 * @cookie: Cookie to be used to identify this nested call.
320 *
321 * Returns: Returns the code returned by the @nproc callback, or -1 if
322 * the maximum recursion limit has been exceeded.
333 */ 323 */
334static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq) 324static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
325 int (*nproc)(void *, void *, int), void *priv,
326 void *cookie)
335{ 327{
336 int wake_nests = 0; 328 int error, call_nests = 0;
337 unsigned long flags; 329 unsigned long flags;
338 struct task_struct *this_task = current; 330 int this_cpu = get_cpu();
339 struct list_head *lsthead = &psw->wake_task_list; 331 struct list_head *lsthead = &ncalls->tasks_call_list;
340 struct wake_task_node *tncur; 332 struct nested_call_node *tncur;
341 struct wake_task_node tnode; 333 struct nested_call_node tnode;
342 334
343 spin_lock_irqsave(&psw->lock, flags); 335 spin_lock_irqsave(&ncalls->lock, flags);
344 336
345 /* Try to see if the current task is already inside this wakeup call */ 337 /*
338 * Try to see if the current task is already inside this wakeup call.
339 * We use a list here, since the population inside this set is always
340 * very much limited.
341 */
346 list_for_each_entry(tncur, lsthead, llink) { 342 list_for_each_entry(tncur, lsthead, llink) {
347 343 if (tncur->cpu == this_cpu &&
348 if (tncur->wq == wq || 344 (tncur->cookie == cookie || ++call_nests > max_nests)) {
349 (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) {
350 /* 345 /*
351 * Ops ... loop detected or maximum nest level reached. 346 * Ops ... loop detected or maximum nest level reached.
352 * We abort this wake by breaking the cycle itself. 347 * We abort this wake by breaking the cycle itself.
353 */ 348 */
354 spin_unlock_irqrestore(&psw->lock, flags); 349 error = -1;
355 return; 350 goto out_unlock;
356 } 351 }
357 } 352 }
358 353
359 /* Add the current task to the list */ 354 /* Add the current task and cookie to the list */
360 tnode.task = this_task; 355 tnode.cpu = this_cpu;
361 tnode.wq = wq; 356 tnode.cookie = cookie;
362 list_add(&tnode.llink, lsthead); 357 list_add(&tnode.llink, lsthead);
363 358
364 spin_unlock_irqrestore(&psw->lock, flags); 359 spin_unlock_irqrestore(&ncalls->lock, flags);
365 360
366 /* Do really wake up now */ 361 /* Call the nested function */
367 wake_up_nested(wq, 1 + wake_nests); 362 error = (*nproc)(priv, cookie, call_nests);
368 363
369 /* Remove the current task from the list */ 364 /* Remove the current task from the list */
370 spin_lock_irqsave(&psw->lock, flags); 365 spin_lock_irqsave(&ncalls->lock, flags);
371 list_del(&tnode.llink); 366 list_del(&tnode.llink);
372 spin_unlock_irqrestore(&psw->lock, flags); 367 out_unlock:
368 spin_unlock_irqrestore(&ncalls->lock, flags);
369
370 put_cpu();
371 return error;
372}
373
374#ifdef CONFIG_DEBUG_LOCK_ALLOC
375static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
376 unsigned long events, int subclass)
377{
378 unsigned long flags;
379
380 spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
381 wake_up_locked_poll(wqueue, events);
382 spin_unlock_irqrestore(&wqueue->lock, flags);
383}
384#else
385static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
386 unsigned long events, int subclass)
387{
388 wake_up_poll(wqueue, events);
389}
390#endif
391
392static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
393{
394 ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
395 1 + call_nests);
396 return 0;
397}
398
399/*
400 * Perform a safe wake up of the poll wait list. The problem is that
401 * with the new callback'd wake up system, it is possible that the
402 * poll callback is reentered from inside the call to wake_up() done
403 * on the poll wait queue head. The rule is that we cannot reenter the
404 * wake up code from the same task more than EP_MAX_NESTS times,
405 * and we cannot reenter the same wait queue head at all. This will
406 * enable to have a hierarchy of epoll file descriptor of no more than
407 * EP_MAX_NESTS deep.
408 */
409static void ep_poll_safewake(wait_queue_head_t *wq)
410{
411 ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
412 ep_poll_wakeup_proc, NULL, wq);
373} 413}
374 414
375/* 415/*
376 * This function unregister poll callbacks from the associated file descriptor. 416 * This function unregisters poll callbacks from the associated file
377 * Since this must be called without holding "ep->lock" the atomic exchange trick 417 * descriptor. Must be called with "mtx" held (or "epmutex" if called from
378 * will protect us from multiple unregister. 418 * ep_free).
379 */ 419 */
380static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) 420static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
381{ 421{
382 int nwait;
383 struct list_head *lsthead = &epi->pwqlist; 422 struct list_head *lsthead = &epi->pwqlist;
384 struct eppoll_entry *pwq; 423 struct eppoll_entry *pwq;
385 424
386 /* This is called without locks, so we need the atomic exchange */ 425 while (!list_empty(lsthead)) {
387 nwait = xchg(&epi->nwait, 0); 426 pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
388 427
389 if (nwait) { 428 list_del(&pwq->llink);
390 while (!list_empty(lsthead)) { 429 remove_wait_queue(pwq->whead, &pwq->wait);
391 pwq = list_first_entry(lsthead, struct eppoll_entry, llink); 430 kmem_cache_free(pwq_cache, pwq);
431 }
432}
392 433
393 list_del_init(&pwq->llink); 434/**
394 remove_wait_queue(pwq->whead, &pwq->wait); 435 * ep_scan_ready_list - Scans the ready list in a way that makes possible for
395 kmem_cache_free(pwq_cache, pwq); 436 * the scan code, to call f_op->poll(). Also allows for
396 } 437 * O(NumReady) performance.
438 *
439 * @ep: Pointer to the epoll private data structure.
440 * @sproc: Pointer to the scan callback.
441 * @priv: Private opaque data passed to the @sproc callback.
442 *
443 * Returns: The same integer error code returned by the @sproc callback.
444 */
445static int ep_scan_ready_list(struct eventpoll *ep,
446 int (*sproc)(struct eventpoll *,
447 struct list_head *, void *),
448 void *priv)
449{
450 int error, pwake = 0;
451 unsigned long flags;
452 struct epitem *epi, *nepi;
453 LIST_HEAD(txlist);
454
455 /*
456 * We need to lock this because we could be hit by
457 * eventpoll_release_file() and epoll_ctl().
458 */
459 mutex_lock(&ep->mtx);
460
461 /*
462 * Steal the ready list, and re-init the original one to the
463 * empty list. Also, set ep->ovflist to NULL so that events
464 * happening while looping w/out locks, are not lost. We cannot
465 * have the poll callback to queue directly on ep->rdllist,
466 * because we want the "sproc" callback to be able to do it
467 * in a lockless way.
468 */
469 spin_lock_irqsave(&ep->lock, flags);
470 list_splice_init(&ep->rdllist, &txlist);
471 ep->ovflist = NULL;
472 spin_unlock_irqrestore(&ep->lock, flags);
473
474 /*
475 * Now call the callback function.
476 */
477 error = (*sproc)(ep, &txlist, priv);
478
479 spin_lock_irqsave(&ep->lock, flags);
480 /*
481 * During the time we spent inside the "sproc" callback, some
482 * other events might have been queued by the poll callback.
483 * We re-insert them inside the main ready-list here.
484 */
485 for (nepi = ep->ovflist; (epi = nepi) != NULL;
486 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
487 /*
488 * We need to check if the item is already in the list.
489 * During the "sproc" callback execution time, items are
490 * queued into ->ovflist but the "txlist" might already
491 * contain them, and the list_splice() below takes care of them.
492 */
493 if (!ep_is_linked(&epi->rdllink))
494 list_add_tail(&epi->rdllink, &ep->rdllist);
495 }
496 /*
497 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
498 * releasing the lock, events will be queued in the normal way inside
499 * ep->rdllist.
500 */
501 ep->ovflist = EP_UNACTIVE_PTR;
502
503 /*
504 * Quickly re-inject items left on "txlist".
505 */
506 list_splice(&txlist, &ep->rdllist);
507
508 if (!list_empty(&ep->rdllist)) {
509 /*
510 * Wake up (if active) both the eventpoll wait list and
511 * the ->poll() wait list (delayed after we release the lock).
512 */
513 if (waitqueue_active(&ep->wq))
514 wake_up_locked(&ep->wq);
515 if (waitqueue_active(&ep->poll_wait))
516 pwake++;
397 } 517 }
518 spin_unlock_irqrestore(&ep->lock, flags);
519
520 mutex_unlock(&ep->mtx);
521
522 /* We have to call this outside the lock */
523 if (pwake)
524 ep_poll_safewake(&ep->poll_wait);
525
526 return error;
398} 527}
399 528
400/* 529/*
@@ -434,9 +563,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
434 563
435 atomic_dec(&ep->user->epoll_watches); 564 atomic_dec(&ep->user->epoll_watches);
436 565
437 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n",
438 current, ep, file));
439
440 return 0; 566 return 0;
441} 567}
442 568
@@ -447,7 +573,7 @@ static void ep_free(struct eventpoll *ep)
447 573
448 /* We need to release all tasks waiting for these file */ 574 /* We need to release all tasks waiting for these file */
449 if (waitqueue_active(&ep->poll_wait)) 575 if (waitqueue_active(&ep->poll_wait))
450 ep_poll_safewake(&psw, &ep->poll_wait); 576 ep_poll_safewake(&ep->poll_wait);
451 577
452 /* 578 /*
453 * We need to lock this because we could be hit by 579 * We need to lock this because we could be hit by
@@ -492,26 +618,54 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
492 if (ep) 618 if (ep)
493 ep_free(ep); 619 ep_free(ep);
494 620
495 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
496 return 0; 621 return 0;
497} 622}
498 623
624static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
625 void *priv)
626{
627 struct epitem *epi, *tmp;
628
629 list_for_each_entry_safe(epi, tmp, head, rdllink) {
630 if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
631 epi->event.events)
632 return POLLIN | POLLRDNORM;
633 else {
634 /*
635 * Item has been dropped into the ready list by the poll
636 * callback, but it's not actually ready, as far as
637 * caller requested events goes. We can remove it here.
638 */
639 list_del_init(&epi->rdllink);
640 }
641 }
642
643 return 0;
644}
645
646static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
647{
648 return ep_scan_ready_list(priv, ep_read_events_proc, NULL);
649}
650
499static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) 651static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
500{ 652{
501 unsigned int pollflags = 0; 653 int pollflags;
502 unsigned long flags;
503 struct eventpoll *ep = file->private_data; 654 struct eventpoll *ep = file->private_data;
504 655
505 /* Insert inside our poll wait queue */ 656 /* Insert inside our poll wait queue */
506 poll_wait(file, &ep->poll_wait, wait); 657 poll_wait(file, &ep->poll_wait, wait);
507 658
508 /* Check our condition */ 659 /*
509 spin_lock_irqsave(&ep->lock, flags); 660 * Proceed to find out if wanted events are really available inside
510 if (!list_empty(&ep->rdllist)) 661 * the ready list. This need to be done under ep_call_nested()
511 pollflags = POLLIN | POLLRDNORM; 662 * supervision, since the call to f_op->poll() done on listed files
512 spin_unlock_irqrestore(&ep->lock, flags); 663 * could re-enter here.
664 */
665 pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
666 ep_poll_readyevents_proc, ep, ep);
513 667
514 return pollflags; 668 return pollflags != -1 ? pollflags : 0;
515} 669}
516 670
517/* File callbacks that implement the eventpoll file behaviour */ 671/* File callbacks that implement the eventpoll file behaviour */
@@ -541,7 +695,7 @@ void eventpoll_release_file(struct file *file)
541 * We don't want to get "file->f_lock" because it is not 695 * We don't want to get "file->f_lock" because it is not
542 * necessary. It is not necessary because we're in the "struct file" 696 * necessary. It is not necessary because we're in the "struct file"
543 * cleanup path, and this means that noone is using this file anymore. 697 * cleanup path, and this means that noone is using this file anymore.
544 * So, for example, epoll_ctl() cannot hit here sicne if we reach this 698 * So, for example, epoll_ctl() cannot hit here since if we reach this
545 * point, the file counter already went to zero and fget() would fail. 699 * point, the file counter already went to zero and fget() would fail.
546 * The only hit might come from ep_free() but by holding the mutex 700 * The only hit might come from ep_free() but by holding the mutex
547 * will correctly serialize the operation. We do need to acquire 701 * will correctly serialize the operation. We do need to acquire
@@ -588,8 +742,6 @@ static int ep_alloc(struct eventpoll **pep)
588 742
589 *pep = ep; 743 *pep = ep;
590 744
591 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
592 current, ep));
593 return 0; 745 return 0;
594 746
595free_uid: 747free_uid:
@@ -623,9 +775,6 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
623 } 775 }
624 } 776 }
625 777
626 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
627 current, file, epir));
628
629 return epir; 778 return epir;
630} 779}
631 780
@@ -641,9 +790,6 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
641 struct epitem *epi = ep_item_from_wait(wait); 790 struct epitem *epi = ep_item_from_wait(wait);
642 struct eventpoll *ep = epi->ep; 791 struct eventpoll *ep = epi->ep;
643 792
644 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
645 current, epi->ffd.file, epi, ep));
646
647 spin_lock_irqsave(&ep->lock, flags); 793 spin_lock_irqsave(&ep->lock, flags);
648 794
649 /* 795 /*
@@ -656,6 +802,15 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
656 goto out_unlock; 802 goto out_unlock;
657 803
658 /* 804 /*
805 * Check the events coming with the callback. At this stage, not
806 * every device reports the events in the "key" parameter of the
807 * callback. We need to be able to handle both cases here, hence the
808 * test for "key" != NULL before the event match test.
809 */
810 if (key && !((unsigned long) key & epi->event.events))
811 goto out_unlock;
812
813 /*
659 * If we are trasfering events to userspace, we can hold no locks 814 * If we are trasfering events to userspace, we can hold no locks
660 * (because we're accessing user memory, and because of linux f_op->poll() 815 * (because we're accessing user memory, and because of linux f_op->poll()
661 * semantics). All the events that happens during that period of time are 816 * semantics). All the events that happens during that period of time are
@@ -670,12 +825,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
670 } 825 }
671 826
672 /* If this file is already in the ready list we exit soon */ 827 /* If this file is already in the ready list we exit soon */
673 if (ep_is_linked(&epi->rdllink)) 828 if (!ep_is_linked(&epi->rdllink))
674 goto is_linked; 829 list_add_tail(&epi->rdllink, &ep->rdllist);
675
676 list_add_tail(&epi->rdllink, &ep->rdllist);
677 830
678is_linked:
679 /* 831 /*
680 * Wake up ( if active ) both the eventpoll wait list and the ->poll() 832 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
681 * wait list. 833 * wait list.
@@ -690,7 +842,7 @@ out_unlock:
690 842
691 /* We have to call this outside the lock */ 843 /* We have to call this outside the lock */
692 if (pwake) 844 if (pwake)
693 ep_poll_safewake(&psw, &ep->poll_wait); 845 ep_poll_safewake(&ep->poll_wait);
694 846
695 return 1; 847 return 1;
696} 848}
@@ -817,10 +969,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
817 969
818 /* We have to call this outside the lock */ 970 /* We have to call this outside the lock */
819 if (pwake) 971 if (pwake)
820 ep_poll_safewake(&psw, &ep->poll_wait); 972 ep_poll_safewake(&ep->poll_wait);
821
822 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
823 current, ep, tfile, fd));
824 973
825 return 0; 974 return 0;
826 975
@@ -851,15 +1000,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
851{ 1000{
852 int pwake = 0; 1001 int pwake = 0;
853 unsigned int revents; 1002 unsigned int revents;
854 unsigned long flags;
855 1003
856 /* 1004 /*
857 * Set the new event interest mask before calling f_op->poll(), otherwise 1005 * Set the new event interest mask before calling f_op->poll();
858 * a potential race might occur. In fact if we do this operation inside 1006 * otherwise we might miss an event that happens between the
859 * the lock, an event might happen between the f_op->poll() call and the 1007 * f_op->poll() call and the new event set registering.
860 * new event set registering.
861 */ 1008 */
862 epi->event.events = event->events; 1009 epi->event.events = event->events;
1010 epi->event.data = event->data; /* protected by mtx */
863 1011
864 /* 1012 /*
865 * Get current event bits. We can safely use the file* here because 1013 * Get current event bits. We can safely use the file* here because
@@ -867,16 +1015,12 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
867 */ 1015 */
868 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); 1016 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
869 1017
870 spin_lock_irqsave(&ep->lock, flags);
871
872 /* Copy the data member from inside the lock */
873 epi->event.data = event->data;
874
875 /* 1018 /*
876 * If the item is "hot" and it is not registered inside the ready 1019 * If the item is "hot" and it is not registered inside the ready
877 * list, push it inside. 1020 * list, push it inside.
878 */ 1021 */
879 if (revents & event->events) { 1022 if (revents & event->events) {
1023 spin_lock_irq(&ep->lock);
880 if (!ep_is_linked(&epi->rdllink)) { 1024 if (!ep_is_linked(&epi->rdllink)) {
881 list_add_tail(&epi->rdllink, &ep->rdllist); 1025 list_add_tail(&epi->rdllink, &ep->rdllist);
882 1026
@@ -886,142 +1030,84 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
886 if (waitqueue_active(&ep->poll_wait)) 1030 if (waitqueue_active(&ep->poll_wait))
887 pwake++; 1031 pwake++;
888 } 1032 }
1033 spin_unlock_irq(&ep->lock);
889 } 1034 }
890 spin_unlock_irqrestore(&ep->lock, flags);
891 1035
892 /* We have to call this outside the lock */ 1036 /* We have to call this outside the lock */
893 if (pwake) 1037 if (pwake)
894 ep_poll_safewake(&psw, &ep->poll_wait); 1038 ep_poll_safewake(&ep->poll_wait);
895 1039
896 return 0; 1040 return 0;
897} 1041}
898 1042
899static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, 1043static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
900 int maxevents) 1044 void *priv)
901{ 1045{
902 int eventcnt, error = -EFAULT, pwake = 0; 1046 struct ep_send_events_data *esed = priv;
1047 int eventcnt;
903 unsigned int revents; 1048 unsigned int revents;
904 unsigned long flags; 1049 struct epitem *epi;
905 struct epitem *epi, *nepi; 1050 struct epoll_event __user *uevent;
906 struct list_head txlist;
907
908 INIT_LIST_HEAD(&txlist);
909
910 /*
911 * We need to lock this because we could be hit by
912 * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
913 */
914 mutex_lock(&ep->mtx);
915
916 /*
917 * Steal the ready list, and re-init the original one to the
918 * empty list. Also, set ep->ovflist to NULL so that events
919 * happening while looping w/out locks, are not lost. We cannot
920 * have the poll callback to queue directly on ep->rdllist,
921 * because we are doing it in the loop below, in a lockless way.
922 */
923 spin_lock_irqsave(&ep->lock, flags);
924 list_splice(&ep->rdllist, &txlist);
925 INIT_LIST_HEAD(&ep->rdllist);
926 ep->ovflist = NULL;
927 spin_unlock_irqrestore(&ep->lock, flags);
928 1051
929 /* 1052 /*
930 * We can loop without lock because this is a task private list. 1053 * We can loop without lock because we are passed a task private list.
931 * We just splice'd out the ep->rdllist in ep_collect_ready_items(). 1054 * Items cannot vanish during the loop because ep_scan_ready_list() is
932 * Items cannot vanish during the loop because we are holding "mtx". 1055 * holding "mtx" during this call.
933 */ 1056 */
934 for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) { 1057 for (eventcnt = 0, uevent = esed->events;
935 epi = list_first_entry(&txlist, struct epitem, rdllink); 1058 !list_empty(head) && eventcnt < esed->maxevents;) {
1059 epi = list_first_entry(head, struct epitem, rdllink);
936 1060
937 list_del_init(&epi->rdllink); 1061 list_del_init(&epi->rdllink);
938 1062
939 /* 1063 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
940 * Get the ready file event set. We can safely use the file 1064 epi->event.events;
941 * because we are holding the "mtx" and this will guarantee
942 * that both the file and the item will not vanish.
943 */
944 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
945 revents &= epi->event.events;
946 1065
947 /* 1066 /*
948 * Is the event mask intersect the caller-requested one, 1067 * If the event mask intersect the caller-requested one,
949 * deliver the event to userspace. Again, we are holding 1068 * deliver the event to userspace. Again, ep_scan_ready_list()
950 * "mtx", so no operations coming from userspace can change 1069 * is holding "mtx", so no operations coming from userspace
951 * the item. 1070 * can change the item.
952 */ 1071 */
953 if (revents) { 1072 if (revents) {
954 if (__put_user(revents, 1073 if (__put_user(revents, &uevent->events) ||
955 &events[eventcnt].events) || 1074 __put_user(epi->event.data, &uevent->data)) {
956 __put_user(epi->event.data, 1075 list_add(&epi->rdllink, head);
957 &events[eventcnt].data)) 1076 return eventcnt ? eventcnt : -EFAULT;
958 goto errxit; 1077 }
1078 eventcnt++;
1079 uevent++;
959 if (epi->event.events & EPOLLONESHOT) 1080 if (epi->event.events & EPOLLONESHOT)
960 epi->event.events &= EP_PRIVATE_BITS; 1081 epi->event.events &= EP_PRIVATE_BITS;
961 eventcnt++; 1082 else if (!(epi->event.events & EPOLLET)) {
1083 /*
1084 * If this file has been added with Level
1085 * Trigger mode, we need to insert back inside
1086 * the ready list, so that the next call to
1087 * epoll_wait() will check again the events
1088 * availability. At this point, noone can insert
1089 * into ep->rdllist besides us. The epoll_ctl()
1090 * callers are locked out by
1091 * ep_scan_ready_list() holding "mtx" and the
1092 * poll callback will queue them in ep->ovflist.
1093 */
1094 list_add_tail(&epi->rdllink, &ep->rdllist);
1095 }
962 } 1096 }
963 /*
964 * At this point, noone can insert into ep->rdllist besides
965 * us. The epoll_ctl() callers are locked out by us holding
966 * "mtx" and the poll callback will queue them in ep->ovflist.
967 */
968 if (!(epi->event.events & EPOLLET) &&
969 (revents & epi->event.events))
970 list_add_tail(&epi->rdllink, &ep->rdllist);
971 }
972 error = 0;
973
974errxit:
975
976 spin_lock_irqsave(&ep->lock, flags);
977 /*
978 * During the time we spent in the loop above, some other events
979 * might have been queued by the poll callback. We re-insert them
980 * inside the main ready-list here.
981 */
982 for (nepi = ep->ovflist; (epi = nepi) != NULL;
983 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
984 /*
985 * If the above loop quit with errors, the epoll item might still
986 * be linked to "txlist", and the list_splice() done below will
987 * take care of those cases.
988 */
989 if (!ep_is_linked(&epi->rdllink))
990 list_add_tail(&epi->rdllink, &ep->rdllist);
991 } 1097 }
992 /*
993 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
994 * releasing the lock, events will be queued in the normal way inside
995 * ep->rdllist.
996 */
997 ep->ovflist = EP_UNACTIVE_PTR;
998 1098
999 /* 1099 return eventcnt;
1000 * In case of error in the event-send loop, or in case the number of 1100}
1001 * ready events exceeds the userspace limit, we need to splice the
1002 * "txlist" back inside ep->rdllist.
1003 */
1004 list_splice(&txlist, &ep->rdllist);
1005
1006 if (!list_empty(&ep->rdllist)) {
1007 /*
1008 * Wake up (if active) both the eventpoll wait list and the ->poll()
1009 * wait list (delayed after we release the lock).
1010 */
1011 if (waitqueue_active(&ep->wq))
1012 wake_up_locked(&ep->wq);
1013 if (waitqueue_active(&ep->poll_wait))
1014 pwake++;
1015 }
1016 spin_unlock_irqrestore(&ep->lock, flags);
1017 1101
1018 mutex_unlock(&ep->mtx); 1102static int ep_send_events(struct eventpoll *ep,
1103 struct epoll_event __user *events, int maxevents)
1104{
1105 struct ep_send_events_data esed;
1019 1106
1020 /* We have to call this outside the lock */ 1107 esed.maxevents = maxevents;
1021 if (pwake) 1108 esed.events = events;
1022 ep_poll_safewake(&psw, &ep->poll_wait);
1023 1109
1024 return eventcnt == 0 ? error: eventcnt; 1110 return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
1025} 1111}
1026 1112
1027static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 1113static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
@@ -1033,7 +1119,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1033 wait_queue_t wait; 1119 wait_queue_t wait;
1034 1120
1035 /* 1121 /*
1036 * Calculate the timeout by checking for the "infinite" value ( -1 ) 1122 * Calculate the timeout by checking for the "infinite" value (-1)
1037 * and the overflow condition. The passed timeout is in milliseconds, 1123 * and the overflow condition. The passed timeout is in milliseconds,
1038 * that why (t * HZ) / 1000. 1124 * that why (t * HZ) / 1000.
1039 */ 1125 */
@@ -1076,9 +1162,8 @@ retry:
1076 1162
1077 set_current_state(TASK_RUNNING); 1163 set_current_state(TASK_RUNNING);
1078 } 1164 }
1079
1080 /* Is it worth to try to dig for events ? */ 1165 /* Is it worth to try to dig for events ? */
1081 eavail = !list_empty(&ep->rdllist); 1166 eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
1082 1167
1083 spin_unlock_irqrestore(&ep->lock, flags); 1168 spin_unlock_irqrestore(&ep->lock, flags);
1084 1169
@@ -1099,41 +1184,30 @@ retry:
1099 */ 1184 */
1100SYSCALL_DEFINE1(epoll_create1, int, flags) 1185SYSCALL_DEFINE1(epoll_create1, int, flags)
1101{ 1186{
1102 int error, fd = -1; 1187 int error;
1103 struct eventpoll *ep; 1188 struct eventpoll *ep = NULL;
1104 1189
1105 /* Check the EPOLL_* constant for consistency. */ 1190 /* Check the EPOLL_* constant for consistency. */
1106 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); 1191 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
1107 1192
1108 if (flags & ~EPOLL_CLOEXEC) 1193 if (flags & ~EPOLL_CLOEXEC)
1109 return -EINVAL; 1194 return -EINVAL;
1110
1111 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
1112 current, flags));
1113
1114 /* 1195 /*
1115 * Create the internal data structure ( "struct eventpoll" ). 1196 * Create the internal data structure ("struct eventpoll").
1116 */ 1197 */
1117 error = ep_alloc(&ep); 1198 error = ep_alloc(&ep);
1118 if (error < 0) { 1199 if (error < 0)
1119 fd = error; 1200 return error;
1120 goto error_return;
1121 }
1122
1123 /* 1201 /*
1124 * Creates all the items needed to setup an eventpoll file. That is, 1202 * Creates all the items needed to setup an eventpoll file. That is,
1125 * a file structure and a free file descriptor. 1203 * a file structure and a free file descriptor.
1126 */ 1204 */
1127 fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, 1205 error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
1128 flags & O_CLOEXEC); 1206 flags & O_CLOEXEC);
1129 if (fd < 0) 1207 if (error < 0)
1130 ep_free(ep); 1208 ep_free(ep);
1131 1209
1132error_return: 1210 return error;
1133 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
1134 current, flags, fd));
1135
1136 return fd;
1137} 1211}
1138 1212
1139SYSCALL_DEFINE1(epoll_create, int, size) 1213SYSCALL_DEFINE1(epoll_create, int, size)
@@ -1158,9 +1232,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1158 struct epitem *epi; 1232 struct epitem *epi;
1159 struct epoll_event epds; 1233 struct epoll_event epds;
1160 1234
1161 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
1162 current, epfd, op, fd, event));
1163
1164 error = -EFAULT; 1235 error = -EFAULT;
1165 if (ep_op_has_event(op) && 1236 if (ep_op_has_event(op) &&
1166 copy_from_user(&epds, event, sizeof(struct epoll_event))) 1237 copy_from_user(&epds, event, sizeof(struct epoll_event)))
@@ -1211,7 +1282,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1211 case EPOLL_CTL_ADD: 1282 case EPOLL_CTL_ADD:
1212 if (!epi) { 1283 if (!epi) {
1213 epds.events |= POLLERR | POLLHUP; 1284 epds.events |= POLLERR | POLLHUP;
1214
1215 error = ep_insert(ep, &epds, tfile, fd); 1285 error = ep_insert(ep, &epds, tfile, fd);
1216 } else 1286 } else
1217 error = -EEXIST; 1287 error = -EEXIST;
@@ -1237,8 +1307,6 @@ error_tgt_fput:
1237error_fput: 1307error_fput:
1238 fput(file); 1308 fput(file);
1239error_return: 1309error_return:
1240 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
1241 current, epfd, op, fd, event, error));
1242 1310
1243 return error; 1311 return error;
1244} 1312}
@@ -1254,9 +1322,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1254 struct file *file; 1322 struct file *file;
1255 struct eventpoll *ep; 1323 struct eventpoll *ep;
1256 1324
1257 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
1258 current, epfd, events, maxevents, timeout));
1259
1260 /* The maximum number of event must be greater than zero */ 1325 /* The maximum number of event must be greater than zero */
1261 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) 1326 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
1262 return -EINVAL; 1327 return -EINVAL;
@@ -1293,8 +1358,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1293error_fput: 1358error_fput:
1294 fput(file); 1359 fput(file);
1295error_return: 1360error_return:
1296 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
1297 current, epfd, events, maxevents, timeout, error));
1298 1361
1299 return error; 1362 return error;
1300} 1363}
@@ -1359,17 +1422,18 @@ static int __init eventpoll_init(void)
1359 EP_ITEM_COST; 1422 EP_ITEM_COST;
1360 1423
1361 /* Initialize the structure used to perform safe poll wait head wake ups */ 1424 /* Initialize the structure used to perform safe poll wait head wake ups */
1362 ep_poll_safewake_init(&psw); 1425 ep_nested_calls_init(&poll_safewake_ncalls);
1426
1427 /* Initialize the structure used to perform file's f_op->poll() calls */
1428 ep_nested_calls_init(&poll_readywalk_ncalls);
1363 1429
1364 /* Allocates slab cache used to allocate "struct epitem" items */ 1430 /* Allocates slab cache used to allocate "struct epitem" items */
1365 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 1431 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
1366 0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC, 1432 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
1367 NULL);
1368 1433
1369 /* Allocates slab cache used to allocate "struct eppoll_entry" */ 1434 /* Allocates slab cache used to allocate "struct eppoll_entry" */
1370 pwq_cache = kmem_cache_create("eventpoll_pwq", 1435 pwq_cache = kmem_cache_create("eventpoll_pwq",
1371 sizeof(struct eppoll_entry), 0, 1436 sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
1372 EPI_SLAB_DEBUG|SLAB_PANIC, NULL);
1373 1437
1374 return 0; 1438 return 0;
1375} 1439}
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 38f40d55899..53c72ad8587 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -55,7 +55,8 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
55} 55}
56 56
57static int ext4_group_used_meta_blocks(struct super_block *sb, 57static int ext4_group_used_meta_blocks(struct super_block *sb,
58 ext4_group_t block_group) 58 ext4_group_t block_group,
59 struct ext4_group_desc *gdp)
59{ 60{
60 ext4_fsblk_t tmp; 61 ext4_fsblk_t tmp;
61 struct ext4_sb_info *sbi = EXT4_SB(sb); 62 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -63,10 +64,6 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
63 int used_blocks = sbi->s_itb_per_group + 2; 64 int used_blocks = sbi->s_itb_per_group + 2;
64 65
65 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { 66 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
66 struct ext4_group_desc *gdp;
67 struct buffer_head *bh;
68
69 gdp = ext4_get_group_desc(sb, block_group, &bh);
70 if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), 67 if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
71 block_group)) 68 block_group))
72 used_blocks--; 69 used_blocks--;
@@ -177,7 +174,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
177 */ 174 */
178 mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data); 175 mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
179 } 176 }
180 return free_blocks - ext4_group_used_meta_blocks(sb, block_group); 177 return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
181} 178}
182 179
183 180
@@ -473,9 +470,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
473 470
474 if (sbi->s_log_groups_per_flex) { 471 if (sbi->s_log_groups_per_flex) {
475 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 472 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
476 spin_lock(sb_bgl_lock(sbi, flex_group)); 473 atomic_add(blocks_freed,
477 sbi->s_flex_groups[flex_group].free_blocks += blocks_freed; 474 &sbi->s_flex_groups[flex_group].free_blocks);
478 spin_unlock(sb_bgl_lock(sbi, flex_group));
479 } 475 }
480 /* 476 /*
481 * request to reload the buddy with the 477 * request to reload the buddy with the
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2df2e40b01a..b64789929a6 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -67,7 +67,8 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
67 unsigned int offset) 67 unsigned int offset)
68{ 68{
69 const char *error_msg = NULL; 69 const char *error_msg = NULL;
70 const int rlen = ext4_rec_len_from_disk(de->rec_len); 70 const int rlen = ext4_rec_len_from_disk(de->rec_len,
71 dir->i_sb->s_blocksize);
71 72
72 if (rlen < EXT4_DIR_REC_LEN(1)) 73 if (rlen < EXT4_DIR_REC_LEN(1))
73 error_msg = "rec_len is smaller than minimal"; 74 error_msg = "rec_len is smaller than minimal";
@@ -178,10 +179,11 @@ revalidate:
178 * least that it is non-zero. A 179 * least that it is non-zero. A
179 * failure will be detected in the 180 * failure will be detected in the
180 * dirent test below. */ 181 * dirent test below. */
181 if (ext4_rec_len_from_disk(de->rec_len) 182 if (ext4_rec_len_from_disk(de->rec_len,
182 < EXT4_DIR_REC_LEN(1)) 183 sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
183 break; 184 break;
184 i += ext4_rec_len_from_disk(de->rec_len); 185 i += ext4_rec_len_from_disk(de->rec_len,
186 sb->s_blocksize);
185 } 187 }
186 offset = i; 188 offset = i;
187 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) 189 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -203,7 +205,8 @@ revalidate:
203 ret = stored; 205 ret = stored;
204 goto out; 206 goto out;
205 } 207 }
206 offset += ext4_rec_len_from_disk(de->rec_len); 208 offset += ext4_rec_len_from_disk(de->rec_len,
209 sb->s_blocksize);
207 if (le32_to_cpu(de->inode)) { 210 if (le32_to_cpu(de->inode)) {
208 /* We might block in the next section 211 /* We might block in the next section
209 * if the data destination is 212 * if the data destination is
@@ -225,7 +228,8 @@ revalidate:
225 goto revalidate; 228 goto revalidate;
226 stored++; 229 stored++;
227 } 230 }
228 filp->f_pos += ext4_rec_len_from_disk(de->rec_len); 231 filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
232 sb->s_blocksize);
229 } 233 }
230 offset = 0; 234 offset = 0;
231 brelse(bh); 235 brelse(bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6083bb38057..d0f15ef56de 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -33,14 +33,6 @@
33#undef EXT4FS_DEBUG 33#undef EXT4FS_DEBUG
34 34
35/* 35/*
36 * Define EXT4_RESERVATION to reserve data blocks for expanding files
37 */
38#define EXT4_DEFAULT_RESERVE_BLOCKS 8
39/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
40#define EXT4_MAX_RESERVE_BLOCKS 1027
41#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0
42
43/*
44 * Debug code 36 * Debug code
45 */ 37 */
46#ifdef EXT4FS_DEBUG 38#ifdef EXT4FS_DEBUG
@@ -54,8 +46,6 @@
54#define ext4_debug(f, a...) do {} while (0) 46#define ext4_debug(f, a...) do {} while (0)
55#endif 47#endif
56 48
57#define EXT4_MULTIBLOCK_ALLOCATOR 1
58
59/* prefer goal again. length */ 49/* prefer goal again. length */
60#define EXT4_MB_HINT_MERGE 1 50#define EXT4_MB_HINT_MERGE 1
61/* blocks already reserved */ 51/* blocks already reserved */
@@ -180,8 +170,9 @@ struct ext4_group_desc
180 */ 170 */
181 171
182struct flex_groups { 172struct flex_groups {
183 __u32 free_inodes; 173 atomic_t free_inodes;
184 __u32 free_blocks; 174 atomic_t free_blocks;
175 atomic_t used_dirs;
185}; 176};
186 177
187#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ 178#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
@@ -249,6 +240,30 @@ struct flex_groups {
249#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ 240#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
250#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */ 241#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */
251 242
243/* Flags that should be inherited by new inodes from their parent. */
244#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
245 EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
246 EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
247 EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
248 EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
249
250/* Flags that are appropriate for regular files (all but dir-specific ones). */
251#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
252
253/* Flags that are appropriate for non-directories/regular files. */
254#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
255
256/* Mask out flags that are inappropriate for the given type of inode. */
257static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
258{
259 if (S_ISDIR(mode))
260 return flags;
261 else if (S_ISREG(mode))
262 return flags & EXT4_REG_FLMASK;
263 else
264 return flags & EXT4_OTHER_FLMASK;
265}
266
252/* 267/*
253 * Inode dynamic state flags 268 * Inode dynamic state flags
254 */ 269 */
@@ -256,6 +271,7 @@ struct flex_groups {
256#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */ 271#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */
257#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ 272#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
258#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ 273#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
274#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
259 275
260/* Used to pass group descriptor data when online resize is done */ 276/* Used to pass group descriptor data when online resize is done */
261struct ext4_new_group_input { 277struct ext4_new_group_input {
@@ -303,7 +319,9 @@ struct ext4_new_group_data {
303#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) 319#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
304#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) 320#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
305#define EXT4_IOC_MIGRATE _IO('f', 9) 321#define EXT4_IOC_MIGRATE _IO('f', 9)
322 /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
306 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ 323 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
324#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
307 325
308/* 326/*
309 * ioctl commands in 32 bit emulation 327 * ioctl commands in 32 bit emulation
@@ -531,7 +549,7 @@ do { \
531#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ 549#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */
532#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ 550#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */
533#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ 551#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
534#define EXT4_MOUNT_RESERVATION 0x10000 /* Preallocation */ 552#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */
535#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ 553#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */
536#define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */ 554#define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */
537#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 555#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
@@ -666,7 +684,8 @@ struct ext4_super_block {
666 __u8 s_log_groups_per_flex; /* FLEX_BG group size */ 684 __u8 s_log_groups_per_flex; /* FLEX_BG group size */
667 __u8 s_reserved_char_pad2; 685 __u8 s_reserved_char_pad2;
668 __le16 s_reserved_pad; 686 __le16 s_reserved_pad;
669 __u32 s_reserved[162]; /* Padding to the end of the block */ 687 __le64 s_kbytes_written; /* nr of lifetime kilobytes written */
688 __u32 s_reserved[160]; /* Padding to the end of the block */
670}; 689};
671 690
672#ifdef __KERNEL__ 691#ifdef __KERNEL__
@@ -814,6 +833,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
814#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ 833#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
815 834
816/* 835/*
836 * Minimum number of groups in a flexgroup before we separate out
837 * directories into the first block group of a flexgroup
838 */
839#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4
840
841/*
817 * Structure of a directory entry 842 * Structure of a directory entry
818 */ 843 */
819#define EXT4_NAME_LEN 255 844#define EXT4_NAME_LEN 255
@@ -865,24 +890,6 @@ struct ext4_dir_entry_2 {
865 ~EXT4_DIR_ROUND) 890 ~EXT4_DIR_ROUND)
866#define EXT4_MAX_REC_LEN ((1<<16)-1) 891#define EXT4_MAX_REC_LEN ((1<<16)-1)
867 892
868static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
869{
870 unsigned len = le16_to_cpu(dlen);
871
872 if (len == EXT4_MAX_REC_LEN || len == 0)
873 return 1 << 16;
874 return len;
875}
876
877static inline __le16 ext4_rec_len_to_disk(unsigned len)
878{
879 if (len == (1 << 16))
880 return cpu_to_le16(EXT4_MAX_REC_LEN);
881 else if (len > (1 << 16))
882 BUG();
883 return cpu_to_le16(len);
884}
885
886/* 893/*
887 * Hash Tree Directory indexing 894 * Hash Tree Directory indexing
888 * (c) Daniel Phillips, 2001 895 * (c) Daniel Phillips, 2001
@@ -970,22 +977,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
970 977
971extern struct proc_dir_entry *ext4_proc_root; 978extern struct proc_dir_entry *ext4_proc_root;
972 979
973#ifdef CONFIG_PROC_FS
974extern const struct file_operations ext4_ui_proc_fops;
975
976#define EXT4_PROC_HANDLER(name, var) \
977do { \
978 proc = proc_create_data(name, mode, sbi->s_proc, \
979 &ext4_ui_proc_fops, &sbi->s_##var); \
980 if (proc == NULL) { \
981 printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \
982 goto err_out; \
983 } \
984} while (0)
985#else
986#define EXT4_PROC_HANDLER(name, var)
987#endif
988
989/* 980/*
990 * Function prototypes 981 * Function prototypes
991 */ 982 */
@@ -1092,13 +1083,14 @@ extern int ext4_can_truncate(struct inode *inode);
1092extern void ext4_truncate(struct inode *); 1083extern void ext4_truncate(struct inode *);
1093extern void ext4_set_inode_flags(struct inode *); 1084extern void ext4_set_inode_flags(struct inode *);
1094extern void ext4_get_inode_flags(struct ext4_inode_info *); 1085extern void ext4_get_inode_flags(struct ext4_inode_info *);
1086extern int ext4_alloc_da_blocks(struct inode *inode);
1095extern void ext4_set_aops(struct inode *inode); 1087extern void ext4_set_aops(struct inode *inode);
1096extern int ext4_writepage_trans_blocks(struct inode *); 1088extern int ext4_writepage_trans_blocks(struct inode *);
1097extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); 1089extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
1098extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); 1090extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
1099extern int ext4_block_truncate_page(handle_t *handle, 1091extern int ext4_block_truncate_page(handle_t *handle,
1100 struct address_space *mapping, loff_t from); 1092 struct address_space *mapping, loff_t from);
1101extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); 1093extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1102extern qsize_t ext4_get_reserved_space(struct inode *inode); 1094extern qsize_t ext4_get_reserved_space(struct inode *inode);
1103 1095
1104/* ioctl.c */ 1096/* ioctl.c */
@@ -1107,7 +1099,10 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
1107 1099
1108/* migrate.c */ 1100/* migrate.c */
1109extern int ext4_ext_migrate(struct inode *); 1101extern int ext4_ext_migrate(struct inode *);
1102
1110/* namei.c */ 1103/* namei.c */
1104extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize);
1105extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize);
1111extern int ext4_orphan_add(handle_t *, struct inode *); 1106extern int ext4_orphan_add(handle_t *, struct inode *);
1112extern int ext4_orphan_del(handle_t *, struct inode *); 1107extern int ext4_orphan_del(handle_t *, struct inode *);
1113extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, 1108extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 18cb67b2cbb..f0c3ec85bd4 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -241,5 +241,6 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
241extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *, 241extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
242 ext4_lblk_t *, ext4_fsblk_t *); 242 ext4_lblk_t *, ext4_fsblk_t *);
243extern void ext4_ext_drop_refs(struct ext4_ext_path *); 243extern void ext4_ext_drop_refs(struct ext4_ext_path *);
244extern int ext4_ext_check_inode(struct inode *inode);
244#endif /* _EXT4_EXTENTS */ 245#endif /* _EXT4_EXTENTS */
245 246
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index e69acc16f5c..4ce2187123a 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,9 +33,6 @@ typedef __u32 ext4_lblk_t;
33/* data type for block group number */ 33/* data type for block group number */
34typedef unsigned int ext4_group_t; 34typedef unsigned int ext4_group_t;
35 35
36#define rsv_start rsv_window._rsv_start
37#define rsv_end rsv_window._rsv_end
38
39/* 36/*
40 * storage for cached extent 37 * storage for cached extent
41 */ 38 */
@@ -125,6 +122,9 @@ struct ext4_inode_info {
125 struct list_head i_prealloc_list; 122 struct list_head i_prealloc_list;
126 spinlock_t i_prealloc_lock; 123 spinlock_t i_prealloc_lock;
127 124
125 /* ialloc */
126 ext4_group_t i_last_alloc_group;
127
128 /* allocation reservation info for delalloc */ 128 /* allocation reservation info for delalloc */
129 unsigned int i_reserved_data_blocks; 129 unsigned int i_reserved_data_blocks;
130 unsigned int i_reserved_meta_blocks; 130 unsigned int i_reserved_meta_blocks;
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 039b6ea1a04..57b71fefbcc 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -62,12 +62,10 @@ struct ext4_sb_info {
62 struct percpu_counter s_freeinodes_counter; 62 struct percpu_counter s_freeinodes_counter;
63 struct percpu_counter s_dirs_counter; 63 struct percpu_counter s_dirs_counter;
64 struct percpu_counter s_dirtyblocks_counter; 64 struct percpu_counter s_dirtyblocks_counter;
65 struct blockgroup_lock s_blockgroup_lock; 65 struct blockgroup_lock *s_blockgroup_lock;
66 struct proc_dir_entry *s_proc; 66 struct proc_dir_entry *s_proc;
67 67 struct kobject s_kobj;
68 /* root of the per fs reservation window tree */ 68 struct completion s_kobj_unregister;
69 spinlock_t s_rsv_window_lock;
70 struct rb_root s_rsv_window_root;
71 69
72 /* Journaling */ 70 /* Journaling */
73 struct inode *s_journal_inode; 71 struct inode *s_journal_inode;
@@ -146,6 +144,10 @@ struct ext4_sb_info {
146 /* locality groups */ 144 /* locality groups */
147 struct ext4_locality_group *s_locality_groups; 145 struct ext4_locality_group *s_locality_groups;
148 146
147 /* for write statistics */
148 unsigned long s_sectors_written_start;
149 u64 s_kbytes_written;
150
149 unsigned int s_log_groups_per_flex; 151 unsigned int s_log_groups_per_flex;
150 struct flex_groups *s_flex_groups; 152 struct flex_groups *s_flex_groups;
151}; 153};
@@ -153,7 +155,7 @@ struct ext4_sb_info {
153static inline spinlock_t * 155static inline spinlock_t *
154sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) 156sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
155{ 157{
156 return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group); 158 return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
157} 159}
158 160
159#endif /* _EXT4_SB */ 161#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e0aa4fe4f59..ac77d8b8251 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
152 ext4_fsblk_t bg_start; 152 ext4_fsblk_t bg_start;
153 ext4_fsblk_t last_block; 153 ext4_fsblk_t last_block;
154 ext4_grpblk_t colour; 154 ext4_grpblk_t colour;
155 ext4_group_t block_group;
156 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
155 int depth; 157 int depth;
156 158
157 if (path) { 159 if (path) {
@@ -170,10 +172,31 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
170 } 172 }
171 173
172 /* OK. use inode's group */ 174 /* OK. use inode's group */
173 bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + 175 block_group = ei->i_block_group;
176 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
177 /*
178 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
179 * block groups per flexgroup, reserve the first block
180 * group for directories and special files. Regular
181 * files will start at the second block group. This
182 * tends to speed up directory access and improves
183 * fsck times.
184 */
185 block_group &= ~(flex_size-1);
186 if (S_ISREG(inode->i_mode))
187 block_group++;
188 }
189 bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
174 le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); 190 le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
175 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 191 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
176 192
193 /*
194 * If we are doing delayed allocation, we don't need take
195 * colour into account.
196 */
197 if (test_opt(inode->i_sb, DELALLOC))
198 return bg_start;
199
177 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) 200 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
178 colour = (current->pid % 16) * 201 colour = (current->pid % 16) *
179 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); 202 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -301,7 +324,64 @@ ext4_ext_max_entries(struct inode *inode, int depth)
301 return max; 324 return max;
302} 325}
303 326
304static int __ext4_ext_check_header(const char *function, struct inode *inode, 327static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
328{
329 ext4_fsblk_t block = ext_pblock(ext);
330 int len = ext4_ext_get_actual_len(ext);
331 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
332 if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
333 ((block + len) > ext4_blocks_count(es))))
334 return 0;
335 else
336 return 1;
337}
338
339static int ext4_valid_extent_idx(struct inode *inode,
340 struct ext4_extent_idx *ext_idx)
341{
342 ext4_fsblk_t block = idx_pblock(ext_idx);
343 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
344 if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
345 (block > ext4_blocks_count(es))))
346 return 0;
347 else
348 return 1;
349}
350
351static int ext4_valid_extent_entries(struct inode *inode,
352 struct ext4_extent_header *eh,
353 int depth)
354{
355 struct ext4_extent *ext;
356 struct ext4_extent_idx *ext_idx;
357 unsigned short entries;
358 if (eh->eh_entries == 0)
359 return 1;
360
361 entries = le16_to_cpu(eh->eh_entries);
362
363 if (depth == 0) {
364 /* leaf entries */
365 ext = EXT_FIRST_EXTENT(eh);
366 while (entries) {
367 if (!ext4_valid_extent(inode, ext))
368 return 0;
369 ext++;
370 entries--;
371 }
372 } else {
373 ext_idx = EXT_FIRST_INDEX(eh);
374 while (entries) {
375 if (!ext4_valid_extent_idx(inode, ext_idx))
376 return 0;
377 ext_idx++;
378 entries--;
379 }
380 }
381 return 1;
382}
383
384static int __ext4_ext_check(const char *function, struct inode *inode,
305 struct ext4_extent_header *eh, 385 struct ext4_extent_header *eh,
306 int depth) 386 int depth)
307{ 387{
@@ -329,11 +409,15 @@ static int __ext4_ext_check_header(const char *function, struct inode *inode,
329 error_msg = "invalid eh_entries"; 409 error_msg = "invalid eh_entries";
330 goto corrupted; 410 goto corrupted;
331 } 411 }
412 if (!ext4_valid_extent_entries(inode, eh, depth)) {
413 error_msg = "invalid extent entries";
414 goto corrupted;
415 }
332 return 0; 416 return 0;
333 417
334corrupted: 418corrupted:
335 ext4_error(inode->i_sb, function, 419 ext4_error(inode->i_sb, function,
336 "bad header in inode #%lu: %s - magic %x, " 420 "bad header/extent in inode #%lu: %s - magic %x, "
337 "entries %u, max %u(%u), depth %u(%u)", 421 "entries %u, max %u(%u), depth %u(%u)",
338 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), 422 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
339 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), 423 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
@@ -342,8 +426,13 @@ corrupted:
342 return -EIO; 426 return -EIO;
343} 427}
344 428
345#define ext4_ext_check_header(inode, eh, depth) \ 429#define ext4_ext_check(inode, eh, depth) \
346 __ext4_ext_check_header(__func__, inode, eh, depth) 430 __ext4_ext_check(__func__, inode, eh, depth)
431
432int ext4_ext_check_inode(struct inode *inode)
433{
434 return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
435}
347 436
348#ifdef EXT_DEBUG 437#ifdef EXT_DEBUG
349static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) 438static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
@@ -547,9 +636,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
547 636
548 eh = ext_inode_hdr(inode); 637 eh = ext_inode_hdr(inode);
549 depth = ext_depth(inode); 638 depth = ext_depth(inode);
550 if (ext4_ext_check_header(inode, eh, depth))
551 return ERR_PTR(-EIO);
552
553 639
554 /* account possible depth increase */ 640 /* account possible depth increase */
555 if (!path) { 641 if (!path) {
@@ -565,6 +651,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
565 i = depth; 651 i = depth;
566 /* walk through the tree */ 652 /* walk through the tree */
567 while (i) { 653 while (i) {
654 int need_to_validate = 0;
655
568 ext_debug("depth %d: num %d, max %d\n", 656 ext_debug("depth %d: num %d, max %d\n",
569 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 657 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
570 658
@@ -573,10 +661,17 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
573 path[ppos].p_depth = i; 661 path[ppos].p_depth = i;
574 path[ppos].p_ext = NULL; 662 path[ppos].p_ext = NULL;
575 663
576 bh = sb_bread(inode->i_sb, path[ppos].p_block); 664 bh = sb_getblk(inode->i_sb, path[ppos].p_block);
577 if (!bh) 665 if (unlikely(!bh))
578 goto err; 666 goto err;
579 667 if (!bh_uptodate_or_lock(bh)) {
668 if (bh_submit_read(bh) < 0) {
669 put_bh(bh);
670 goto err;
671 }
672 /* validate the extent entries */
673 need_to_validate = 1;
674 }
580 eh = ext_block_hdr(bh); 675 eh = ext_block_hdr(bh);
581 ppos++; 676 ppos++;
582 BUG_ON(ppos > depth); 677 BUG_ON(ppos > depth);
@@ -584,7 +679,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
584 path[ppos].p_hdr = eh; 679 path[ppos].p_hdr = eh;
585 i--; 680 i--;
586 681
587 if (ext4_ext_check_header(inode, eh, i)) 682 if (need_to_validate && ext4_ext_check(inode, eh, i))
588 goto err; 683 goto err;
589 } 684 }
590 685
@@ -1181,7 +1276,7 @@ got_index:
1181 return -EIO; 1276 return -EIO;
1182 eh = ext_block_hdr(bh); 1277 eh = ext_block_hdr(bh);
1183 /* subtract from p_depth to get proper eh_depth */ 1278 /* subtract from p_depth to get proper eh_depth */
1184 if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { 1279 if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1185 put_bh(bh); 1280 put_bh(bh);
1186 return -EIO; 1281 return -EIO;
1187 } 1282 }
@@ -1194,7 +1289,7 @@ got_index:
1194 if (bh == NULL) 1289 if (bh == NULL)
1195 return -EIO; 1290 return -EIO;
1196 eh = ext_block_hdr(bh); 1291 eh = ext_block_hdr(bh);
1197 if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { 1292 if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1198 put_bh(bh); 1293 put_bh(bh);
1199 return -EIO; 1294 return -EIO;
1200 } 1295 }
@@ -2137,7 +2232,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2137 return -ENOMEM; 2232 return -ENOMEM;
2138 } 2233 }
2139 path[0].p_hdr = ext_inode_hdr(inode); 2234 path[0].p_hdr = ext_inode_hdr(inode);
2140 if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) { 2235 if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2141 err = -EIO; 2236 err = -EIO;
2142 goto out; 2237 goto out;
2143 } 2238 }
@@ -2191,7 +2286,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2191 err = -EIO; 2286 err = -EIO;
2192 break; 2287 break;
2193 } 2288 }
2194 if (ext4_ext_check_header(inode, ext_block_hdr(bh), 2289 if (ext4_ext_check(inode, ext_block_hdr(bh),
2195 depth - i - 1)) { 2290 depth - i - 1)) {
2196 err = -EIO; 2291 err = -EIO;
2197 break; 2292 break;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f731cb545a0..588af8c7724 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -33,9 +33,14 @@
33 */ 33 */
34static int ext4_release_file(struct inode *inode, struct file *filp) 34static int ext4_release_file(struct inode *inode, struct file *filp)
35{ 35{
36 if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
37 ext4_alloc_da_blocks(inode);
38 EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
39 }
36 /* if we are the last writer on the inode, drop the block reservation */ 40 /* if we are the last writer on the inode, drop the block reservation */
37 if ((filp->f_mode & FMODE_WRITE) && 41 if ((filp->f_mode & FMODE_WRITE) &&
38 (atomic_read(&inode->i_writecount) == 1)) 42 (atomic_read(&inode->i_writecount) == 1) &&
43 !EXT4_I(inode)->i_reserved_data_blocks)
39 { 44 {
40 down_write(&EXT4_I(inode)->i_data_sem); 45 down_write(&EXT4_I(inode)->i_data_sem);
41 ext4_discard_preallocations(inode); 46 ext4_discard_preallocations(inode);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index fb51b40e3e8..47b84e8df56 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -189,7 +189,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
189 struct ext4_super_block *es; 189 struct ext4_super_block *es;
190 struct ext4_sb_info *sbi; 190 struct ext4_sb_info *sbi;
191 int fatal = 0, err, count, cleared; 191 int fatal = 0, err, count, cleared;
192 ext4_group_t flex_group;
193 192
194 if (atomic_read(&inode->i_count) > 1) { 193 if (atomic_read(&inode->i_count) > 1) {
195 printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", 194 printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
@@ -268,6 +267,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
268 if (is_directory) { 267 if (is_directory) {
269 count = ext4_used_dirs_count(sb, gdp) - 1; 268 count = ext4_used_dirs_count(sb, gdp) - 1;
270 ext4_used_dirs_set(sb, gdp, count); 269 ext4_used_dirs_set(sb, gdp, count);
270 if (sbi->s_log_groups_per_flex) {
271 ext4_group_t f;
272
273 f = ext4_flex_group(sbi, block_group);
274 atomic_dec(&sbi->s_flex_groups[f].free_inodes);
275 }
276
271 } 277 }
272 gdp->bg_checksum = ext4_group_desc_csum(sbi, 278 gdp->bg_checksum = ext4_group_desc_csum(sbi,
273 block_group, gdp); 279 block_group, gdp);
@@ -277,10 +283,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
277 percpu_counter_dec(&sbi->s_dirs_counter); 283 percpu_counter_dec(&sbi->s_dirs_counter);
278 284
279 if (sbi->s_log_groups_per_flex) { 285 if (sbi->s_log_groups_per_flex) {
280 flex_group = ext4_flex_group(sbi, block_group); 286 ext4_group_t f;
281 spin_lock(sb_bgl_lock(sbi, flex_group)); 287
282 sbi->s_flex_groups[flex_group].free_inodes++; 288 f = ext4_flex_group(sbi, block_group);
283 spin_unlock(sb_bgl_lock(sbi, flex_group)); 289 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
284 } 290 }
285 } 291 }
286 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); 292 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
@@ -360,9 +366,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
360 sbi->s_log_groups_per_flex; 366 sbi->s_log_groups_per_flex;
361 367
362find_close_to_parent: 368find_close_to_parent:
363 flexbg_free_blocks = flex_group[best_flex].free_blocks; 369 flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
364 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; 370 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
365 if (flex_group[best_flex].free_inodes && 371 if (atomic_read(&flex_group[best_flex].free_inodes) &&
366 flex_freeb_ratio > free_block_ratio) 372 flex_freeb_ratio > free_block_ratio)
367 goto found_flexbg; 373 goto found_flexbg;
368 374
@@ -375,24 +381,24 @@ find_close_to_parent:
375 if (i == parent_fbg_group || i == parent_fbg_group - 1) 381 if (i == parent_fbg_group || i == parent_fbg_group - 1)
376 continue; 382 continue;
377 383
378 flexbg_free_blocks = flex_group[i].free_blocks; 384 flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
379 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; 385 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
380 386
381 if (flex_freeb_ratio > free_block_ratio && 387 if (flex_freeb_ratio > free_block_ratio &&
382 flex_group[i].free_inodes) { 388 (atomic_read(&flex_group[i].free_inodes))) {
383 best_flex = i; 389 best_flex = i;
384 goto found_flexbg; 390 goto found_flexbg;
385 } 391 }
386 392
387 if (flex_group[best_flex].free_inodes == 0 || 393 if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) ||
388 (flex_group[i].free_blocks > 394 ((atomic_read(&flex_group[i].free_blocks) >
389 flex_group[best_flex].free_blocks && 395 atomic_read(&flex_group[best_flex].free_blocks)) &&
390 flex_group[i].free_inodes)) 396 atomic_read(&flex_group[i].free_inodes)))
391 best_flex = i; 397 best_flex = i;
392 } 398 }
393 399
394 if (!flex_group[best_flex].free_inodes || 400 if (!atomic_read(&flex_group[best_flex].free_inodes) ||
395 !flex_group[best_flex].free_blocks) 401 !atomic_read(&flex_group[best_flex].free_blocks))
396 return -1; 402 return -1;
397 403
398found_flexbg: 404found_flexbg:
@@ -410,6 +416,42 @@ out:
410 return 0; 416 return 0;
411} 417}
412 418
419struct orlov_stats {
420 __u32 free_inodes;
421 __u32 free_blocks;
422 __u32 used_dirs;
423};
424
425/*
426 * Helper function for Orlov's allocator; returns critical information
427 * for a particular block group or flex_bg. If flex_size is 1, then g
428 * is a block group number; otherwise it is flex_bg number.
429 */
430void get_orlov_stats(struct super_block *sb, ext4_group_t g,
431 int flex_size, struct orlov_stats *stats)
432{
433 struct ext4_group_desc *desc;
434 struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
435
436 if (flex_size > 1) {
437 stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
438 stats->free_blocks = atomic_read(&flex_group[g].free_blocks);
439 stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
440 return;
441 }
442
443 desc = ext4_get_group_desc(sb, g, NULL);
444 if (desc) {
445 stats->free_inodes = ext4_free_inodes_count(sb, desc);
446 stats->free_blocks = ext4_free_blks_count(sb, desc);
447 stats->used_dirs = ext4_used_dirs_count(sb, desc);
448 } else {
449 stats->free_inodes = 0;
450 stats->free_blocks = 0;
451 stats->used_dirs = 0;
452 }
453}
454
413/* 455/*
414 * Orlov's allocator for directories. 456 * Orlov's allocator for directories.
415 * 457 *
@@ -425,35 +467,34 @@ out:
425 * it has too many directories already (max_dirs) or 467 * it has too many directories already (max_dirs) or
426 * it has too few free inodes left (min_inodes) or 468 * it has too few free inodes left (min_inodes) or
427 * it has too few free blocks left (min_blocks) or 469 * it has too few free blocks left (min_blocks) or
428 * it's already running too large debt (max_debt).
429 * Parent's group is preferred, if it doesn't satisfy these 470 * Parent's group is preferred, if it doesn't satisfy these
430 * conditions we search cyclically through the rest. If none 471 * conditions we search cyclically through the rest. If none
431 * of the groups look good we just look for a group with more 472 * of the groups look good we just look for a group with more
432 * free inodes than average (starting at parent's group). 473 * free inodes than average (starting at parent's group).
433 *
434 * Debt is incremented each time we allocate a directory and decremented
435 * when we allocate an inode, within 0--255.
436 */ 474 */
437 475
438#define INODE_COST 64
439#define BLOCK_COST 256
440
441static int find_group_orlov(struct super_block *sb, struct inode *parent, 476static int find_group_orlov(struct super_block *sb, struct inode *parent,
442 ext4_group_t *group) 477 ext4_group_t *group, int mode)
443{ 478{
444 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 479 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
445 struct ext4_sb_info *sbi = EXT4_SB(sb); 480 struct ext4_sb_info *sbi = EXT4_SB(sb);
446 struct ext4_super_block *es = sbi->s_es;
447 ext4_group_t ngroups = sbi->s_groups_count; 481 ext4_group_t ngroups = sbi->s_groups_count;
448 int inodes_per_group = EXT4_INODES_PER_GROUP(sb); 482 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
449 unsigned int freei, avefreei; 483 unsigned int freei, avefreei;
450 ext4_fsblk_t freeb, avefreeb; 484 ext4_fsblk_t freeb, avefreeb;
451 ext4_fsblk_t blocks_per_dir;
452 unsigned int ndirs; 485 unsigned int ndirs;
453 int max_debt, max_dirs, min_inodes; 486 int max_dirs, min_inodes;
454 ext4_grpblk_t min_blocks; 487 ext4_grpblk_t min_blocks;
455 ext4_group_t i; 488 ext4_group_t i, grp, g;
456 struct ext4_group_desc *desc; 489 struct ext4_group_desc *desc;
490 struct orlov_stats stats;
491 int flex_size = ext4_flex_bg_size(sbi);
492
493 if (flex_size > 1) {
494 ngroups = (ngroups + flex_size - 1) >>
495 sbi->s_log_groups_per_flex;
496 parent_group >>= sbi->s_log_groups_per_flex;
497 }
457 498
458 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); 499 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
459 avefreei = freei / ngroups; 500 avefreei = freei / ngroups;
@@ -462,71 +503,97 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
462 do_div(avefreeb, ngroups); 503 do_div(avefreeb, ngroups);
463 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); 504 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
464 505
465 if ((parent == sb->s_root->d_inode) || 506 if (S_ISDIR(mode) &&
466 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) { 507 ((parent == sb->s_root->d_inode) ||
508 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
467 int best_ndir = inodes_per_group; 509 int best_ndir = inodes_per_group;
468 ext4_group_t grp;
469 int ret = -1; 510 int ret = -1;
470 511
471 get_random_bytes(&grp, sizeof(grp)); 512 get_random_bytes(&grp, sizeof(grp));
472 parent_group = (unsigned)grp % ngroups; 513 parent_group = (unsigned)grp % ngroups;
473 for (i = 0; i < ngroups; i++) { 514 for (i = 0; i < ngroups; i++) {
474 grp = (parent_group + i) % ngroups; 515 g = (parent_group + i) % ngroups;
475 desc = ext4_get_group_desc(sb, grp, NULL); 516 get_orlov_stats(sb, g, flex_size, &stats);
476 if (!desc || !ext4_free_inodes_count(sb, desc)) 517 if (!stats.free_inodes)
477 continue; 518 continue;
478 if (ext4_used_dirs_count(sb, desc) >= best_ndir) 519 if (stats.used_dirs >= best_ndir)
479 continue; 520 continue;
480 if (ext4_free_inodes_count(sb, desc) < avefreei) 521 if (stats.free_inodes < avefreei)
481 continue; 522 continue;
482 if (ext4_free_blks_count(sb, desc) < avefreeb) 523 if (stats.free_blocks < avefreeb)
483 continue; 524 continue;
484 *group = grp; 525 grp = g;
485 ret = 0; 526 ret = 0;
486 best_ndir = ext4_used_dirs_count(sb, desc); 527 best_ndir = stats.used_dirs;
528 }
529 if (ret)
530 goto fallback;
531 found_flex_bg:
532 if (flex_size == 1) {
533 *group = grp;
534 return 0;
535 }
536
537 /*
538 * We pack inodes at the beginning of the flexgroup's
539 * inode tables. Block allocation decisions will do
540 * something similar, although regular files will
541 * start at 2nd block group of the flexgroup. See
542 * ext4_ext_find_goal() and ext4_find_near().
543 */
544 grp *= flex_size;
545 for (i = 0; i < flex_size; i++) {
546 if (grp+i >= sbi->s_groups_count)
547 break;
548 desc = ext4_get_group_desc(sb, grp+i, NULL);
549 if (desc && ext4_free_inodes_count(sb, desc)) {
550 *group = grp+i;
551 return 0;
552 }
487 } 553 }
488 if (ret == 0)
489 return ret;
490 goto fallback; 554 goto fallback;
491 } 555 }
492 556
493 blocks_per_dir = ext4_blocks_count(es) - freeb;
494 do_div(blocks_per_dir, ndirs);
495
496 max_dirs = ndirs / ngroups + inodes_per_group / 16; 557 max_dirs = ndirs / ngroups + inodes_per_group / 16;
497 min_inodes = avefreei - inodes_per_group / 4; 558 min_inodes = avefreei - inodes_per_group*flex_size / 4;
498 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4; 559 if (min_inodes < 1)
499 560 min_inodes = 1;
500 max_debt = EXT4_BLOCKS_PER_GROUP(sb); 561 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
501 max_debt /= max_t(int, blocks_per_dir, BLOCK_COST); 562
502 if (max_debt * INODE_COST > inodes_per_group) 563 /*
503 max_debt = inodes_per_group / INODE_COST; 564 * Start looking in the flex group where we last allocated an
504 if (max_debt > 255) 565 * inode for this parent directory
505 max_debt = 255; 566 */
506 if (max_debt == 0) 567 if (EXT4_I(parent)->i_last_alloc_group != ~0) {
507 max_debt = 1; 568 parent_group = EXT4_I(parent)->i_last_alloc_group;
569 if (flex_size > 1)
570 parent_group >>= sbi->s_log_groups_per_flex;
571 }
508 572
509 for (i = 0; i < ngroups; i++) { 573 for (i = 0; i < ngroups; i++) {
510 *group = (parent_group + i) % ngroups; 574 grp = (parent_group + i) % ngroups;
511 desc = ext4_get_group_desc(sb, *group, NULL); 575 get_orlov_stats(sb, grp, flex_size, &stats);
512 if (!desc || !ext4_free_inodes_count(sb, desc)) 576 if (stats.used_dirs >= max_dirs)
513 continue;
514 if (ext4_used_dirs_count(sb, desc) >= max_dirs)
515 continue; 577 continue;
516 if (ext4_free_inodes_count(sb, desc) < min_inodes) 578 if (stats.free_inodes < min_inodes)
517 continue; 579 continue;
518 if (ext4_free_blks_count(sb, desc) < min_blocks) 580 if (stats.free_blocks < min_blocks)
519 continue; 581 continue;
520 return 0; 582 goto found_flex_bg;
521 } 583 }
522 584
523fallback: 585fallback:
586 ngroups = sbi->s_groups_count;
587 avefreei = freei / ngroups;
588 parent_group = EXT4_I(parent)->i_block_group;
524 for (i = 0; i < ngroups; i++) { 589 for (i = 0; i < ngroups; i++) {
525 *group = (parent_group + i) % ngroups; 590 grp = (parent_group + i) % ngroups;
526 desc = ext4_get_group_desc(sb, *group, NULL); 591 desc = ext4_get_group_desc(sb, grp, NULL);
527 if (desc && ext4_free_inodes_count(sb, desc) && 592 if (desc && ext4_free_inodes_count(sb, desc) &&
528 ext4_free_inodes_count(sb, desc) >= avefreei) 593 ext4_free_inodes_count(sb, desc) >= avefreei) {
594 *group = grp;
529 return 0; 595 return 0;
596 }
530 } 597 }
531 598
532 if (avefreei) { 599 if (avefreei) {
@@ -542,12 +609,51 @@ fallback:
542} 609}
543 610
544static int find_group_other(struct super_block *sb, struct inode *parent, 611static int find_group_other(struct super_block *sb, struct inode *parent,
545 ext4_group_t *group) 612 ext4_group_t *group, int mode)
546{ 613{
547 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 614 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
548 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 615 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
549 struct ext4_group_desc *desc; 616 struct ext4_group_desc *desc;
550 ext4_group_t i; 617 ext4_group_t i, last;
618 int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
619
620 /*
621 * Try to place the inode is the same flex group as its
622 * parent. If we can't find space, use the Orlov algorithm to
623 * find another flex group, and store that information in the
624 * parent directory's inode information so that use that flex
625 * group for future allocations.
626 */
627 if (flex_size > 1) {
628 int retry = 0;
629
630 try_again:
631 parent_group &= ~(flex_size-1);
632 last = parent_group + flex_size;
633 if (last > ngroups)
634 last = ngroups;
635 for (i = parent_group; i < last; i++) {
636 desc = ext4_get_group_desc(sb, i, NULL);
637 if (desc && ext4_free_inodes_count(sb, desc)) {
638 *group = i;
639 return 0;
640 }
641 }
642 if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
643 retry = 1;
644 parent_group = EXT4_I(parent)->i_last_alloc_group;
645 goto try_again;
646 }
647 /*
648 * If this didn't work, use the Orlov search algorithm
649 * to find a new flex group; we pass in the mode to
650 * avoid the topdir algorithms.
651 */
652 *group = parent_group + flex_size;
653 if (*group > ngroups)
654 *group = 0;
655 return find_group_orlov(sb, parent, group, mode);
656 }
551 657
552 /* 658 /*
553 * Try to place the inode in its parent directory 659 * Try to place the inode in its parent directory
@@ -665,6 +771,11 @@ static int ext4_claim_inode(struct super_block *sb,
665 if (S_ISDIR(mode)) { 771 if (S_ISDIR(mode)) {
666 count = ext4_used_dirs_count(sb, gdp) + 1; 772 count = ext4_used_dirs_count(sb, gdp) + 1;
667 ext4_used_dirs_set(sb, gdp, count); 773 ext4_used_dirs_set(sb, gdp, count);
774 if (sbi->s_log_groups_per_flex) {
775 ext4_group_t f = ext4_flex_group(sbi, group);
776
777 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
778 }
668 } 779 }
669 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 780 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
670err_ret: 781err_ret:
@@ -716,10 +827,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
716 sbi = EXT4_SB(sb); 827 sbi = EXT4_SB(sb);
717 es = sbi->s_es; 828 es = sbi->s_es;
718 829
719 if (sbi->s_log_groups_per_flex) { 830 if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
720 ret2 = find_group_flex(sb, dir, &group); 831 ret2 = find_group_flex(sb, dir, &group);
721 if (ret2 == -1) { 832 if (ret2 == -1) {
722 ret2 = find_group_other(sb, dir, &group); 833 ret2 = find_group_other(sb, dir, &group, mode);
723 if (ret2 == 0 && once) 834 if (ret2 == 0 && once)
724 once = 0; 835 once = 0;
725 printk(KERN_NOTICE "ext4: find_group_flex " 836 printk(KERN_NOTICE "ext4: find_group_flex "
@@ -733,11 +844,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
733 if (test_opt(sb, OLDALLOC)) 844 if (test_opt(sb, OLDALLOC))
734 ret2 = find_group_dir(sb, dir, &group); 845 ret2 = find_group_dir(sb, dir, &group);
735 else 846 else
736 ret2 = find_group_orlov(sb, dir, &group); 847 ret2 = find_group_orlov(sb, dir, &group, mode);
737 } else 848 } else
738 ret2 = find_group_other(sb, dir, &group); 849 ret2 = find_group_other(sb, dir, &group, mode);
739 850
740got_group: 851got_group:
852 EXT4_I(dir)->i_last_alloc_group = group;
741 err = -ENOSPC; 853 err = -ENOSPC;
742 if (ret2 == -1) 854 if (ret2 == -1)
743 goto out; 855 goto out;
@@ -858,9 +970,7 @@ got:
858 970
859 if (sbi->s_log_groups_per_flex) { 971 if (sbi->s_log_groups_per_flex) {
860 flex_group = ext4_flex_group(sbi, group); 972 flex_group = ext4_flex_group(sbi, group);
861 spin_lock(sb_bgl_lock(sbi, flex_group)); 973 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
862 sbi->s_flex_groups[flex_group].free_inodes--;
863 spin_unlock(sb_bgl_lock(sbi, flex_group));
864 } 974 }
865 975
866 inode->i_uid = current_fsuid(); 976 inode->i_uid = current_fsuid();
@@ -885,19 +995,16 @@ got:
885 ei->i_disksize = 0; 995 ei->i_disksize = 0;
886 996
887 /* 997 /*
888 * Don't inherit extent flag from directory. We set extent flag on 998 * Don't inherit extent flag from directory, amongst others. We set
889 * newly created directory and file only if -o extent mount option is 999 * extent flag on newly created directory and file only if -o extent
890 * specified 1000 * mount option is specified
891 */ 1001 */
892 ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL); 1002 ei->i_flags =
893 if (S_ISLNK(mode)) 1003 ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
894 ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
895 /* dirsync only applies to directories */
896 if (!S_ISDIR(mode))
897 ei->i_flags &= ~EXT4_DIRSYNC_FL;
898 ei->i_file_acl = 0; 1004 ei->i_file_acl = 0;
899 ei->i_dtime = 0; 1005 ei->i_dtime = 0;
900 ei->i_block_group = group; 1006 ei->i_block_group = group;
1007 ei->i_last_alloc_group = ~0;
901 1008
902 ext4_set_inode_flags(inode); 1009 ext4_set_inode_flags(inode);
903 if (IS_DIRSYNC(inode)) 1010 if (IS_DIRSYNC(inode))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 71d3ecd5db7..a2e7952bc5f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -371,6 +371,34 @@ static int ext4_block_to_path(struct inode *inode,
371 return n; 371 return n;
372} 372}
373 373
374static int __ext4_check_blockref(const char *function, struct inode *inode,
375 unsigned int *p, unsigned int max) {
376
377 unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
378 unsigned int *bref = p;
379 while (bref < p+max) {
380 if (unlikely(*bref >= maxblocks)) {
381 ext4_error(inode->i_sb, function,
382 "block reference %u >= max (%u) "
383 "in inode #%lu, offset=%d",
384 *bref, maxblocks,
385 inode->i_ino, (int)(bref-p));
386 return -EIO;
387 }
388 bref++;
389 }
390 return 0;
391}
392
393
394#define ext4_check_indirect_blockref(inode, bh) \
395 __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \
396 EXT4_ADDR_PER_BLOCK((inode)->i_sb))
397
398#define ext4_check_inode_blockref(inode) \
399 __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \
400 EXT4_NDIR_BLOCKS)
401
374/** 402/**
375 * ext4_get_branch - read the chain of indirect blocks leading to data 403 * ext4_get_branch - read the chain of indirect blocks leading to data
376 * @inode: inode in question 404 * @inode: inode in question
@@ -415,9 +443,22 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
415 if (!p->key) 443 if (!p->key)
416 goto no_block; 444 goto no_block;
417 while (--depth) { 445 while (--depth) {
418 bh = sb_bread(sb, le32_to_cpu(p->key)); 446 bh = sb_getblk(sb, le32_to_cpu(p->key));
419 if (!bh) 447 if (unlikely(!bh))
420 goto failure; 448 goto failure;
449
450 if (!bh_uptodate_or_lock(bh)) {
451 if (bh_submit_read(bh) < 0) {
452 put_bh(bh);
453 goto failure;
454 }
455 /* validate block references */
456 if (ext4_check_indirect_blockref(inode, bh)) {
457 put_bh(bh);
458 goto failure;
459 }
460 }
461
421 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); 462 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
422 /* Reader: end */ 463 /* Reader: end */
423 if (!p->key) 464 if (!p->key)
@@ -459,6 +500,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
459 ext4_fsblk_t bg_start; 500 ext4_fsblk_t bg_start;
460 ext4_fsblk_t last_block; 501 ext4_fsblk_t last_block;
461 ext4_grpblk_t colour; 502 ext4_grpblk_t colour;
503 ext4_group_t block_group;
504 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
462 505
463 /* Try to find previous block */ 506 /* Try to find previous block */
464 for (p = ind->p - 1; p >= start; p--) { 507 for (p = ind->p - 1; p >= start; p--) {
@@ -474,9 +517,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
474 * It is going to be referred to from the inode itself? OK, just put it 517 * It is going to be referred to from the inode itself? OK, just put it
475 * into the same cylinder group then. 518 * into the same cylinder group then.
476 */ 519 */
477 bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group); 520 block_group = ei->i_block_group;
521 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
522 block_group &= ~(flex_size-1);
523 if (S_ISREG(inode->i_mode))
524 block_group++;
525 }
526 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
478 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 527 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
479 528
529 /*
530 * If we are doing delayed allocation, we don't need take
531 * colour into account.
532 */
533 if (test_opt(inode->i_sb, DELALLOC))
534 return bg_start;
535
480 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) 536 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
481 colour = (current->pid % 16) * 537 colour = (current->pid % 16) *
482 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); 538 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -1052,9 +1108,16 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1052 /* 1108 /*
1053 * free those over-booking quota for metadata blocks 1109 * free those over-booking quota for metadata blocks
1054 */ 1110 */
1055
1056 if (mdb_free) 1111 if (mdb_free)
1057 vfs_dq_release_reservation_block(inode, mdb_free); 1112 vfs_dq_release_reservation_block(inode, mdb_free);
1113
1114 /*
1115 * If we have done all the pending block allocations and if
1116 * there aren't any writers on the inode, we can discard the
1117 * inode's preallocations.
1118 */
1119 if (!total && (atomic_read(&inode->i_writecount) == 0))
1120 ext4_discard_preallocations(inode);
1058} 1121}
1059 1122
1060/* 1123/*
@@ -1688,9 +1751,10 @@ static void ext4_da_page_release_reservation(struct page *page,
1688 1751
1689struct mpage_da_data { 1752struct mpage_da_data {
1690 struct inode *inode; 1753 struct inode *inode;
1691 struct buffer_head lbh; /* extent of blocks */ 1754 sector_t b_blocknr; /* start block number of extent */
1755 size_t b_size; /* size of extent */
1756 unsigned long b_state; /* state of the extent */
1692 unsigned long first_page, next_page; /* extent of pages */ 1757 unsigned long first_page, next_page; /* extent of pages */
1693 get_block_t *get_block;
1694 struct writeback_control *wbc; 1758 struct writeback_control *wbc;
1695 int io_done; 1759 int io_done;
1696 int pages_written; 1760 int pages_written;
@@ -1704,7 +1768,6 @@ struct mpage_da_data {
1704 * @mpd->inode: inode 1768 * @mpd->inode: inode
1705 * @mpd->first_page: first page of the extent 1769 * @mpd->first_page: first page of the extent
1706 * @mpd->next_page: page after the last page of the extent 1770 * @mpd->next_page: page after the last page of the extent
1707 * @mpd->get_block: the filesystem's block mapper function
1708 * 1771 *
1709 * By the time mpage_da_submit_io() is called we expect all blocks 1772 * By the time mpage_da_submit_io() is called we expect all blocks
1710 * to be allocated. this may be wrong if allocation failed. 1773 * to be allocated. this may be wrong if allocation failed.
@@ -1724,7 +1787,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
1724 /* 1787 /*
1725 * We need to start from the first_page to the next_page - 1 1788 * We need to start from the first_page to the next_page - 1
1726 * to make sure we also write the mapped dirty buffer_heads. 1789 * to make sure we also write the mapped dirty buffer_heads.
1727 * If we look at mpd->lbh.b_blocknr we would only be looking 1790 * If we look at mpd->b_blocknr we would only be looking
1728 * at the currently mapped buffer_heads. 1791 * at the currently mapped buffer_heads.
1729 */ 1792 */
1730 index = mpd->first_page; 1793 index = mpd->first_page;
@@ -1914,68 +1977,111 @@ static void ext4_print_free_blocks(struct inode *inode)
1914 return; 1977 return;
1915} 1978}
1916 1979
1980#define EXT4_DELALLOC_RSVED 1
1981static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
1982 struct buffer_head *bh_result, int create)
1983{
1984 int ret;
1985 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1986 loff_t disksize = EXT4_I(inode)->i_disksize;
1987 handle_t *handle = NULL;
1988
1989 handle = ext4_journal_current_handle();
1990 BUG_ON(!handle);
1991 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
1992 bh_result, create, 0, EXT4_DELALLOC_RSVED);
1993 if (ret <= 0)
1994 return ret;
1995
1996 bh_result->b_size = (ret << inode->i_blkbits);
1997
1998 if (ext4_should_order_data(inode)) {
1999 int retval;
2000 retval = ext4_jbd2_file_inode(handle, inode);
2001 if (retval)
2002 /*
2003 * Failed to add inode for ordered mode. Don't
2004 * update file size
2005 */
2006 return retval;
2007 }
2008
2009 /*
2010 * Update on-disk size along with block allocation we don't
2011 * use 'extend_disksize' as size may change within already
2012 * allocated block -bzzz
2013 */
2014 disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
2015 if (disksize > i_size_read(inode))
2016 disksize = i_size_read(inode);
2017 if (disksize > EXT4_I(inode)->i_disksize) {
2018 ext4_update_i_disksize(inode, disksize);
2019 ret = ext4_mark_inode_dirty(handle, inode);
2020 return ret;
2021 }
2022 return 0;
2023}
2024
1917/* 2025/*
1918 * mpage_da_map_blocks - go through given space 2026 * mpage_da_map_blocks - go through given space
1919 * 2027 *
1920 * @mpd->lbh - bh describing space 2028 * @mpd - bh describing space
1921 * @mpd->get_block - the filesystem's block mapper function
1922 * 2029 *
1923 * The function skips space we know is already mapped to disk blocks. 2030 * The function skips space we know is already mapped to disk blocks.
1924 * 2031 *
1925 */ 2032 */
1926static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2033static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1927{ 2034{
1928 int err = 0; 2035 int err = 0;
1929 struct buffer_head new; 2036 struct buffer_head new;
1930 struct buffer_head *lbh = &mpd->lbh;
1931 sector_t next; 2037 sector_t next;
1932 2038
1933 /* 2039 /*
1934 * We consider only non-mapped and non-allocated blocks 2040 * We consider only non-mapped and non-allocated blocks
1935 */ 2041 */
1936 if (buffer_mapped(lbh) && !buffer_delay(lbh)) 2042 if ((mpd->b_state & (1 << BH_Mapped)) &&
2043 !(mpd->b_state & (1 << BH_Delay)))
1937 return 0; 2044 return 0;
1938 new.b_state = lbh->b_state; 2045 new.b_state = mpd->b_state;
1939 new.b_blocknr = 0; 2046 new.b_blocknr = 0;
1940 new.b_size = lbh->b_size; 2047 new.b_size = mpd->b_size;
1941 next = lbh->b_blocknr; 2048 next = mpd->b_blocknr;
1942 /* 2049 /*
1943 * If we didn't accumulate anything 2050 * If we didn't accumulate anything
1944 * to write simply return 2051 * to write simply return
1945 */ 2052 */
1946 if (!new.b_size) 2053 if (!new.b_size)
1947 return 0; 2054 return 0;
1948 err = mpd->get_block(mpd->inode, next, &new, 1);
1949 if (err) {
1950 2055
1951 /* If get block returns with error 2056 err = ext4_da_get_block_write(mpd->inode, next, &new, 1);
1952 * we simply return. Later writepage 2057 if (err) {
1953 * will redirty the page and writepages 2058 /*
1954 * will find the dirty page again 2059 * If get block returns with error we simply
2060 * return. Later writepage will redirty the page and
2061 * writepages will find the dirty page again
1955 */ 2062 */
1956 if (err == -EAGAIN) 2063 if (err == -EAGAIN)
1957 return 0; 2064 return 0;
1958 2065
1959 if (err == -ENOSPC && 2066 if (err == -ENOSPC &&
1960 ext4_count_free_blocks(mpd->inode->i_sb)) { 2067 ext4_count_free_blocks(mpd->inode->i_sb)) {
1961 mpd->retval = err; 2068 mpd->retval = err;
1962 return 0; 2069 return 0;
1963 } 2070 }
1964 2071
1965 /* 2072 /*
1966 * get block failure will cause us 2073 * get block failure will cause us to loop in
1967 * to loop in writepages. Because 2074 * writepages, because a_ops->writepage won't be able
1968 * a_ops->writepage won't be able to 2075 * to make progress. The page will be redirtied by
1969 * make progress. The page will be redirtied 2076 * writepage and writepages will again try to write
1970 * by writepage and writepages will again 2077 * the same.
1971 * try to write the same.
1972 */ 2078 */
1973 printk(KERN_EMERG "%s block allocation failed for inode %lu " 2079 printk(KERN_EMERG "%s block allocation failed for inode %lu "
1974 "at logical offset %llu with max blocks " 2080 "at logical offset %llu with max blocks "
1975 "%zd with error %d\n", 2081 "%zd with error %d\n",
1976 __func__, mpd->inode->i_ino, 2082 __func__, mpd->inode->i_ino,
1977 (unsigned long long)next, 2083 (unsigned long long)next,
1978 lbh->b_size >> mpd->inode->i_blkbits, err); 2084 mpd->b_size >> mpd->inode->i_blkbits, err);
1979 printk(KERN_EMERG "This should not happen.!! " 2085 printk(KERN_EMERG "This should not happen.!! "
1980 "Data will be lost\n"); 2086 "Data will be lost\n");
1981 if (err == -ENOSPC) { 2087 if (err == -ENOSPC) {
@@ -1983,7 +2089,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1983 } 2089 }
1984 /* invlaidate all the pages */ 2090 /* invlaidate all the pages */
1985 ext4_da_block_invalidatepages(mpd, next, 2091 ext4_da_block_invalidatepages(mpd, next,
1986 lbh->b_size >> mpd->inode->i_blkbits); 2092 mpd->b_size >> mpd->inode->i_blkbits);
1987 return err; 2093 return err;
1988 } 2094 }
1989 BUG_ON(new.b_size == 0); 2095 BUG_ON(new.b_size == 0);
@@ -1995,7 +2101,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1995 * If blocks are delayed marked, we need to 2101 * If blocks are delayed marked, we need to
1996 * put actual blocknr and drop delayed bit 2102 * put actual blocknr and drop delayed bit
1997 */ 2103 */
1998 if (buffer_delay(lbh) || buffer_unwritten(lbh)) 2104 if ((mpd->b_state & (1 << BH_Delay)) ||
2105 (mpd->b_state & (1 << BH_Unwritten)))
1999 mpage_put_bnr_to_bhs(mpd, next, &new); 2106 mpage_put_bnr_to_bhs(mpd, next, &new);
2000 2107
2001 return 0; 2108 return 0;
@@ -2014,12 +2121,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2014 * the function is used to collect contig. blocks in same state 2121 * the function is used to collect contig. blocks in same state
2015 */ 2122 */
2016static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, 2123static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2017 sector_t logical, struct buffer_head *bh) 2124 sector_t logical, size_t b_size,
2125 unsigned long b_state)
2018{ 2126{
2019 sector_t next; 2127 sector_t next;
2020 size_t b_size = bh->b_size; 2128 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
2021 struct buffer_head *lbh = &mpd->lbh;
2022 int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
2023 2129
2024 /* check if thereserved journal credits might overflow */ 2130 /* check if thereserved journal credits might overflow */
2025 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 2131 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
@@ -2046,19 +2152,19 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2046 /* 2152 /*
2047 * First block in the extent 2153 * First block in the extent
2048 */ 2154 */
2049 if (lbh->b_size == 0) { 2155 if (mpd->b_size == 0) {
2050 lbh->b_blocknr = logical; 2156 mpd->b_blocknr = logical;
2051 lbh->b_size = b_size; 2157 mpd->b_size = b_size;
2052 lbh->b_state = bh->b_state & BH_FLAGS; 2158 mpd->b_state = b_state & BH_FLAGS;
2053 return; 2159 return;
2054 } 2160 }
2055 2161
2056 next = lbh->b_blocknr + nrblocks; 2162 next = mpd->b_blocknr + nrblocks;
2057 /* 2163 /*
2058 * Can we merge the block to our big extent? 2164 * Can we merge the block to our big extent?
2059 */ 2165 */
2060 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { 2166 if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
2061 lbh->b_size += b_size; 2167 mpd->b_size += b_size;
2062 return; 2168 return;
2063 } 2169 }
2064 2170
@@ -2087,7 +2193,7 @@ static int __mpage_da_writepage(struct page *page,
2087{ 2193{
2088 struct mpage_da_data *mpd = data; 2194 struct mpage_da_data *mpd = data;
2089 struct inode *inode = mpd->inode; 2195 struct inode *inode = mpd->inode;
2090 struct buffer_head *bh, *head, fake; 2196 struct buffer_head *bh, *head;
2091 sector_t logical; 2197 sector_t logical;
2092 2198
2093 if (mpd->io_done) { 2199 if (mpd->io_done) {
@@ -2129,9 +2235,9 @@ static int __mpage_da_writepage(struct page *page,
2129 /* 2235 /*
2130 * ... and blocks 2236 * ... and blocks
2131 */ 2237 */
2132 mpd->lbh.b_size = 0; 2238 mpd->b_size = 0;
2133 mpd->lbh.b_state = 0; 2239 mpd->b_state = 0;
2134 mpd->lbh.b_blocknr = 0; 2240 mpd->b_blocknr = 0;
2135 } 2241 }
2136 2242
2137 mpd->next_page = page->index + 1; 2243 mpd->next_page = page->index + 1;
@@ -2139,16 +2245,8 @@ static int __mpage_da_writepage(struct page *page,
2139 (PAGE_CACHE_SHIFT - inode->i_blkbits); 2245 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2140 2246
2141 if (!page_has_buffers(page)) { 2247 if (!page_has_buffers(page)) {
2142 /* 2248 mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
2143 * There is no attached buffer heads yet (mmap?) 2249 (1 << BH_Dirty) | (1 << BH_Uptodate));
2144 * we treat the page asfull of dirty blocks
2145 */
2146 bh = &fake;
2147 bh->b_size = PAGE_CACHE_SIZE;
2148 bh->b_state = 0;
2149 set_buffer_dirty(bh);
2150 set_buffer_uptodate(bh);
2151 mpage_add_bh_to_extent(mpd, logical, bh);
2152 if (mpd->io_done) 2250 if (mpd->io_done)
2153 return MPAGE_DA_EXTENT_TAIL; 2251 return MPAGE_DA_EXTENT_TAIL;
2154 } else { 2252 } else {
@@ -2166,8 +2264,10 @@ static int __mpage_da_writepage(struct page *page,
2166 * with the page in ext4_da_writepage 2264 * with the page in ext4_da_writepage
2167 */ 2265 */
2168 if (buffer_dirty(bh) && 2266 if (buffer_dirty(bh) &&
2169 (!buffer_mapped(bh) || buffer_delay(bh))) { 2267 (!buffer_mapped(bh) || buffer_delay(bh))) {
2170 mpage_add_bh_to_extent(mpd, logical, bh); 2268 mpage_add_bh_to_extent(mpd, logical,
2269 bh->b_size,
2270 bh->b_state);
2171 if (mpd->io_done) 2271 if (mpd->io_done)
2172 return MPAGE_DA_EXTENT_TAIL; 2272 return MPAGE_DA_EXTENT_TAIL;
2173 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { 2273 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
@@ -2179,9 +2279,8 @@ static int __mpage_da_writepage(struct page *page,
2179 * unmapped buffer_head later we need to 2279 * unmapped buffer_head later we need to
2180 * use the b_state flag of that buffer_head. 2280 * use the b_state flag of that buffer_head.
2181 */ 2281 */
2182 if (mpd->lbh.b_size == 0) 2282 if (mpd->b_size == 0)
2183 mpd->lbh.b_state = 2283 mpd->b_state = bh->b_state & BH_FLAGS;
2184 bh->b_state & BH_FLAGS;
2185 } 2284 }
2186 logical++; 2285 logical++;
2187 } while ((bh = bh->b_this_page) != head); 2286 } while ((bh = bh->b_this_page) != head);
@@ -2191,51 +2290,6 @@ static int __mpage_da_writepage(struct page *page,
2191} 2290}
2192 2291
2193/* 2292/*
2194 * mpage_da_writepages - walk the list of dirty pages of the given
2195 * address space, allocates non-allocated blocks, maps newly-allocated
2196 * blocks to existing bhs and issue IO them
2197 *
2198 * @mapping: address space structure to write
2199 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2200 * @get_block: the filesystem's block mapper function.
2201 *
2202 * This is a library function, which implements the writepages()
2203 * address_space_operation.
2204 */
2205static int mpage_da_writepages(struct address_space *mapping,
2206 struct writeback_control *wbc,
2207 struct mpage_da_data *mpd)
2208{
2209 int ret;
2210
2211 if (!mpd->get_block)
2212 return generic_writepages(mapping, wbc);
2213
2214 mpd->lbh.b_size = 0;
2215 mpd->lbh.b_state = 0;
2216 mpd->lbh.b_blocknr = 0;
2217 mpd->first_page = 0;
2218 mpd->next_page = 0;
2219 mpd->io_done = 0;
2220 mpd->pages_written = 0;
2221 mpd->retval = 0;
2222
2223 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
2224 /*
2225 * Handle last extent of pages
2226 */
2227 if (!mpd->io_done && mpd->next_page != mpd->first_page) {
2228 if (mpage_da_map_blocks(mpd) == 0)
2229 mpage_da_submit_io(mpd);
2230
2231 mpd->io_done = 1;
2232 ret = MPAGE_DA_EXTENT_TAIL;
2233 }
2234 wbc->nr_to_write -= mpd->pages_written;
2235 return ret;
2236}
2237
2238/*
2239 * this is a special callback for ->write_begin() only 2293 * this is a special callback for ->write_begin() only
2240 * it's intention is to return mapped block or reserve space 2294 * it's intention is to return mapped block or reserve space
2241 */ 2295 */
@@ -2274,51 +2328,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2274 2328
2275 return ret; 2329 return ret;
2276} 2330}
2277#define EXT4_DELALLOC_RSVED 1
2278static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
2279 struct buffer_head *bh_result, int create)
2280{
2281 int ret;
2282 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2283 loff_t disksize = EXT4_I(inode)->i_disksize;
2284 handle_t *handle = NULL;
2285
2286 handle = ext4_journal_current_handle();
2287 BUG_ON(!handle);
2288 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2289 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2290 if (ret > 0) {
2291
2292 bh_result->b_size = (ret << inode->i_blkbits);
2293
2294 if (ext4_should_order_data(inode)) {
2295 int retval;
2296 retval = ext4_jbd2_file_inode(handle, inode);
2297 if (retval)
2298 /*
2299 * Failed to add inode for ordered
2300 * mode. Don't update file size
2301 */
2302 return retval;
2303 }
2304
2305 /*
2306 * Update on-disk size along with block allocation
2307 * we don't use 'extend_disksize' as size may change
2308 * within already allocated block -bzzz
2309 */
2310 disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
2311 if (disksize > i_size_read(inode))
2312 disksize = i_size_read(inode);
2313 if (disksize > EXT4_I(inode)->i_disksize) {
2314 ext4_update_i_disksize(inode, disksize);
2315 ret = ext4_mark_inode_dirty(handle, inode);
2316 return ret;
2317 }
2318 ret = 0;
2319 }
2320 return ret;
2321}
2322 2331
2323static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) 2332static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
2324{ 2333{
@@ -2569,8 +2578,38 @@ retry:
2569 dump_stack(); 2578 dump_stack();
2570 goto out_writepages; 2579 goto out_writepages;
2571 } 2580 }
2572 mpd.get_block = ext4_da_get_block_write; 2581
2573 ret = mpage_da_writepages(mapping, wbc, &mpd); 2582 /*
2583 * Now call __mpage_da_writepage to find the next
2584 * contiguous region of logical blocks that need
2585 * blocks to be allocated by ext4. We don't actually
2586 * submit the blocks for I/O here, even though
2587 * write_cache_pages thinks it will, and will set the
2588 * pages as clean for write before calling
2589 * __mpage_da_writepage().
2590 */
2591 mpd.b_size = 0;
2592 mpd.b_state = 0;
2593 mpd.b_blocknr = 0;
2594 mpd.first_page = 0;
2595 mpd.next_page = 0;
2596 mpd.io_done = 0;
2597 mpd.pages_written = 0;
2598 mpd.retval = 0;
2599 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
2600 &mpd);
2601 /*
2602 * If we have a contigous extent of pages and we
2603 * haven't done the I/O yet, map the blocks and submit
2604 * them for I/O.
2605 */
2606 if (!mpd.io_done && mpd.next_page != mpd.first_page) {
2607 if (mpage_da_map_blocks(&mpd) == 0)
2608 mpage_da_submit_io(&mpd);
2609 mpd.io_done = 1;
2610 ret = MPAGE_DA_EXTENT_TAIL;
2611 }
2612 wbc->nr_to_write -= mpd.pages_written;
2574 2613
2575 ext4_journal_stop(handle); 2614 ext4_journal_stop(handle);
2576 2615
@@ -2846,6 +2885,48 @@ out:
2846 return; 2885 return;
2847} 2886}
2848 2887
2888/*
2889 * Force all delayed allocation blocks to be allocated for a given inode.
2890 */
2891int ext4_alloc_da_blocks(struct inode *inode)
2892{
2893 if (!EXT4_I(inode)->i_reserved_data_blocks &&
2894 !EXT4_I(inode)->i_reserved_meta_blocks)
2895 return 0;
2896
2897 /*
2898 * We do something simple for now. The filemap_flush() will
2899 * also start triggering a write of the data blocks, which is
2900 * not strictly speaking necessary (and for users of
2901 * laptop_mode, not even desirable). However, to do otherwise
2902 * would require replicating code paths in:
2903 *
2904 * ext4_da_writepages() ->
2905 * write_cache_pages() ---> (via passed in callback function)
2906 * __mpage_da_writepage() -->
2907 * mpage_add_bh_to_extent()
2908 * mpage_da_map_blocks()
2909 *
2910 * The problem is that write_cache_pages(), located in
2911 * mm/page-writeback.c, marks pages clean in preparation for
2912 * doing I/O, which is not desirable if we're not planning on
2913 * doing I/O at all.
2914 *
2915 * We could call write_cache_pages(), and then redirty all of
2916 * the pages by calling redirty_page_for_writeback() but that
2917 * would be ugly in the extreme. So instead we would need to
2918 * replicate parts of the code in the above functions,
2919 * simplifying them becuase we wouldn't actually intend to
2920 * write out the pages, but rather only collect contiguous
2921 * logical block extents, call the multi-block allocator, and
2922 * then update the buffer heads with the block allocations.
2923 *
2924 * For now, though, we'll cheat by calling filemap_flush(),
2925 * which will map the blocks, and start the I/O, but not
2926 * actually wait for the I/O to complete.
2927 */
2928 return filemap_flush(inode->i_mapping);
2929}
2849 2930
2850/* 2931/*
2851 * bmap() is special. It gets used by applications such as lilo and by 2932 * bmap() is special. It gets used by applications such as lilo and by
@@ -3868,6 +3949,9 @@ void ext4_truncate(struct inode *inode)
3868 if (!ext4_can_truncate(inode)) 3949 if (!ext4_can_truncate(inode))
3869 return; 3950 return;
3870 3951
3952 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3953 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
3954
3871 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 3955 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
3872 ext4_ext_truncate(inode); 3956 ext4_ext_truncate(inode);
3873 return; 3957 return;
@@ -4110,12 +4194,7 @@ make_io:
4110 unsigned num; 4194 unsigned num;
4111 4195
4112 table = ext4_inode_table(sb, gdp); 4196 table = ext4_inode_table(sb, gdp);
4113 /* Make sure s_inode_readahead_blks is a power of 2 */ 4197 /* s_inode_readahead_blks is always a power of 2 */
4114 while (EXT4_SB(sb)->s_inode_readahead_blks &
4115 (EXT4_SB(sb)->s_inode_readahead_blks-1))
4116 EXT4_SB(sb)->s_inode_readahead_blks =
4117 (EXT4_SB(sb)->s_inode_readahead_blks &
4118 (EXT4_SB(sb)->s_inode_readahead_blks-1));
4119 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); 4198 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
4120 if (table > b) 4199 if (table > b)
4121 b = table; 4200 b = table;
@@ -4287,6 +4366,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4287 ei->i_disksize = inode->i_size; 4366 ei->i_disksize = inode->i_size;
4288 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 4367 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
4289 ei->i_block_group = iloc.block_group; 4368 ei->i_block_group = iloc.block_group;
4369 ei->i_last_alloc_group = ~0;
4290 /* 4370 /*
4291 * NOTE! The in-memory inode i_data array is in little-endian order 4371 * NOTE! The in-memory inode i_data array is in little-endian order
4292 * even on big-endian machines: we do NOT byteswap the block numbers! 4372 * even on big-endian machines: we do NOT byteswap the block numbers!
@@ -4329,6 +4409,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4329 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4409 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
4330 } 4410 }
4331 4411
4412 if (ei->i_flags & EXT4_EXTENTS_FL) {
4413 /* Validate extent which is part of inode */
4414 ret = ext4_ext_check_inode(inode);
4415 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4416 (S_ISLNK(inode->i_mode) &&
4417 !ext4_inode_is_fast_symlink(inode))) {
4418 /* Validate block references which are part of inode */
4419 ret = ext4_check_inode_blockref(inode);
4420 }
4421 if (ret) {
4422 brelse(bh);
4423 goto bad_inode;
4424 }
4425
4332 if (S_ISREG(inode->i_mode)) { 4426 if (S_ISREG(inode->i_mode)) {
4333 inode->i_op = &ext4_file_inode_operations; 4427 inode->i_op = &ext4_file_inode_operations;
4334 inode->i_fop = &ext4_file_operations; 4428 inode->i_fop = &ext4_file_operations;
@@ -4345,7 +4439,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4345 inode->i_op = &ext4_symlink_inode_operations; 4439 inode->i_op = &ext4_symlink_inode_operations;
4346 ext4_set_aops(inode); 4440 ext4_set_aops(inode);
4347 } 4441 }
4348 } else { 4442 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
4443 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
4349 inode->i_op = &ext4_special_inode_operations; 4444 inode->i_op = &ext4_special_inode_operations;
4350 if (raw_inode->i_block[0]) 4445 if (raw_inode->i_block[0])
4351 init_special_inode(inode, inode->i_mode, 4446 init_special_inode(inode, inode->i_mode,
@@ -4353,6 +4448,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4353 else 4448 else
4354 init_special_inode(inode, inode->i_mode, 4449 init_special_inode(inode, inode->i_mode,
4355 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4450 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4451 } else {
4452 brelse(bh);
4453 ret = -EIO;
4454 ext4_error(inode->i_sb, __func__,
4455 "bogus i_mode (%o) for inode=%lu",
4456 inode->i_mode, inode->i_ino);
4457 goto bad_inode;
4356 } 4458 }
4357 brelse(iloc.bh); 4459 brelse(iloc.bh);
4358 ext4_set_inode_flags(inode); 4460 ext4_set_inode_flags(inode);
@@ -5146,8 +5248,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
5146 return !buffer_mapped(bh); 5248 return !buffer_mapped(bh);
5147} 5249}
5148 5250
5149int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) 5251int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5150{ 5252{
5253 struct page *page = vmf->page;
5151 loff_t size; 5254 loff_t size;
5152 unsigned long len; 5255 unsigned long len;
5153 int ret = -EINVAL; 5256 int ret = -EINVAL;
@@ -5199,6 +5302,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
5199 goto out_unlock; 5302 goto out_unlock;
5200 ret = 0; 5303 ret = 0;
5201out_unlock: 5304out_unlock:
5305 if (ret)
5306 ret = VM_FAULT_SIGBUS;
5202 up_read(&inode->i_alloc_sem); 5307 up_read(&inode->i_alloc_sem);
5203 return ret; 5308 return ret;
5204} 5309}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 42dc83fb247..91e75f7a9e7 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -48,8 +48,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
48 if (err) 48 if (err)
49 return err; 49 return err;
50 50
51 if (!S_ISDIR(inode->i_mode)) 51 flags = ext4_mask_flags(inode->i_mode, flags);
52 flags &= ~EXT4_DIRSYNC_FL;
53 52
54 err = -EPERM; 53 err = -EPERM;
55 mutex_lock(&inode->i_mutex); 54 mutex_lock(&inode->i_mutex);
@@ -263,6 +262,20 @@ setversion_out:
263 return err; 262 return err;
264 } 263 }
265 264
265 case EXT4_IOC_ALLOC_DA_BLKS:
266 {
267 int err;
268 if (!is_owner_or_cap(inode))
269 return -EACCES;
270
271 err = mnt_want_write(filp->f_path.mnt);
272 if (err)
273 return err;
274 err = ext4_alloc_da_blocks(inode);
275 mnt_drop_write(filp->f_path.mnt);
276 return err;
277 }
278
266 default: 279 default:
267 return -ENOTTY; 280 return -ENOTTY;
268 } 281 }
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b038188bd03..f871677a798 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -46,22 +46,23 @@
46 * The allocation request involve request for multiple number of blocks 46 * The allocation request involve request for multiple number of blocks
47 * near to the goal(block) value specified. 47 * near to the goal(block) value specified.
48 * 48 *
49 * During initialization phase of the allocator we decide to use the group 49 * During initialization phase of the allocator we decide to use the
50 * preallocation or inode preallocation depending on the size file. The 50 * group preallocation or inode preallocation depending on the size of
51 * size of the file could be the resulting file size we would have after 51 * the file. The size of the file could be the resulting file size we
52 * allocation or the current file size which ever is larger. If the size is 52 * would have after allocation, or the current file size, which ever
53 * less that sbi->s_mb_stream_request we select the group 53 * is larger. If the size is less than sbi->s_mb_stream_request we
54 * preallocation. The default value of s_mb_stream_request is 16 54 * select to use the group preallocation. The default value of
55 * blocks. This can also be tuned via 55 * s_mb_stream_request is 16 blocks. This can also be tuned via
56 * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms 56 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
57 * of number of blocks. 57 * terms of number of blocks.
58 * 58 *
59 * The main motivation for having small file use group preallocation is to 59 * The main motivation for having small file use group preallocation is to
60 * ensure that we have small file closer in the disk. 60 * ensure that we have small files closer together on the disk.
61 * 61 *
62 * First stage the allocator looks at the inode prealloc list 62 * First stage the allocator looks at the inode prealloc list,
63 * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for 63 * ext4_inode_info->i_prealloc_list, which contains list of prealloc
64 * this particular inode. The inode prealloc space is represented as: 64 * spaces for this particular inode. The inode prealloc space is
65 * represented as:
65 * 66 *
66 * pa_lstart -> the logical start block for this prealloc space 67 * pa_lstart -> the logical start block for this prealloc space
67 * pa_pstart -> the physical start block for this prealloc space 68 * pa_pstart -> the physical start block for this prealloc space
@@ -121,29 +122,29 @@
121 * list. In case of inode preallocation we follow a list of heuristics 122 * list. In case of inode preallocation we follow a list of heuristics
122 * based on file size. This can be found in ext4_mb_normalize_request. If 123 * based on file size. This can be found in ext4_mb_normalize_request. If
123 * we are doing a group prealloc we try to normalize the request to 124 * we are doing a group prealloc we try to normalize the request to
124 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to 125 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
125 * 512 blocks. This can be tuned via 126 * 512 blocks. This can be tuned via
126 * /proc/fs/ext4/<partition/group_prealloc. The value is represented in 127 * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in
127 * terms of number of blocks. If we have mounted the file system with -O 128 * terms of number of blocks. If we have mounted the file system with -O
128 * stripe=<value> option the group prealloc request is normalized to the 129 * stripe=<value> option the group prealloc request is normalized to the
129 * stripe value (sbi->s_stripe) 130 * stripe value (sbi->s_stripe)
130 * 131 *
131 * The regular allocator(using the buddy cache) support few tunables. 132 * The regular allocator(using the buddy cache) supports few tunables.
132 * 133 *
133 * /proc/fs/ext4/<partition>/min_to_scan 134 * /sys/fs/ext4/<partition>/mb_min_to_scan
134 * /proc/fs/ext4/<partition>/max_to_scan 135 * /sys/fs/ext4/<partition>/mb_max_to_scan
135 * /proc/fs/ext4/<partition>/order2_req 136 * /sys/fs/ext4/<partition>/mb_order2_req
136 * 137 *
137 * The regular allocator use buddy scan only if the request len is power of 138 * The regular allocator uses buddy scan only if the request len is power of
138 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The 139 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
139 * value of s_mb_order2_reqs can be tuned via 140 * value of s_mb_order2_reqs can be tuned via
140 * /proc/fs/ext4/<partition>/order2_req. If the request len is equal to 141 * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
141 * stripe size (sbi->s_stripe), we try to search for contigous block in 142 * stripe size (sbi->s_stripe), we try to search for contigous block in
142 * stripe size. This should result in better allocation on RAID setup. If 143 * stripe size. This should result in better allocation on RAID setups. If
143 * not we search in the specific group using bitmap for best extents. The 144 * not, we search in the specific group using bitmap for best extents. The
144 * tunable min_to_scan and max_to_scan controll the behaviour here. 145 * tunable min_to_scan and max_to_scan control the behaviour here.
145 * min_to_scan indicate how long the mballoc __must__ look for a best 146 * min_to_scan indicate how long the mballoc __must__ look for a best
146 * extent and max_to_scanindicate how long the mballoc __can__ look for a 147 * extent and max_to_scan indicates how long the mballoc __can__ look for a
147 * best extent in the found extents. Searching for the blocks starts with 148 * best extent in the found extents. Searching for the blocks starts with
148 * the group specified as the goal value in allocation context via 149 * the group specified as the goal value in allocation context via
149 * ac_g_ex. Each group is first checked based on the criteria whether it 150 * ac_g_ex. Each group is first checked based on the criteria whether it
@@ -337,8 +338,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
337 ext4_group_t group); 338 ext4_group_t group);
338static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 339static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
339 ext4_group_t group); 340 ext4_group_t group);
340static int ext4_mb_init_per_dev_proc(struct super_block *sb);
341static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
342static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); 341static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
343 342
344 343
@@ -1726,6 +1725,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1726{ 1725{
1727 unsigned free, fragments; 1726 unsigned free, fragments;
1728 unsigned i, bits; 1727 unsigned i, bits;
1728 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
1729 struct ext4_group_desc *desc; 1729 struct ext4_group_desc *desc;
1730 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1730 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1731 1731
@@ -1747,6 +1747,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1747 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) 1747 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1748 return 0; 1748 return 0;
1749 1749
1750 /* Avoid using the first bg of a flexgroup for data files */
1751 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
1752 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
1753 ((group % flex_size) == 0))
1754 return 0;
1755
1750 bits = ac->ac_sb->s_blocksize_bits + 1; 1756 bits = ac->ac_sb->s_blocksize_bits + 1;
1751 for (i = ac->ac_2order; i <= bits; i++) 1757 for (i = ac->ac_2order; i <= bits; i++)
1752 if (grp->bb_counters[i] > 0) 1758 if (grp->bb_counters[i] > 0)
@@ -1971,7 +1977,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1971 /* 1977 /*
1972 * We search using buddy data only if the order of the request 1978 * We search using buddy data only if the order of the request
1973 * is greater than equal to the sbi_s_mb_order2_reqs 1979 * is greater than equal to the sbi_s_mb_order2_reqs
1974 * You can tune it via /proc/fs/ext4/<partition>/order2_req 1980 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
1975 */ 1981 */
1976 if (i >= sbi->s_mb_order2_reqs) { 1982 if (i >= sbi->s_mb_order2_reqs) {
1977 /* 1983 /*
@@ -2693,7 +2699,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2693 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); 2699 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
2694 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2700 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2695 if (sbi->s_mb_maxs == NULL) { 2701 if (sbi->s_mb_maxs == NULL) {
2696 kfree(sbi->s_mb_maxs); 2702 kfree(sbi->s_mb_offsets);
2697 return -ENOMEM; 2703 return -ENOMEM;
2698 } 2704 }
2699 2705
@@ -2746,7 +2752,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2746 spin_lock_init(&lg->lg_prealloc_lock); 2752 spin_lock_init(&lg->lg_prealloc_lock);
2747 } 2753 }
2748 2754
2749 ext4_mb_init_per_dev_proc(sb);
2750 ext4_mb_history_init(sb); 2755 ext4_mb_history_init(sb);
2751 2756
2752 if (sbi->s_journal) 2757 if (sbi->s_journal)
@@ -2829,7 +2834,6 @@ int ext4_mb_release(struct super_block *sb)
2829 2834
2830 free_percpu(sbi->s_locality_groups); 2835 free_percpu(sbi->s_locality_groups);
2831 ext4_mb_history_release(sb); 2836 ext4_mb_history_release(sb);
2832 ext4_mb_destroy_per_dev_proc(sb);
2833 2837
2834 return 0; 2838 return 0;
2835} 2839}
@@ -2890,62 +2894,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2890 mb_debug("freed %u blocks in %u structures\n", count, count2); 2894 mb_debug("freed %u blocks in %u structures\n", count, count2);
2891} 2895}
2892 2896
2893#define EXT4_MB_STATS_NAME "stats"
2894#define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan"
2895#define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan"
2896#define EXT4_MB_ORDER2_REQ "order2_req"
2897#define EXT4_MB_STREAM_REQ "stream_req"
2898#define EXT4_MB_GROUP_PREALLOC "group_prealloc"
2899
2900static int ext4_mb_init_per_dev_proc(struct super_block *sb)
2901{
2902#ifdef CONFIG_PROC_FS
2903 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
2904 struct ext4_sb_info *sbi = EXT4_SB(sb);
2905 struct proc_dir_entry *proc;
2906
2907 if (sbi->s_proc == NULL)
2908 return -EINVAL;
2909
2910 EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
2911 EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
2912 EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
2913 EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
2914 EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
2915 EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
2916 return 0;
2917
2918err_out:
2919 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2920 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2921 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2922 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2923 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2924 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2925 return -ENOMEM;
2926#else
2927 return 0;
2928#endif
2929}
2930
2931static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
2932{
2933#ifdef CONFIG_PROC_FS
2934 struct ext4_sb_info *sbi = EXT4_SB(sb);
2935
2936 if (sbi->s_proc == NULL)
2937 return -EINVAL;
2938
2939 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2940 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2941 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2942 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2943 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2944 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2945#endif
2946 return 0;
2947}
2948
2949int __init init_ext4_mballoc(void) 2897int __init init_ext4_mballoc(void)
2950{ 2898{
2951 ext4_pspace_cachep = 2899 ext4_pspace_cachep =
@@ -3096,9 +3044,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
3096 if (sbi->s_log_groups_per_flex) { 3044 if (sbi->s_log_groups_per_flex) {
3097 ext4_group_t flex_group = ext4_flex_group(sbi, 3045 ext4_group_t flex_group = ext4_flex_group(sbi,
3098 ac->ac_b_ex.fe_group); 3046 ac->ac_b_ex.fe_group);
3099 spin_lock(sb_bgl_lock(sbi, flex_group)); 3047 atomic_sub(ac->ac_b_ex.fe_len,
3100 sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len; 3048 &sbi->s_flex_groups[flex_group].free_blocks);
3101 spin_unlock(sb_bgl_lock(sbi, flex_group));
3102 } 3049 }
3103 3050
3104 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 3051 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -3116,7 +3063,7 @@ out_err:
3116 * here we normalize request for locality group 3063 * here we normalize request for locality group
3117 * Group request are normalized to s_strip size if we set the same via mount 3064 * Group request are normalized to s_strip size if we set the same via mount
3118 * option. If not we set it to s_mb_group_prealloc which can be configured via 3065 * option. If not we set it to s_mb_group_prealloc which can be configured via
3119 * /proc/fs/ext4/<partition>/group_prealloc 3066 * /sys/fs/ext4/<partition>/mb_group_prealloc
3120 * 3067 *
3121 * XXX: should we try to preallocate more than the group has now? 3068 * XXX: should we try to preallocate more than the group has now?
3122 */ 3069 */
@@ -3608,8 +3555,11 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3608 spin_unlock(&pa->pa_lock); 3555 spin_unlock(&pa->pa_lock);
3609 3556
3610 grp_blk = pa->pa_pstart; 3557 grp_blk = pa->pa_pstart;
3611 /* If linear, pa_pstart may be in the next group when pa is used up */ 3558 /*
3612 if (pa->pa_linear) 3559 * If doing group-based preallocation, pa_pstart may be in the
3560 * next group when pa is used up
3561 */
3562 if (pa->pa_type == MB_GROUP_PA)
3613 grp_blk--; 3563 grp_blk--;
3614 3564
3615 ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); 3565 ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
@@ -3704,7 +3654,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3704 INIT_LIST_HEAD(&pa->pa_inode_list); 3654 INIT_LIST_HEAD(&pa->pa_inode_list);
3705 INIT_LIST_HEAD(&pa->pa_group_list); 3655 INIT_LIST_HEAD(&pa->pa_group_list);
3706 pa->pa_deleted = 0; 3656 pa->pa_deleted = 0;
3707 pa->pa_linear = 0; 3657 pa->pa_type = MB_INODE_PA;
3708 3658
3709 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3659 mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
3710 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3660 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -3767,7 +3717,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3767 INIT_LIST_HEAD(&pa->pa_inode_list); 3717 INIT_LIST_HEAD(&pa->pa_inode_list);
3768 INIT_LIST_HEAD(&pa->pa_group_list); 3718 INIT_LIST_HEAD(&pa->pa_group_list);
3769 pa->pa_deleted = 0; 3719 pa->pa_deleted = 0;
3770 pa->pa_linear = 1; 3720 pa->pa_type = MB_GROUP_PA;
3771 3721
3772 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3722 mb_debug("new group pa %p: %llu/%u for %u\n", pa,
3773 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3723 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -4021,7 +3971,7 @@ repeat:
4021 list_del_rcu(&pa->pa_inode_list); 3971 list_del_rcu(&pa->pa_inode_list);
4022 spin_unlock(pa->pa_obj_lock); 3972 spin_unlock(pa->pa_obj_lock);
4023 3973
4024 if (pa->pa_linear) 3974 if (pa->pa_type == MB_GROUP_PA)
4025 ext4_mb_release_group_pa(&e4b, pa, ac); 3975 ext4_mb_release_group_pa(&e4b, pa, ac);
4026 else 3976 else
4027 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3977 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
@@ -4121,7 +4071,7 @@ repeat:
4121 spin_unlock(&ei->i_prealloc_lock); 4071 spin_unlock(&ei->i_prealloc_lock);
4122 4072
4123 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 4073 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
4124 BUG_ON(pa->pa_linear != 0); 4074 BUG_ON(pa->pa_type != MB_INODE_PA);
4125 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4075 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
4126 4076
4127 err = ext4_mb_load_buddy(sb, group, &e4b); 4077 err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -4232,7 +4182,7 @@ static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4232 * file is determined by the current size or the resulting size after 4182 * file is determined by the current size or the resulting size after
4233 * allocation which ever is larger 4183 * allocation which ever is larger
4234 * 4184 *
4235 * One can tune this size via /proc/fs/ext4/<partition>/stream_req 4185 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
4236 */ 4186 */
4237static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) 4187static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4238{ 4188{
@@ -4373,7 +4323,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4373 continue; 4323 continue;
4374 } 4324 }
4375 /* only lg prealloc space */ 4325 /* only lg prealloc space */
4376 BUG_ON(!pa->pa_linear); 4326 BUG_ON(pa->pa_type != MB_GROUP_PA);
4377 4327
4378 /* seems this one can be freed ... */ 4328 /* seems this one can be freed ... */
4379 pa->pa_deleted = 1; 4329 pa->pa_deleted = 1;
@@ -4442,7 +4392,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
4442 pa_inode_list) { 4392 pa_inode_list) {
4443 spin_lock(&tmp_pa->pa_lock); 4393 spin_lock(&tmp_pa->pa_lock);
4444 if (tmp_pa->pa_deleted) { 4394 if (tmp_pa->pa_deleted) {
4445 spin_unlock(&pa->pa_lock); 4395 spin_unlock(&tmp_pa->pa_lock);
4446 continue; 4396 continue;
4447 } 4397 }
4448 if (!added && pa->pa_free < tmp_pa->pa_free) { 4398 if (!added && pa->pa_free < tmp_pa->pa_free) {
@@ -4479,7 +4429,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4479{ 4429{
4480 struct ext4_prealloc_space *pa = ac->ac_pa; 4430 struct ext4_prealloc_space *pa = ac->ac_pa;
4481 if (pa) { 4431 if (pa) {
4482 if (pa->pa_linear) { 4432 if (pa->pa_type == MB_GROUP_PA) {
4483 /* see comment in ext4_mb_use_group_pa() */ 4433 /* see comment in ext4_mb_use_group_pa() */
4484 spin_lock(&pa->pa_lock); 4434 spin_lock(&pa->pa_lock);
4485 pa->pa_pstart += ac->ac_b_ex.fe_len; 4435 pa->pa_pstart += ac->ac_b_ex.fe_len;
@@ -4499,7 +4449,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4499 * doesn't grow big. We need to release 4449 * doesn't grow big. We need to release
4500 * alloc_semp before calling ext4_mb_add_n_trim() 4450 * alloc_semp before calling ext4_mb_add_n_trim()
4501 */ 4451 */
4502 if (pa->pa_linear && likely(pa->pa_free)) { 4452 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4503 spin_lock(pa->pa_obj_lock); 4453 spin_lock(pa->pa_obj_lock);
4504 list_del_rcu(&pa->pa_inode_list); 4454 list_del_rcu(&pa->pa_inode_list);
4505 spin_unlock(pa->pa_obj_lock); 4455 spin_unlock(pa->pa_obj_lock);
@@ -4936,9 +4886,7 @@ do_more:
4936 4886
4937 if (sbi->s_log_groups_per_flex) { 4887 if (sbi->s_log_groups_per_flex) {
4938 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4888 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4939 spin_lock(sb_bgl_lock(sbi, flex_group)); 4889 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
4940 sbi->s_flex_groups[flex_group].free_blocks += count;
4941 spin_unlock(sb_bgl_lock(sbi, flex_group));
4942 } 4890 }
4943 4891
4944 ext4_mb_release_desc(&e4b); 4892 ext4_mb_release_desc(&e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 10a2921baf1..dd9e6cd5f6c 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -132,12 +132,15 @@ struct ext4_prealloc_space {
132 ext4_lblk_t pa_lstart; /* log. block */ 132 ext4_lblk_t pa_lstart; /* log. block */
133 unsigned short pa_len; /* len of preallocated chunk */ 133 unsigned short pa_len; /* len of preallocated chunk */
134 unsigned short pa_free; /* how many blocks are free */ 134 unsigned short pa_free; /* how many blocks are free */
135 unsigned short pa_linear; /* consumed in one direction 135 unsigned short pa_type; /* pa type. inode or group */
136 * strictly, for grp prealloc */
137 spinlock_t *pa_obj_lock; 136 spinlock_t *pa_obj_lock;
138 struct inode *pa_inode; /* hack, for history only */ 137 struct inode *pa_inode; /* hack, for history only */
139}; 138};
140 139
140enum {
141 MB_INODE_PA = 0,
142 MB_GROUP_PA = 1
143};
141 144
142struct ext4_free_extent { 145struct ext4_free_extent {
143 ext4_lblk_t fe_logical; 146 ext4_lblk_t fe_logical;
@@ -247,7 +250,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
247 250
248#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 251#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
249 252
250struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
251static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 253static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
252 struct ext4_free_extent *fex) 254 struct ext4_free_extent *fex)
253{ 255{
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 83410244d3e..22098e1cd08 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(const struct qstr *d_name,
161 struct dx_frame *frame, 161 struct dx_frame *frame,
162 int *err); 162 int *err);
163static void dx_release(struct dx_frame *frames); 163static void dx_release(struct dx_frame *frames);
164static int dx_make_map(struct ext4_dir_entry_2 *de, int size, 164static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
165 struct dx_hash_info *hinfo, struct dx_map_entry map[]); 165 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
166static void dx_sort_map(struct dx_map_entry *map, unsigned count); 166static void dx_sort_map(struct dx_map_entry *map, unsigned count);
167static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, 167static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
168 struct dx_map_entry *offsets, int count); 168 struct dx_map_entry *offsets, int count, unsigned blocksize);
169static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size); 169static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
170static void dx_insert_block(struct dx_frame *frame, 170static void dx_insert_block(struct dx_frame *frame,
171 u32 hash, ext4_lblk_t block); 171 u32 hash, ext4_lblk_t block);
172static int ext4_htree_next_block(struct inode *dir, __u32 hash, 172static int ext4_htree_next_block(struct inode *dir, __u32 hash,
@@ -180,14 +180,38 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
180static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, 180static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
181 struct inode *inode); 181 struct inode *inode);
182 182
183unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
184{
185 unsigned len = le16_to_cpu(dlen);
186
187 if (len == EXT4_MAX_REC_LEN || len == 0)
188 return blocksize;
189 return (len & 65532) | ((len & 3) << 16);
190}
191
192__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
193{
194 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
195 BUG();
196 if (len < 65536)
197 return cpu_to_le16(len);
198 if (len == blocksize) {
199 if (blocksize == 65536)
200 return cpu_to_le16(EXT4_MAX_REC_LEN);
201 else
202 return cpu_to_le16(0);
203 }
204 return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
205}
206
183/* 207/*
184 * p is at least 6 bytes before the end of page 208 * p is at least 6 bytes before the end of page
185 */ 209 */
186static inline struct ext4_dir_entry_2 * 210static inline struct ext4_dir_entry_2 *
187ext4_next_entry(struct ext4_dir_entry_2 *p) 211ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
188{ 212{
189 return (struct ext4_dir_entry_2 *)((char *)p + 213 return (struct ext4_dir_entry_2 *)((char *)p +
190 ext4_rec_len_from_disk(p->rec_len)); 214 ext4_rec_len_from_disk(p->rec_len, blocksize));
191} 215}
192 216
193/* 217/*
@@ -294,7 +318,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
294 space += EXT4_DIR_REC_LEN(de->name_len); 318 space += EXT4_DIR_REC_LEN(de->name_len);
295 names++; 319 names++;
296 } 320 }
297 de = ext4_next_entry(de); 321 de = ext4_next_entry(de, size);
298 } 322 }
299 printk("(%i)\n", names); 323 printk("(%i)\n", names);
300 return (struct stats) { names, space, 1 }; 324 return (struct stats) { names, space, 1 };
@@ -585,7 +609,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
585 top = (struct ext4_dir_entry_2 *) ((char *) de + 609 top = (struct ext4_dir_entry_2 *) ((char *) de +
586 dir->i_sb->s_blocksize - 610 dir->i_sb->s_blocksize -
587 EXT4_DIR_REC_LEN(0)); 611 EXT4_DIR_REC_LEN(0));
588 for (; de < top; de = ext4_next_entry(de)) { 612 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
589 if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, 613 if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
590 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 614 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
591 +((char *)de - bh->b_data))) { 615 +((char *)de - bh->b_data))) {
@@ -663,7 +687,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
663 } 687 }
664 if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { 688 if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
665 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; 689 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
666 de = ext4_next_entry(de); 690 de = ext4_next_entry(de, dir->i_sb->s_blocksize);
667 if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0) 691 if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
668 goto errout; 692 goto errout;
669 count++; 693 count++;
@@ -713,15 +737,15 @@ errout:
713 * Create map of hash values, offsets, and sizes, stored at end of block. 737 * Create map of hash values, offsets, and sizes, stored at end of block.
714 * Returns number of entries mapped. 738 * Returns number of entries mapped.
715 */ 739 */
716static int dx_make_map (struct ext4_dir_entry_2 *de, int size, 740static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
717 struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) 741 struct dx_hash_info *hinfo,
742 struct dx_map_entry *map_tail)
718{ 743{
719 int count = 0; 744 int count = 0;
720 char *base = (char *) de; 745 char *base = (char *) de;
721 struct dx_hash_info h = *hinfo; 746 struct dx_hash_info h = *hinfo;
722 747
723 while ((char *) de < base + size) 748 while ((char *) de < base + blocksize) {
724 {
725 if (de->name_len && de->inode) { 749 if (de->name_len && de->inode) {
726 ext4fs_dirhash(de->name, de->name_len, &h); 750 ext4fs_dirhash(de->name, de->name_len, &h);
727 map_tail--; 751 map_tail--;
@@ -732,7 +756,7 @@ static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
732 cond_resched(); 756 cond_resched();
733 } 757 }
734 /* XXX: do we need to check rec_len == 0 case? -Chris */ 758 /* XXX: do we need to check rec_len == 0 case? -Chris */
735 de = ext4_next_entry(de); 759 de = ext4_next_entry(de, blocksize);
736 } 760 }
737 return count; 761 return count;
738} 762}
@@ -832,7 +856,8 @@ static inline int search_dirblock(struct buffer_head *bh,
832 return 1; 856 return 1;
833 } 857 }
834 /* prevent looping on a bad block */ 858 /* prevent looping on a bad block */
835 de_len = ext4_rec_len_from_disk(de->rec_len); 859 de_len = ext4_rec_len_from_disk(de->rec_len,
860 dir->i_sb->s_blocksize);
836 if (de_len <= 0) 861 if (de_len <= 0)
837 return -1; 862 return -1;
838 offset += de_len; 863 offset += de_len;
@@ -996,7 +1021,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
996 de = (struct ext4_dir_entry_2 *) bh->b_data; 1021 de = (struct ext4_dir_entry_2 *) bh->b_data;
997 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - 1022 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
998 EXT4_DIR_REC_LEN(0)); 1023 EXT4_DIR_REC_LEN(0));
999 for (; de < top; de = ext4_next_entry(de)) { 1024 for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
1000 int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) 1025 int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
1001 + ((char *) de - bh->b_data); 1026 + ((char *) de - bh->b_data);
1002 1027
@@ -1052,8 +1077,16 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1052 return ERR_PTR(-EIO); 1077 return ERR_PTR(-EIO);
1053 } 1078 }
1054 inode = ext4_iget(dir->i_sb, ino); 1079 inode = ext4_iget(dir->i_sb, ino);
1055 if (IS_ERR(inode)) 1080 if (unlikely(IS_ERR(inode))) {
1056 return ERR_CAST(inode); 1081 if (PTR_ERR(inode) == -ESTALE) {
1082 ext4_error(dir->i_sb, __func__,
1083 "deleted inode referenced: %u",
1084 ino);
1085 return ERR_PTR(-EIO);
1086 } else {
1087 return ERR_CAST(inode);
1088 }
1089 }
1057 } 1090 }
1058 return d_splice_alias(inode, dentry); 1091 return d_splice_alias(inode, dentry);
1059} 1092}
@@ -1109,7 +1142,8 @@ static inline void ext4_set_de_type(struct super_block *sb,
1109 * Returns pointer to last entry moved. 1142 * Returns pointer to last entry moved.
1110 */ 1143 */
1111static struct ext4_dir_entry_2 * 1144static struct ext4_dir_entry_2 *
1112dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) 1145dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
1146 unsigned blocksize)
1113{ 1147{
1114 unsigned rec_len = 0; 1148 unsigned rec_len = 0;
1115 1149
@@ -1118,7 +1152,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1118 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1152 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1119 memcpy (to, de, rec_len); 1153 memcpy (to, de, rec_len);
1120 ((struct ext4_dir_entry_2 *) to)->rec_len = 1154 ((struct ext4_dir_entry_2 *) to)->rec_len =
1121 ext4_rec_len_to_disk(rec_len); 1155 ext4_rec_len_to_disk(rec_len, blocksize);
1122 de->inode = 0; 1156 de->inode = 0;
1123 map++; 1157 map++;
1124 to += rec_len; 1158 to += rec_len;
@@ -1130,19 +1164,19 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1130 * Compact each dir entry in the range to the minimal rec_len. 1164 * Compact each dir entry in the range to the minimal rec_len.
1131 * Returns pointer to last entry in range. 1165 * Returns pointer to last entry in range.
1132 */ 1166 */
1133static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size) 1167static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
1134{ 1168{
1135 struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; 1169 struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
1136 unsigned rec_len = 0; 1170 unsigned rec_len = 0;
1137 1171
1138 prev = to = de; 1172 prev = to = de;
1139 while ((char*)de < base + size) { 1173 while ((char*)de < base + blocksize) {
1140 next = ext4_next_entry(de); 1174 next = ext4_next_entry(de, blocksize);
1141 if (de->inode && de->name_len) { 1175 if (de->inode && de->name_len) {
1142 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1176 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1143 if (de > to) 1177 if (de > to)
1144 memmove(to, de, rec_len); 1178 memmove(to, de, rec_len);
1145 to->rec_len = ext4_rec_len_to_disk(rec_len); 1179 to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
1146 prev = to; 1180 prev = to;
1147 to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); 1181 to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
1148 } 1182 }
@@ -1215,10 +1249,12 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1215 hash2, split, count-split)); 1249 hash2, split, count-split));
1216 1250
1217 /* Fancy dance to stay within two buffers */ 1251 /* Fancy dance to stay within two buffers */
1218 de2 = dx_move_dirents(data1, data2, map + split, count - split); 1252 de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
1219 de = dx_pack_dirents(data1, blocksize); 1253 de = dx_pack_dirents(data1, blocksize);
1220 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); 1254 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
1221 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2); 1255 blocksize);
1256 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2,
1257 blocksize);
1222 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); 1258 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
1223 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); 1259 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
1224 1260
@@ -1268,6 +1304,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1268 const char *name = dentry->d_name.name; 1304 const char *name = dentry->d_name.name;
1269 int namelen = dentry->d_name.len; 1305 int namelen = dentry->d_name.len;
1270 unsigned int offset = 0; 1306 unsigned int offset = 0;
1307 unsigned int blocksize = dir->i_sb->s_blocksize;
1271 unsigned short reclen; 1308 unsigned short reclen;
1272 int nlen, rlen, err; 1309 int nlen, rlen, err;
1273 char *top; 1310 char *top;
@@ -1275,7 +1312,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1275 reclen = EXT4_DIR_REC_LEN(namelen); 1312 reclen = EXT4_DIR_REC_LEN(namelen);
1276 if (!de) { 1313 if (!de) {
1277 de = (struct ext4_dir_entry_2 *)bh->b_data; 1314 de = (struct ext4_dir_entry_2 *)bh->b_data;
1278 top = bh->b_data + dir->i_sb->s_blocksize - reclen; 1315 top = bh->b_data + blocksize - reclen;
1279 while ((char *) de <= top) { 1316 while ((char *) de <= top) {
1280 if (!ext4_check_dir_entry("ext4_add_entry", dir, de, 1317 if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
1281 bh, offset)) { 1318 bh, offset)) {
@@ -1287,7 +1324,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1287 return -EEXIST; 1324 return -EEXIST;
1288 } 1325 }
1289 nlen = EXT4_DIR_REC_LEN(de->name_len); 1326 nlen = EXT4_DIR_REC_LEN(de->name_len);
1290 rlen = ext4_rec_len_from_disk(de->rec_len); 1327 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
1291 if ((de->inode? rlen - nlen: rlen) >= reclen) 1328 if ((de->inode? rlen - nlen: rlen) >= reclen)
1292 break; 1329 break;
1293 de = (struct ext4_dir_entry_2 *)((char *)de + rlen); 1330 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
@@ -1306,11 +1343,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1306 1343
1307 /* By now the buffer is marked for journaling */ 1344 /* By now the buffer is marked for journaling */
1308 nlen = EXT4_DIR_REC_LEN(de->name_len); 1345 nlen = EXT4_DIR_REC_LEN(de->name_len);
1309 rlen = ext4_rec_len_from_disk(de->rec_len); 1346 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
1310 if (de->inode) { 1347 if (de->inode) {
1311 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); 1348 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
1312 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen); 1349 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
1313 de->rec_len = ext4_rec_len_to_disk(nlen); 1350 de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
1314 de = de1; 1351 de = de1;
1315 } 1352 }
1316 de->file_type = EXT4_FT_UNKNOWN; 1353 de->file_type = EXT4_FT_UNKNOWN;
@@ -1380,7 +1417,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1380 /* The 0th block becomes the root, move the dirents out */ 1417 /* The 0th block becomes the root, move the dirents out */
1381 fde = &root->dotdot; 1418 fde = &root->dotdot;
1382 de = (struct ext4_dir_entry_2 *)((char *)fde + 1419 de = (struct ext4_dir_entry_2 *)((char *)fde +
1383 ext4_rec_len_from_disk(fde->rec_len)); 1420 ext4_rec_len_from_disk(fde->rec_len, blocksize));
1384 if ((char *) de >= (((char *) root) + blocksize)) { 1421 if ((char *) de >= (((char *) root) + blocksize)) {
1385 ext4_error(dir->i_sb, __func__, 1422 ext4_error(dir->i_sb, __func__,
1386 "invalid rec_len for '..' in inode %lu", 1423 "invalid rec_len for '..' in inode %lu",
@@ -1402,12 +1439,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1402 memcpy (data1, de, len); 1439 memcpy (data1, de, len);
1403 de = (struct ext4_dir_entry_2 *) data1; 1440 de = (struct ext4_dir_entry_2 *) data1;
1404 top = data1 + len; 1441 top = data1 + len;
1405 while ((char *)(de2 = ext4_next_entry(de)) < top) 1442 while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
1406 de = de2; 1443 de = de2;
1407 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); 1444 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
1445 blocksize);
1408 /* Initialize the root; the dot dirents already exist */ 1446 /* Initialize the root; the dot dirents already exist */
1409 de = (struct ext4_dir_entry_2 *) (&root->dotdot); 1447 de = (struct ext4_dir_entry_2 *) (&root->dotdot);
1410 de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2)); 1448 de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
1449 blocksize);
1411 memset (&root->info, 0, sizeof(root->info)); 1450 memset (&root->info, 0, sizeof(root->info));
1412 root->info.info_length = sizeof(root->info); 1451 root->info.info_length = sizeof(root->info);
1413 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 1452 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -1488,7 +1527,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1488 return retval; 1527 return retval;
1489 de = (struct ext4_dir_entry_2 *) bh->b_data; 1528 de = (struct ext4_dir_entry_2 *) bh->b_data;
1490 de->inode = 0; 1529 de->inode = 0;
1491 de->rec_len = ext4_rec_len_to_disk(blocksize); 1530 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1492 return add_dirent_to_buf(handle, dentry, inode, de, bh); 1531 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1493} 1532}
1494 1533
@@ -1551,7 +1590,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1551 goto cleanup; 1590 goto cleanup;
1552 node2 = (struct dx_node *)(bh2->b_data); 1591 node2 = (struct dx_node *)(bh2->b_data);
1553 entries2 = node2->entries; 1592 entries2 = node2->entries;
1554 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize); 1593 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
1594 sb->s_blocksize);
1555 node2->fake.inode = 0; 1595 node2->fake.inode = 0;
1556 BUFFER_TRACE(frame->bh, "get_write_access"); 1596 BUFFER_TRACE(frame->bh, "get_write_access");
1557 err = ext4_journal_get_write_access(handle, frame->bh); 1597 err = ext4_journal_get_write_access(handle, frame->bh);
@@ -1639,6 +1679,7 @@ static int ext4_delete_entry(handle_t *handle,
1639 struct buffer_head *bh) 1679 struct buffer_head *bh)
1640{ 1680{
1641 struct ext4_dir_entry_2 *de, *pde; 1681 struct ext4_dir_entry_2 *de, *pde;
1682 unsigned int blocksize = dir->i_sb->s_blocksize;
1642 int i; 1683 int i;
1643 1684
1644 i = 0; 1685 i = 0;
@@ -1652,8 +1693,11 @@ static int ext4_delete_entry(handle_t *handle,
1652 ext4_journal_get_write_access(handle, bh); 1693 ext4_journal_get_write_access(handle, bh);
1653 if (pde) 1694 if (pde)
1654 pde->rec_len = ext4_rec_len_to_disk( 1695 pde->rec_len = ext4_rec_len_to_disk(
1655 ext4_rec_len_from_disk(pde->rec_len) + 1696 ext4_rec_len_from_disk(pde->rec_len,
1656 ext4_rec_len_from_disk(de->rec_len)); 1697 blocksize) +
1698 ext4_rec_len_from_disk(de->rec_len,
1699 blocksize),
1700 blocksize);
1657 else 1701 else
1658 de->inode = 0; 1702 de->inode = 0;
1659 dir->i_version++; 1703 dir->i_version++;
@@ -1661,9 +1705,9 @@ static int ext4_delete_entry(handle_t *handle,
1661 ext4_handle_dirty_metadata(handle, dir, bh); 1705 ext4_handle_dirty_metadata(handle, dir, bh);
1662 return 0; 1706 return 0;
1663 } 1707 }
1664 i += ext4_rec_len_from_disk(de->rec_len); 1708 i += ext4_rec_len_from_disk(de->rec_len, blocksize);
1665 pde = de; 1709 pde = de;
1666 de = ext4_next_entry(de); 1710 de = ext4_next_entry(de, blocksize);
1667 } 1711 }
1668 return -ENOENT; 1712 return -ENOENT;
1669} 1713}
@@ -1793,6 +1837,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1793 struct inode *inode; 1837 struct inode *inode;
1794 struct buffer_head *dir_block; 1838 struct buffer_head *dir_block;
1795 struct ext4_dir_entry_2 *de; 1839 struct ext4_dir_entry_2 *de;
1840 unsigned int blocksize = dir->i_sb->s_blocksize;
1796 int err, retries = 0; 1841 int err, retries = 0;
1797 1842
1798 if (EXT4_DIR_LINK_MAX(dir)) 1843 if (EXT4_DIR_LINK_MAX(dir))
@@ -1824,13 +1869,14 @@ retry:
1824 de = (struct ext4_dir_entry_2 *) dir_block->b_data; 1869 de = (struct ext4_dir_entry_2 *) dir_block->b_data;
1825 de->inode = cpu_to_le32(inode->i_ino); 1870 de->inode = cpu_to_le32(inode->i_ino);
1826 de->name_len = 1; 1871 de->name_len = 1;
1827 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len)); 1872 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
1873 blocksize);
1828 strcpy(de->name, "."); 1874 strcpy(de->name, ".");
1829 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1875 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1830 de = ext4_next_entry(de); 1876 de = ext4_next_entry(de, blocksize);
1831 de->inode = cpu_to_le32(dir->i_ino); 1877 de->inode = cpu_to_le32(dir->i_ino);
1832 de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize - 1878 de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
1833 EXT4_DIR_REC_LEN(1)); 1879 blocksize);
1834 de->name_len = 2; 1880 de->name_len = 2;
1835 strcpy(de->name, ".."); 1881 strcpy(de->name, "..");
1836 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1882 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
@@ -1885,7 +1931,7 @@ static int empty_dir(struct inode *inode)
1885 return 1; 1931 return 1;
1886 } 1932 }
1887 de = (struct ext4_dir_entry_2 *) bh->b_data; 1933 de = (struct ext4_dir_entry_2 *) bh->b_data;
1888 de1 = ext4_next_entry(de); 1934 de1 = ext4_next_entry(de, sb->s_blocksize);
1889 if (le32_to_cpu(de->inode) != inode->i_ino || 1935 if (le32_to_cpu(de->inode) != inode->i_ino ||
1890 !le32_to_cpu(de1->inode) || 1936 !le32_to_cpu(de1->inode) ||
1891 strcmp(".", de->name) || 1937 strcmp(".", de->name) ||
@@ -1896,9 +1942,9 @@ static int empty_dir(struct inode *inode)
1896 brelse(bh); 1942 brelse(bh);
1897 return 1; 1943 return 1;
1898 } 1944 }
1899 offset = ext4_rec_len_from_disk(de->rec_len) + 1945 offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
1900 ext4_rec_len_from_disk(de1->rec_len); 1946 ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
1901 de = ext4_next_entry(de1); 1947 de = ext4_next_entry(de1, sb->s_blocksize);
1902 while (offset < inode->i_size) { 1948 while (offset < inode->i_size) {
1903 if (!bh || 1949 if (!bh ||
1904 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { 1950 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
@@ -1927,8 +1973,8 @@ static int empty_dir(struct inode *inode)
1927 brelse(bh); 1973 brelse(bh);
1928 return 0; 1974 return 0;
1929 } 1975 }
1930 offset += ext4_rec_len_from_disk(de->rec_len); 1976 offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
1931 de = ext4_next_entry(de); 1977 de = ext4_next_entry(de, sb->s_blocksize);
1932 } 1978 }
1933 brelse(bh); 1979 brelse(bh);
1934 return 1; 1980 return 1;
@@ -2297,8 +2343,8 @@ retry:
2297 return err; 2343 return err;
2298} 2344}
2299 2345
2300#define PARENT_INO(buffer) \ 2346#define PARENT_INO(buffer, size) \
2301 (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode) 2347 (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode)
2302 2348
2303/* 2349/*
2304 * Anybody can rename anything with this: the permission checks are left to the 2350 * Anybody can rename anything with this: the permission checks are left to the
@@ -2311,7 +2357,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2311 struct inode *old_inode, *new_inode; 2357 struct inode *old_inode, *new_inode;
2312 struct buffer_head *old_bh, *new_bh, *dir_bh; 2358 struct buffer_head *old_bh, *new_bh, *dir_bh;
2313 struct ext4_dir_entry_2 *old_de, *new_de; 2359 struct ext4_dir_entry_2 *old_de, *new_de;
2314 int retval; 2360 int retval, force_da_alloc = 0;
2315 2361
2316 old_bh = new_bh = dir_bh = NULL; 2362 old_bh = new_bh = dir_bh = NULL;
2317 2363
@@ -2358,7 +2404,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2358 dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); 2404 dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
2359 if (!dir_bh) 2405 if (!dir_bh)
2360 goto end_rename; 2406 goto end_rename;
2361 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) 2407 if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
2408 old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
2362 goto end_rename; 2409 goto end_rename;
2363 retval = -EMLINK; 2410 retval = -EMLINK;
2364 if (!new_inode && new_dir != old_dir && 2411 if (!new_inode && new_dir != old_dir &&
@@ -2430,7 +2477,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2430 if (dir_bh) { 2477 if (dir_bh) {
2431 BUFFER_TRACE(dir_bh, "get_write_access"); 2478 BUFFER_TRACE(dir_bh, "get_write_access");
2432 ext4_journal_get_write_access(handle, dir_bh); 2479 ext4_journal_get_write_access(handle, dir_bh);
2433 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); 2480 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2481 cpu_to_le32(new_dir->i_ino);
2434 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 2482 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2435 ext4_handle_dirty_metadata(handle, old_dir, dir_bh); 2483 ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
2436 ext4_dec_count(handle, old_dir); 2484 ext4_dec_count(handle, old_dir);
@@ -2449,6 +2497,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2449 ext4_mark_inode_dirty(handle, new_inode); 2497 ext4_mark_inode_dirty(handle, new_inode);
2450 if (!new_inode->i_nlink) 2498 if (!new_inode->i_nlink)
2451 ext4_orphan_add(handle, new_inode); 2499 ext4_orphan_add(handle, new_inode);
2500 if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
2501 force_da_alloc = 1;
2452 } 2502 }
2453 retval = 0; 2503 retval = 0;
2454 2504
@@ -2457,6 +2507,8 @@ end_rename:
2457 brelse(old_bh); 2507 brelse(old_bh);
2458 brelse(new_bh); 2508 brelse(new_bh);
2459 ext4_journal_stop(handle); 2509 ext4_journal_stop(handle);
2510 if (retval == 0 && force_da_alloc)
2511 ext4_alloc_da_blocks(old_inode);
2460 return retval; 2512 return retval;
2461} 2513}
2462 2514
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c06886abd65..546c7dd869e 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -938,10 +938,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
938 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { 938 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
939 ext4_group_t flex_group; 939 ext4_group_t flex_group;
940 flex_group = ext4_flex_group(sbi, input->group); 940 flex_group = ext4_flex_group(sbi, input->group);
941 sbi->s_flex_groups[flex_group].free_blocks += 941 atomic_add(input->free_blocks_count,
942 input->free_blocks_count; 942 &sbi->s_flex_groups[flex_group].free_blocks);
943 sbi->s_flex_groups[flex_group].free_inodes += 943 atomic_add(EXT4_INODES_PER_GROUP(sb),
944 EXT4_INODES_PER_GROUP(sb); 944 &sbi->s_flex_groups[flex_group].free_inodes);
945 } 945 }
946 946
947 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); 947 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f7371a6a923..9987bba99db 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -35,6 +35,7 @@
35#include <linux/quotaops.h> 35#include <linux/quotaops.h>
36#include <linux/seq_file.h> 36#include <linux/seq_file.h>
37#include <linux/proc_fs.h> 37#include <linux/proc_fs.h>
38#include <linux/ctype.h>
38#include <linux/marker.h> 39#include <linux/marker.h>
39#include <linux/log2.h> 40#include <linux/log2.h>
40#include <linux/crc16.h> 41#include <linux/crc16.h>
@@ -48,6 +49,7 @@
48#include "group.h" 49#include "group.h"
49 50
50struct proc_dir_entry *ext4_proc_root; 51struct proc_dir_entry *ext4_proc_root;
52static struct kset *ext4_kset;
51 53
52static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 54static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
53 unsigned long journal_devnum); 55 unsigned long journal_devnum);
@@ -577,9 +579,9 @@ static void ext4_put_super(struct super_block *sb)
577 ext4_commit_super(sb, es, 1); 579 ext4_commit_super(sb, es, 1);
578 } 580 }
579 if (sbi->s_proc) { 581 if (sbi->s_proc) {
580 remove_proc_entry("inode_readahead_blks", sbi->s_proc);
581 remove_proc_entry(sb->s_id, ext4_proc_root); 582 remove_proc_entry(sb->s_id, ext4_proc_root);
582 } 583 }
584 kobject_del(&sbi->s_kobj);
583 585
584 for (i = 0; i < sbi->s_gdb_count; i++) 586 for (i = 0; i < sbi->s_gdb_count; i++)
585 brelse(sbi->s_group_desc[i]); 587 brelse(sbi->s_group_desc[i]);
@@ -615,6 +617,17 @@ static void ext4_put_super(struct super_block *sb)
615 ext4_blkdev_remove(sbi); 617 ext4_blkdev_remove(sbi);
616 } 618 }
617 sb->s_fs_info = NULL; 619 sb->s_fs_info = NULL;
620 /*
621 * Now that we are completely done shutting down the
622 * superblock, we need to actually destroy the kobject.
623 */
624 unlock_kernel();
625 unlock_super(sb);
626 kobject_put(&sbi->s_kobj);
627 wait_for_completion(&sbi->s_kobj_unregister);
628 lock_super(sb);
629 lock_kernel();
630 kfree(sbi->s_blockgroup_lock);
618 kfree(sbi); 631 kfree(sbi);
619 return; 632 return;
620} 633}
@@ -803,8 +816,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
803 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) 816 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
804 seq_puts(seq, ",noacl"); 817 seq_puts(seq, ",noacl");
805#endif 818#endif
806 if (!test_opt(sb, RESERVATION))
807 seq_puts(seq, ",noreservation");
808 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { 819 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
809 seq_printf(seq, ",commit=%u", 820 seq_printf(seq, ",commit=%u",
810 (unsigned) (sbi->s_commit_interval / HZ)); 821 (unsigned) (sbi->s_commit_interval / HZ));
@@ -855,6 +866,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
855 if (test_opt(sb, DATA_ERR_ABORT)) 866 if (test_opt(sb, DATA_ERR_ABORT))
856 seq_puts(seq, ",data_err=abort"); 867 seq_puts(seq, ",data_err=abort");
857 868
869 if (test_opt(sb, NO_AUTO_DA_ALLOC))
870 seq_puts(seq, ",noauto_da_alloc");
871
858 ext4_show_quota_options(seq, sb); 872 ext4_show_quota_options(seq, sb);
859 return 0; 873 return 0;
860} 874}
@@ -1004,7 +1018,7 @@ enum {
1004 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, 1018 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
1005 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, 1019 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
1006 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 1020 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
1007 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, 1021 Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh,
1008 Opt_commit, Opt_min_batch_time, Opt_max_batch_time, 1022 Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
1009 Opt_journal_update, Opt_journal_dev, 1023 Opt_journal_update, Opt_journal_dev,
1010 Opt_journal_checksum, Opt_journal_async_commit, 1024 Opt_journal_checksum, Opt_journal_async_commit,
@@ -1012,8 +1026,8 @@ enum {
1012 Opt_data_err_abort, Opt_data_err_ignore, 1026 Opt_data_err_abort, Opt_data_err_ignore,
1013 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1027 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1014 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1028 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
1015 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 1029 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
1016 Opt_grpquota, Opt_i_version, 1030 Opt_usrquota, Opt_grpquota, Opt_i_version,
1017 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1031 Opt_stripe, Opt_delalloc, Opt_nodelalloc,
1018 Opt_inode_readahead_blks, Opt_journal_ioprio 1032 Opt_inode_readahead_blks, Opt_journal_ioprio
1019}; 1033};
@@ -1039,8 +1053,6 @@ static const match_table_t tokens = {
1039 {Opt_nouser_xattr, "nouser_xattr"}, 1053 {Opt_nouser_xattr, "nouser_xattr"},
1040 {Opt_acl, "acl"}, 1054 {Opt_acl, "acl"},
1041 {Opt_noacl, "noacl"}, 1055 {Opt_noacl, "noacl"},
1042 {Opt_reservation, "reservation"},
1043 {Opt_noreservation, "noreservation"},
1044 {Opt_noload, "noload"}, 1056 {Opt_noload, "noload"},
1045 {Opt_nobh, "nobh"}, 1057 {Opt_nobh, "nobh"},
1046 {Opt_bh, "bh"}, 1058 {Opt_bh, "bh"},
@@ -1068,6 +1080,8 @@ static const match_table_t tokens = {
1068 {Opt_quota, "quota"}, 1080 {Opt_quota, "quota"},
1069 {Opt_usrquota, "usrquota"}, 1081 {Opt_usrquota, "usrquota"},
1070 {Opt_barrier, "barrier=%u"}, 1082 {Opt_barrier, "barrier=%u"},
1083 {Opt_barrier, "barrier"},
1084 {Opt_nobarrier, "nobarrier"},
1071 {Opt_i_version, "i_version"}, 1085 {Opt_i_version, "i_version"},
1072 {Opt_stripe, "stripe=%u"}, 1086 {Opt_stripe, "stripe=%u"},
1073 {Opt_resize, "resize"}, 1087 {Opt_resize, "resize"},
@@ -1075,6 +1089,9 @@ static const match_table_t tokens = {
1075 {Opt_nodelalloc, "nodelalloc"}, 1089 {Opt_nodelalloc, "nodelalloc"},
1076 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, 1090 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
1077 {Opt_journal_ioprio, "journal_ioprio=%u"}, 1091 {Opt_journal_ioprio, "journal_ioprio=%u"},
1092 {Opt_auto_da_alloc, "auto_da_alloc=%u"},
1093 {Opt_auto_da_alloc, "auto_da_alloc"},
1094 {Opt_noauto_da_alloc, "noauto_da_alloc"},
1078 {Opt_err, NULL}, 1095 {Opt_err, NULL},
1079}; 1096};
1080 1097
@@ -1207,12 +1224,6 @@ static int parse_options(char *options, struct super_block *sb,
1207 "not supported\n"); 1224 "not supported\n");
1208 break; 1225 break;
1209#endif 1226#endif
1210 case Opt_reservation:
1211 set_opt(sbi->s_mount_opt, RESERVATION);
1212 break;
1213 case Opt_noreservation:
1214 clear_opt(sbi->s_mount_opt, RESERVATION);
1215 break;
1216 case Opt_journal_update: 1227 case Opt_journal_update:
1217 /* @@@ FIXME */ 1228 /* @@@ FIXME */
1218 /* Eventually we will want to be able to create 1229 /* Eventually we will want to be able to create
@@ -1415,9 +1426,14 @@ set_qf_format:
1415 case Opt_abort: 1426 case Opt_abort:
1416 set_opt(sbi->s_mount_opt, ABORT); 1427 set_opt(sbi->s_mount_opt, ABORT);
1417 break; 1428 break;
1429 case Opt_nobarrier:
1430 clear_opt(sbi->s_mount_opt, BARRIER);
1431 break;
1418 case Opt_barrier: 1432 case Opt_barrier:
1419 if (match_int(&args[0], &option)) 1433 if (match_int(&args[0], &option)) {
1420 return 0; 1434 set_opt(sbi->s_mount_opt, BARRIER);
1435 break;
1436 }
1421 if (option) 1437 if (option)
1422 set_opt(sbi->s_mount_opt, BARRIER); 1438 set_opt(sbi->s_mount_opt, BARRIER);
1423 else 1439 else
@@ -1463,6 +1479,11 @@ set_qf_format:
1463 return 0; 1479 return 0;
1464 if (option < 0 || option > (1 << 30)) 1480 if (option < 0 || option > (1 << 30))
1465 return 0; 1481 return 0;
1482 if (option & (option - 1)) {
1483 printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
1484 " must be a power of 2\n");
1485 return 0;
1486 }
1466 sbi->s_inode_readahead_blks = option; 1487 sbi->s_inode_readahead_blks = option;
1467 break; 1488 break;
1468 case Opt_journal_ioprio: 1489 case Opt_journal_ioprio:
@@ -1473,6 +1494,19 @@ set_qf_format:
1473 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 1494 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
1474 option); 1495 option);
1475 break; 1496 break;
1497 case Opt_noauto_da_alloc:
1498 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
1499 break;
1500 case Opt_auto_da_alloc:
1501 if (match_int(&args[0], &option)) {
1502 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1503 break;
1504 }
1505 if (option)
1506 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1507 else
1508 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
1509 break;
1476 default: 1510 default:
1477 printk(KERN_ERR 1511 printk(KERN_ERR
1478 "EXT4-fs: Unrecognized mount option \"%s\" " 1512 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1612,10 +1646,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
1612 gdp = ext4_get_group_desc(sb, i, &bh); 1646 gdp = ext4_get_group_desc(sb, i, &bh);
1613 1647
1614 flex_group = ext4_flex_group(sbi, i); 1648 flex_group = ext4_flex_group(sbi, i);
1615 sbi->s_flex_groups[flex_group].free_inodes += 1649 atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
1616 ext4_free_inodes_count(sb, gdp); 1650 ext4_free_inodes_count(sb, gdp));
1617 sbi->s_flex_groups[flex_group].free_blocks += 1651 atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
1618 ext4_free_blks_count(sb, gdp); 1652 ext4_free_blks_count(sb, gdp));
1653 atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
1654 ext4_used_dirs_count(sb, gdp));
1619 } 1655 }
1620 1656
1621 return 1; 1657 return 1;
@@ -1991,6 +2027,181 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
1991 return 0; 2027 return 0;
1992} 2028}
1993 2029
2030/* sysfs supprt */
2031
2032struct ext4_attr {
2033 struct attribute attr;
2034 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
2035 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
2036 const char *, size_t);
2037 int offset;
2038};
2039
2040static int parse_strtoul(const char *buf,
2041 unsigned long max, unsigned long *value)
2042{
2043 char *endp;
2044
2045 while (*buf && isspace(*buf))
2046 buf++;
2047 *value = simple_strtoul(buf, &endp, 0);
2048 while (*endp && isspace(*endp))
2049 endp++;
2050 if (*endp || *value > max)
2051 return -EINVAL;
2052
2053 return 0;
2054}
2055
2056static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
2057 struct ext4_sb_info *sbi,
2058 char *buf)
2059{
2060 return snprintf(buf, PAGE_SIZE, "%llu\n",
2061 (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2062}
2063
2064static ssize_t session_write_kbytes_show(struct ext4_attr *a,
2065 struct ext4_sb_info *sbi, char *buf)
2066{
2067 struct super_block *sb = sbi->s_buddy_cache->i_sb;
2068
2069 return snprintf(buf, PAGE_SIZE, "%lu\n",
2070 (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2071 sbi->s_sectors_written_start) >> 1);
2072}
2073
2074static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2075 struct ext4_sb_info *sbi, char *buf)
2076{
2077 struct super_block *sb = sbi->s_buddy_cache->i_sb;
2078
2079 return snprintf(buf, PAGE_SIZE, "%llu\n",
2080 sbi->s_kbytes_written +
2081 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2082 EXT4_SB(sb)->s_sectors_written_start) >> 1));
2083}
2084
2085static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2086 struct ext4_sb_info *sbi,
2087 const char *buf, size_t count)
2088{
2089 unsigned long t;
2090
2091 if (parse_strtoul(buf, 0x40000000, &t))
2092 return -EINVAL;
2093
2094 /* inode_readahead_blks must be a power of 2 */
2095 if (t & (t-1))
2096 return -EINVAL;
2097
2098 sbi->s_inode_readahead_blks = t;
2099 return count;
2100}
2101
2102static ssize_t sbi_ui_show(struct ext4_attr *a,
2103 struct ext4_sb_info *sbi, char *buf)
2104{
2105 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
2106
2107 return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
2108}
2109
2110static ssize_t sbi_ui_store(struct ext4_attr *a,
2111 struct ext4_sb_info *sbi,
2112 const char *buf, size_t count)
2113{
2114 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
2115 unsigned long t;
2116
2117 if (parse_strtoul(buf, 0xffffffff, &t))
2118 return -EINVAL;
2119 *ui = t;
2120 return count;
2121}
2122
2123#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
2124static struct ext4_attr ext4_attr_##_name = { \
2125 .attr = {.name = __stringify(_name), .mode = _mode }, \
2126 .show = _show, \
2127 .store = _store, \
2128 .offset = offsetof(struct ext4_sb_info, _elname), \
2129}
2130#define EXT4_ATTR(name, mode, show, store) \
2131static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2132
2133#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
2134#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
2135#define EXT4_RW_ATTR_SBI_UI(name, elname) \
2136 EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
2137#define ATTR_LIST(name) &ext4_attr_##name.attr
2138
2139EXT4_RO_ATTR(delayed_allocation_blocks);
2140EXT4_RO_ATTR(session_write_kbytes);
2141EXT4_RO_ATTR(lifetime_write_kbytes);
2142EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
2143 inode_readahead_blks_store, s_inode_readahead_blks);
2144EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
2145EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
2146EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2147EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2148EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2149EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2150
2151static struct attribute *ext4_attrs[] = {
2152 ATTR_LIST(delayed_allocation_blocks),
2153 ATTR_LIST(session_write_kbytes),
2154 ATTR_LIST(lifetime_write_kbytes),
2155 ATTR_LIST(inode_readahead_blks),
2156 ATTR_LIST(mb_stats),
2157 ATTR_LIST(mb_max_to_scan),
2158 ATTR_LIST(mb_min_to_scan),
2159 ATTR_LIST(mb_order2_req),
2160 ATTR_LIST(mb_stream_req),
2161 ATTR_LIST(mb_group_prealloc),
2162 NULL,
2163};
2164
2165static ssize_t ext4_attr_show(struct kobject *kobj,
2166 struct attribute *attr, char *buf)
2167{
2168 struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2169 s_kobj);
2170 struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
2171
2172 return a->show ? a->show(a, sbi, buf) : 0;
2173}
2174
2175static ssize_t ext4_attr_store(struct kobject *kobj,
2176 struct attribute *attr,
2177 const char *buf, size_t len)
2178{
2179 struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2180 s_kobj);
2181 struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
2182
2183 return a->store ? a->store(a, sbi, buf, len) : 0;
2184}
2185
2186static void ext4_sb_release(struct kobject *kobj)
2187{
2188 struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2189 s_kobj);
2190 complete(&sbi->s_kobj_unregister);
2191}
2192
2193
2194static struct sysfs_ops ext4_attr_ops = {
2195 .show = ext4_attr_show,
2196 .store = ext4_attr_store,
2197};
2198
2199static struct kobj_type ext4_ktype = {
2200 .default_attrs = ext4_attrs,
2201 .sysfs_ops = &ext4_attr_ops,
2202 .release = ext4_sb_release,
2203};
2204
1994static int ext4_fill_super(struct super_block *sb, void *data, int silent) 2205static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1995 __releases(kernel_lock) 2206 __releases(kernel_lock)
1996 __acquires(kernel_lock) 2207 __acquires(kernel_lock)
@@ -2021,12 +2232,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2021 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 2232 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
2022 if (!sbi) 2233 if (!sbi)
2023 return -ENOMEM; 2234 return -ENOMEM;
2235
2236 sbi->s_blockgroup_lock =
2237 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
2238 if (!sbi->s_blockgroup_lock) {
2239 kfree(sbi);
2240 return -ENOMEM;
2241 }
2024 sb->s_fs_info = sbi; 2242 sb->s_fs_info = sbi;
2025 sbi->s_mount_opt = 0; 2243 sbi->s_mount_opt = 0;
2026 sbi->s_resuid = EXT4_DEF_RESUID; 2244 sbi->s_resuid = EXT4_DEF_RESUID;
2027 sbi->s_resgid = EXT4_DEF_RESGID; 2245 sbi->s_resgid = EXT4_DEF_RESGID;
2028 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; 2246 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
2029 sbi->s_sb_block = sb_block; 2247 sbi->s_sb_block = sb_block;
2248 sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part,
2249 sectors[1]);
2030 2250
2031 unlock_kernel(); 2251 unlock_kernel();
2032 2252
@@ -2064,6 +2284,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2064 sb->s_magic = le16_to_cpu(es->s_magic); 2284 sb->s_magic = le16_to_cpu(es->s_magic);
2065 if (sb->s_magic != EXT4_SUPER_MAGIC) 2285 if (sb->s_magic != EXT4_SUPER_MAGIC)
2066 goto cantfind_ext4; 2286 goto cantfind_ext4;
2287 sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
2067 2288
2068 /* Set defaults before we parse the mount options */ 2289 /* Set defaults before we parse the mount options */
2069 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 2290 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -2101,7 +2322,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2101 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 2322 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2102 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 2323 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2103 2324
2104 set_opt(sbi->s_mount_opt, RESERVATION);
2105 set_opt(sbi->s_mount_opt, BARRIER); 2325 set_opt(sbi->s_mount_opt, BARRIER);
2106 2326
2107 /* 2327 /*
@@ -2325,14 +2545,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2325#ifdef CONFIG_PROC_FS 2545#ifdef CONFIG_PROC_FS
2326 if (ext4_proc_root) 2546 if (ext4_proc_root)
2327 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); 2547 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
2328
2329 if (sbi->s_proc)
2330 proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
2331 &ext4_ui_proc_fops,
2332 &sbi->s_inode_readahead_blks);
2333#endif 2548#endif
2334 2549
2335 bgl_lock_init(&sbi->s_blockgroup_lock); 2550 bgl_lock_init(sbi->s_blockgroup_lock);
2336 2551
2337 for (i = 0; i < db_count; i++) { 2552 for (i = 0; i < db_count; i++) {
2338 block = descriptor_loc(sb, logical_sb_block, i); 2553 block = descriptor_loc(sb, logical_sb_block, i);
@@ -2564,6 +2779,16 @@ no_journal:
2564 goto failed_mount4; 2779 goto failed_mount4;
2565 } 2780 }
2566 2781
2782 sbi->s_kobj.kset = ext4_kset;
2783 init_completion(&sbi->s_kobj_unregister);
2784 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
2785 "%s", sb->s_id);
2786 if (err) {
2787 ext4_mb_release(sb);
2788 ext4_ext_release(sb);
2789 goto failed_mount4;
2790 };
2791
2567 /* 2792 /*
2568 * akpm: core read_super() calls in here with the superblock locked. 2793 * akpm: core read_super() calls in here with the superblock locked.
2569 * That deadlocks, because orphan cleanup needs to lock the superblock 2794 * That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2618,7 +2843,6 @@ failed_mount2:
2618 kfree(sbi->s_group_desc); 2843 kfree(sbi->s_group_desc);
2619failed_mount: 2844failed_mount:
2620 if (sbi->s_proc) { 2845 if (sbi->s_proc) {
2621 remove_proc_entry("inode_readahead_blks", sbi->s_proc);
2622 remove_proc_entry(sb->s_id, ext4_proc_root); 2846 remove_proc_entry(sb->s_id, ext4_proc_root);
2623 } 2847 }
2624#ifdef CONFIG_QUOTA 2848#ifdef CONFIG_QUOTA
@@ -2913,6 +3137,10 @@ static int ext4_commit_super(struct super_block *sb,
2913 set_buffer_uptodate(sbh); 3137 set_buffer_uptodate(sbh);
2914 } 3138 }
2915 es->s_wtime = cpu_to_le32(get_seconds()); 3139 es->s_wtime = cpu_to_le32(get_seconds());
3140 es->s_kbytes_written =
3141 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
3142 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
3143 EXT4_SB(sb)->s_sectors_written_start) >> 1));
2916 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 3144 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
2917 &EXT4_SB(sb)->s_freeblocks_counter)); 3145 &EXT4_SB(sb)->s_freeblocks_counter));
2918 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( 3146 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
@@ -3647,45 +3875,6 @@ static int ext4_get_sb(struct file_system_type *fs_type,
3647 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); 3875 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
3648} 3876}
3649 3877
3650#ifdef CONFIG_PROC_FS
3651static int ext4_ui_proc_show(struct seq_file *m, void *v)
3652{
3653 unsigned int *p = m->private;
3654
3655 seq_printf(m, "%u\n", *p);
3656 return 0;
3657}
3658
3659static int ext4_ui_proc_open(struct inode *inode, struct file *file)
3660{
3661 return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
3662}
3663
3664static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
3665 size_t cnt, loff_t *ppos)
3666{
3667 unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
3668 char str[32];
3669
3670 if (cnt >= sizeof(str))
3671 return -EINVAL;
3672 if (copy_from_user(str, buf, cnt))
3673 return -EFAULT;
3674
3675 *p = simple_strtoul(str, NULL, 0);
3676 return cnt;
3677}
3678
3679const struct file_operations ext4_ui_proc_fops = {
3680 .owner = THIS_MODULE,
3681 .open = ext4_ui_proc_open,
3682 .read = seq_read,
3683 .llseek = seq_lseek,
3684 .release = single_release,
3685 .write = ext4_ui_proc_write,
3686};
3687#endif
3688
3689static struct file_system_type ext4_fs_type = { 3878static struct file_system_type ext4_fs_type = {
3690 .owner = THIS_MODULE, 3879 .owner = THIS_MODULE,
3691 .name = "ext4", 3880 .name = "ext4",
@@ -3719,6 +3908,9 @@ static int __init init_ext4_fs(void)
3719{ 3908{
3720 int err; 3909 int err;
3721 3910
3911 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
3912 if (!ext4_kset)
3913 return -ENOMEM;
3722 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 3914 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
3723 err = init_ext4_mballoc(); 3915 err = init_ext4_mballoc();
3724 if (err) 3916 if (err)
@@ -3760,6 +3952,7 @@ static void __exit exit_ext4_fs(void)
3760 exit_ext4_xattr(); 3952 exit_ext4_xattr();
3761 exit_ext4_mballoc(); 3953 exit_ext4_mballoc();
3762 remove_proc_entry("fs/ext4", NULL); 3954 remove_proc_entry("fs/ext4", NULL);
3955 kset_unregister(ext4_kset);
3763} 3956}
3764 3957
3765MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 3958MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 821d10f719b..4e340fedf76 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1234,8 +1234,9 @@ static void fuse_vma_close(struct vm_area_struct *vma)
1234 * - sync(2) 1234 * - sync(2)
1235 * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER 1235 * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
1236 */ 1236 */
1237static int fuse_page_mkwrite(struct vm_area_struct *vma, struct page *page) 1237static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1238{ 1238{
1239 struct page *page = vmf->page;
1239 /* 1240 /*
1240 * Don't use page->mapping as it may become NULL from a 1241 * Don't use page->mapping as it may become NULL from a
1241 * concurrent truncate. 1242 * concurrent truncate.
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 3b9e8de3500..70b9b854894 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -337,8 +337,9 @@ static int gfs2_allocate_page_backing(struct page *page)
337 * blocks allocated on disk to back that page. 337 * blocks allocated on disk to back that page.
338 */ 338 */
339 339
340static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) 340static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
341{ 341{
342 struct page *page = vmf->page;
342 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 343 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
343 struct gfs2_inode *ip = GFS2_I(inode); 344 struct gfs2_inode *ip = GFS2_I(inode);
344 struct gfs2_sbd *sdp = GFS2_SB(inode); 345 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -412,6 +413,8 @@ out_unlock:
412 gfs2_glock_dq(&gh); 413 gfs2_glock_dq(&gh);
413out: 414out:
414 gfs2_holder_uninit(&gh); 415 gfs2_holder_uninit(&gh);
416 if (ret)
417 ret = VM_FAULT_SIGBUS;
415 return ret; 418 return ret;
416} 419}
417 420
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9b800d97a68..23a3c76711e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -943,14 +943,13 @@ static struct vfsmount *hugetlbfs_vfsmount;
943 943
944static int can_do_hugetlb_shm(void) 944static int can_do_hugetlb_shm(void)
945{ 945{
946 return likely(capable(CAP_IPC_LOCK) || 946 return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
947 in_group_p(sysctl_hugetlb_shm_group) ||
948 can_do_mlock());
949} 947}
950 948
951struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) 949struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
952{ 950{
953 int error = -ENOMEM; 951 int error = -ENOMEM;
952 int unlock_shm = 0;
954 struct file *file; 953 struct file *file;
955 struct inode *inode; 954 struct inode *inode;
956 struct dentry *dentry, *root; 955 struct dentry *dentry, *root;
@@ -960,11 +959,14 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
960 if (!hugetlbfs_vfsmount) 959 if (!hugetlbfs_vfsmount)
961 return ERR_PTR(-ENOENT); 960 return ERR_PTR(-ENOENT);
962 961
963 if (!can_do_hugetlb_shm()) 962 if (!can_do_hugetlb_shm()) {
964 return ERR_PTR(-EPERM); 963 if (user_shm_lock(size, user)) {
965 964 unlock_shm = 1;
966 if (!user_shm_lock(size, user)) 965 WARN_ONCE(1,
967 return ERR_PTR(-ENOMEM); 966 "Using mlock ulimits for SHM_HUGETLB deprecated\n");
967 } else
968 return ERR_PTR(-EPERM);
969 }
968 970
969 root = hugetlbfs_vfsmount->mnt_root; 971 root = hugetlbfs_vfsmount->mnt_root;
970 quick_string.name = name; 972 quick_string.name = name;
@@ -1004,7 +1006,8 @@ out_inode:
1004out_dentry: 1006out_dentry:
1005 dput(dentry); 1007 dput(dentry);
1006out_shm_unlock: 1008out_shm_unlock:
1007 user_shm_unlock(size, user); 1009 if (unlock_shm)
1010 user_shm_unlock(size, user);
1008 return ERR_PTR(error); 1011 return ERR_PTR(error);
1009} 1012}
1010 1013
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 62804e57a44..4ea72377c7a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -367,6 +367,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
367 int tag_bytes = journal_tag_bytes(journal); 367 int tag_bytes = journal_tag_bytes(journal);
368 struct buffer_head *cbh = NULL; /* For transactional checksums */ 368 struct buffer_head *cbh = NULL; /* For transactional checksums */
369 __u32 crc32_sum = ~0; 369 __u32 crc32_sum = ~0;
370 int write_op = WRITE;
370 371
371 /* 372 /*
372 * First job: lock down the current transaction and wait for 373 * First job: lock down the current transaction and wait for
@@ -401,6 +402,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
401 spin_lock(&journal->j_state_lock); 402 spin_lock(&journal->j_state_lock);
402 commit_transaction->t_state = T_LOCKED; 403 commit_transaction->t_state = T_LOCKED;
403 404
405 if (commit_transaction->t_synchronous_commit)
406 write_op = WRITE_SYNC;
404 stats.u.run.rs_wait = commit_transaction->t_max_wait; 407 stats.u.run.rs_wait = commit_transaction->t_max_wait;
405 stats.u.run.rs_locked = jiffies; 408 stats.u.run.rs_locked = jiffies;
406 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, 409 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
@@ -680,7 +683,7 @@ start_journal_io:
680 clear_buffer_dirty(bh); 683 clear_buffer_dirty(bh);
681 set_buffer_uptodate(bh); 684 set_buffer_uptodate(bh);
682 bh->b_end_io = journal_end_buffer_io_sync; 685 bh->b_end_io = journal_end_buffer_io_sync;
683 submit_bh(WRITE, bh); 686 submit_bh(write_op, bh);
684 } 687 }
685 cond_resched(); 688 cond_resched();
686 stats.u.run.rs_blocks_logged += bufs; 689 stats.u.run.rs_blocks_logged += bufs;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 257ff262576..bbe6d592d8b 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -55,6 +55,25 @@
55 * need do nothing. 55 * need do nothing.
56 * RevokeValid set, Revoked set: 56 * RevokeValid set, Revoked set:
57 * buffer has been revoked. 57 * buffer has been revoked.
58 *
59 * Locking rules:
60 * We keep two hash tables of revoke records. One hashtable belongs to the
61 * running transaction (is pointed to by journal->j_revoke), the other one
62 * belongs to the committing transaction. Accesses to the second hash table
63 * happen only from the kjournald and no other thread touches this table. Also
64 * journal_switch_revoke_table() which switches which hashtable belongs to the
65 * running and which to the committing transaction is called only from
66 * kjournald. Therefore we need no locks when accessing the hashtable belonging
67 * to the committing transaction.
68 *
69 * All users operating on the hash table belonging to the running transaction
70 * have a handle to the transaction. Therefore they are safe from kjournald
71 * switching hash tables under them. For operations on the lists of entries in
72 * the hash table j_revoke_lock is used.
73 *
74 * Finally, also replay code uses the hash tables but at this moment noone else
75 * can touch them (filesystem isn't mounted yet) and hence no locking is
76 * needed.
58 */ 77 */
59 78
60#ifndef __KERNEL__ 79#ifndef __KERNEL__
@@ -401,8 +420,6 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
401 * the second time we would still have a pending revoke to cancel. So, 420 * the second time we would still have a pending revoke to cancel. So,
402 * do not trust the Revoked bit on buffers unless RevokeValid is also 421 * do not trust the Revoked bit on buffers unless RevokeValid is also
403 * set. 422 * set.
404 *
405 * The caller must have the journal locked.
406 */ 423 */
407int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) 424int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
408{ 425{
@@ -480,10 +497,7 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
480/* 497/*
481 * Write revoke records to the journal for all entries in the current 498 * Write revoke records to the journal for all entries in the current
482 * revoke hash, deleting the entries as we go. 499 * revoke hash, deleting the entries as we go.
483 *
484 * Called with the journal lock held.
485 */ 500 */
486
487void jbd2_journal_write_revoke_records(journal_t *journal, 501void jbd2_journal_write_revoke_records(journal_t *journal,
488 transaction_t *transaction) 502 transaction_t *transaction)
489{ 503{
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 28ce21d8598..996ffda06bf 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1315,6 +1315,8 @@ int jbd2_journal_stop(handle_t *handle)
1315 } 1315 }
1316 } 1316 }
1317 1317
1318 if (handle->h_sync)
1319 transaction->t_synchronous_commit = 1;
1318 current->journal_info = NULL; 1320 current->journal_info = NULL;
1319 spin_lock(&journal->j_state_lock); 1321 spin_lock(&journal->j_state_lock);
1320 spin_lock(&transaction->t_handle_lock); 1322 spin_lock(&transaction->t_handle_lock);
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index aedc47a264c..1f3b0fc0d35 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -139,55 +139,6 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
139 return 0; 139 return 0;
140} 140}
141 141
142#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
143static const struct in6_addr *nlmclnt_map_v4addr(const struct sockaddr *sap,
144 struct in6_addr *addr_mapped)
145{
146 const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;
147
148 switch (sap->sa_family) {
149 case AF_INET6:
150 return &((const struct sockaddr_in6 *)sap)->sin6_addr;
151 case AF_INET:
152 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, addr_mapped);
153 return addr_mapped;
154 }
155
156 return NULL;
157}
158
159/*
160 * If lockd is using a PF_INET6 listener, all incoming requests appear
161 * to come from AF_INET6 remotes. The address of AF_INET remotes are
162 * mapped to AF_INET6 automatically by the network layer. In case the
163 * user passed an AF_INET server address at mount time, ensure both
164 * addresses are AF_INET6 before comparing them.
165 */
166static int nlmclnt_cmp_addr(const struct nlm_host *host,
167 const struct sockaddr *sap)
168{
169 const struct in6_addr *addr1;
170 const struct in6_addr *addr2;
171 struct in6_addr addr1_mapped;
172 struct in6_addr addr2_mapped;
173
174 addr1 = nlmclnt_map_v4addr(nlm_addr(host), &addr1_mapped);
175 if (likely(addr1 != NULL)) {
176 addr2 = nlmclnt_map_v4addr(sap, &addr2_mapped);
177 if (likely(addr2 != NULL))
178 return ipv6_addr_equal(addr1, addr2);
179 }
180
181 return 0;
182}
183#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
184static int nlmclnt_cmp_addr(const struct nlm_host *host,
185 const struct sockaddr *sap)
186{
187 return nlm_cmp_addr(nlm_addr(host), sap);
188}
189#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
190
191/* 142/*
192 * The server lockd has called us back to tell us the lock was granted 143 * The server lockd has called us back to tell us the lock was granted
193 */ 144 */
@@ -215,7 +166,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
215 */ 166 */
216 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) 167 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
217 continue; 168 continue;
218 if (!nlmclnt_cmp_addr(block->b_host, addr)) 169 if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
219 continue; 170 continue;
220 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) 171 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
221 continue; 172 continue;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 5e2c4d5ac82..6d5d4a4169e 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -16,6 +16,8 @@
16#include <linux/sunrpc/svc.h> 16#include <linux/sunrpc/svc.h>
17#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
18 18
19#include <asm/unaligned.h>
20
19#define NLMDBG_FACILITY NLMDBG_MONITOR 21#define NLMDBG_FACILITY NLMDBG_MONITOR
20#define NSM_PROGRAM 100024 22#define NSM_PROGRAM 100024
21#define NSM_VERSION 1 23#define NSM_VERSION 1
@@ -274,10 +276,12 @@ static void nsm_init_private(struct nsm_handle *nsm)
274{ 276{
275 u64 *p = (u64 *)&nsm->sm_priv.data; 277 u64 *p = (u64 *)&nsm->sm_priv.data;
276 struct timespec ts; 278 struct timespec ts;
279 s64 ns;
277 280
278 ktime_get_ts(&ts); 281 ktime_get_ts(&ts);
279 *p++ = timespec_to_ns(&ts); 282 ns = timespec_to_ns(&ts);
280 *p = (unsigned long)nsm; 283 put_unaligned(ns, p);
284 put_unaligned((unsigned long)nsm, p + 1);
281} 285}
282 286
283static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, 287static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 64f1c31b585..abf83881f68 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -53,17 +53,6 @@ static struct svc_rqst *nlmsvc_rqst;
53unsigned long nlmsvc_timeout; 53unsigned long nlmsvc_timeout;
54 54
55/* 55/*
56 * If the kernel has IPv6 support available, always listen for
57 * both AF_INET and AF_INET6 requests.
58 */
59#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \
60 defined(CONFIG_SUNRPC_REGISTER_V4)
61static const sa_family_t nlmsvc_family = AF_INET6;
62#else /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
63static const sa_family_t nlmsvc_family = AF_INET;
64#endif /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
65
66/*
67 * These can be set at insmod time (useful for NFS as root filesystem), 56 * These can be set at insmod time (useful for NFS as root filesystem),
68 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 57 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003
69 */ 58 */
@@ -204,19 +193,30 @@ lockd(void *vrqstp)
204 return 0; 193 return 0;
205} 194}
206 195
207static int create_lockd_listener(struct svc_serv *serv, char *name, 196static int create_lockd_listener(struct svc_serv *serv, const char *name,
208 unsigned short port) 197 const int family, const unsigned short port)
209{ 198{
210 struct svc_xprt *xprt; 199 struct svc_xprt *xprt;
211 200
212 xprt = svc_find_xprt(serv, name, 0, 0); 201 xprt = svc_find_xprt(serv, name, family, 0);
213 if (xprt == NULL) 202 if (xprt == NULL)
214 return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS); 203 return svc_create_xprt(serv, name, family, port,
215 204 SVC_SOCK_DEFAULTS);
216 svc_xprt_put(xprt); 205 svc_xprt_put(xprt);
217 return 0; 206 return 0;
218} 207}
219 208
209static int create_lockd_family(struct svc_serv *serv, const int family)
210{
211 int err;
212
213 err = create_lockd_listener(serv, "udp", family, nlm_udpport);
214 if (err < 0)
215 return err;
216
217 return create_lockd_listener(serv, "tcp", family, nlm_tcpport);
218}
219
220/* 220/*
221 * Ensure there are active UDP and TCP listeners for lockd. 221 * Ensure there are active UDP and TCP listeners for lockd.
222 * 222 *
@@ -232,13 +232,15 @@ static int make_socks(struct svc_serv *serv)
232 static int warned; 232 static int warned;
233 int err; 233 int err;
234 234
235 err = create_lockd_listener(serv, "udp", nlm_udpport); 235 err = create_lockd_family(serv, PF_INET);
236 if (err < 0) 236 if (err < 0)
237 goto out_err; 237 goto out_err;
238 238
239 err = create_lockd_listener(serv, "tcp", nlm_tcpport); 239#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
240 if (err < 0) 240 err = create_lockd_family(serv, PF_INET6);
241 if (err < 0 && err != -EAFNOSUPPORT)
241 goto out_err; 242 goto out_err;
243#endif /* CONFIG_IPV6 || CONFIG_IPV6_MODULE */
242 244
243 warned = 0; 245 warned = 0;
244 return 0; 246 return 0;
@@ -274,7 +276,7 @@ int lockd_up(void)
274 "lockd_up: no pid, %d users??\n", nlmsvc_users); 276 "lockd_up: no pid, %d users??\n", nlmsvc_users);
275 277
276 error = -ENOMEM; 278 error = -ENOMEM;
277 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL); 279 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
278 if (!serv) { 280 if (!serv) {
279 printk(KERN_WARNING "lockd_up: create service failed\n"); 281 printk(KERN_WARNING "lockd_up: create service failed\n");
280 goto out; 282 goto out;
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 3e634f2a108..a886e692ddd 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -38,19 +38,10 @@ static struct svc_program nfs4_callback_program;
38 38
39unsigned int nfs_callback_set_tcpport; 39unsigned int nfs_callback_set_tcpport;
40unsigned short nfs_callback_tcpport; 40unsigned short nfs_callback_tcpport;
41unsigned short nfs_callback_tcpport6;
41static const int nfs_set_port_min = 0; 42static const int nfs_set_port_min = 0;
42static const int nfs_set_port_max = 65535; 43static const int nfs_set_port_max = 65535;
43 44
44/*
45 * If the kernel has IPv6 support available, always listen for
46 * both AF_INET and AF_INET6 requests.
47 */
48#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
49static const sa_family_t nfs_callback_family = AF_INET6;
50#else
51static const sa_family_t nfs_callback_family = AF_INET;
52#endif
53
54static int param_set_port(const char *val, struct kernel_param *kp) 45static int param_set_port(const char *val, struct kernel_param *kp)
55{ 46{
56 char *endp; 47 char *endp;
@@ -116,19 +107,29 @@ int nfs_callback_up(void)
116 mutex_lock(&nfs_callback_mutex); 107 mutex_lock(&nfs_callback_mutex);
117 if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) 108 if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
118 goto out; 109 goto out;
119 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, 110 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
120 nfs_callback_family, NULL);
121 ret = -ENOMEM; 111 ret = -ENOMEM;
122 if (!serv) 112 if (!serv)
123 goto out_err; 113 goto out_err;
124 114
125 ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport, 115 ret = svc_create_xprt(serv, "tcp", PF_INET,
126 SVC_SOCK_ANONYMOUS); 116 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
127 if (ret <= 0) 117 if (ret <= 0)
128 goto out_err; 118 goto out_err;
129 nfs_callback_tcpport = ret; 119 nfs_callback_tcpport = ret;
130 dprintk("NFS: Callback listener port = %u (af %u)\n", 120 dprintk("NFS: Callback listener port = %u (af %u)\n",
131 nfs_callback_tcpport, nfs_callback_family); 121 nfs_callback_tcpport, PF_INET);
122
123#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
124 ret = svc_create_xprt(serv, "tcp", PF_INET6,
125 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
126 if (ret > 0) {
127 nfs_callback_tcpport6 = ret;
128 dprintk("NFS: Callback listener port = %u (af %u)\n",
129 nfs_callback_tcpport6, PF_INET6);
130 } else if (ret != -EAFNOSUPPORT)
131 goto out_err;
132#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
132 133
133 nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); 134 nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
134 if (IS_ERR(nfs_callback_info.rqst)) { 135 if (IS_ERR(nfs_callback_info.rqst)) {
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index bb25d2135ff..e110e286a26 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -72,5 +72,6 @@ extern void nfs_callback_down(void);
72 72
73extern unsigned int nfs_callback_set_tcpport; 73extern unsigned int nfs_callback_set_tcpport;
74extern unsigned short nfs_callback_tcpport; 74extern unsigned short nfs_callback_tcpport;
75extern unsigned short nfs_callback_tcpport6;
75 76
76#endif /* __LINUX_FS_NFS_CALLBACK_H */ 77#endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2277421656e..aba38017bde 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -224,38 +224,6 @@ void nfs_put_client(struct nfs_client *clp)
224} 224}
225 225
226#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 226#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
227static const struct in6_addr *nfs_map_ipv4_addr(const struct sockaddr *sa, struct in6_addr *addr_mapped)
228{
229 switch (sa->sa_family) {
230 default:
231 return NULL;
232 case AF_INET6:
233 return &((const struct sockaddr_in6 *)sa)->sin6_addr;
234 break;
235 case AF_INET:
236 ipv6_addr_set_v4mapped(((const struct sockaddr_in *)sa)->sin_addr.s_addr,
237 addr_mapped);
238 return addr_mapped;
239 }
240}
241
242static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
243 const struct sockaddr *sa2)
244{
245 const struct in6_addr *addr1;
246 const struct in6_addr *addr2;
247 struct in6_addr addr1_mapped;
248 struct in6_addr addr2_mapped;
249
250 addr1 = nfs_map_ipv4_addr(sa1, &addr1_mapped);
251 if (likely(addr1 != NULL)) {
252 addr2 = nfs_map_ipv4_addr(sa2, &addr2_mapped);
253 if (likely(addr2 != NULL))
254 return ipv6_addr_equal(addr1, addr2);
255 }
256 return 0;
257}
258
259/* 227/*
260 * Test if two ip6 socket addresses refer to the same socket by 228 * Test if two ip6 socket addresses refer to the same socket by
261 * comparing relevant fields. The padding bytes specifically, are not 229 * comparing relevant fields. The padding bytes specifically, are not
@@ -267,38 +235,21 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
267 * 235 *
268 * The caller should ensure both socket addresses are AF_INET6. 236 * The caller should ensure both socket addresses are AF_INET6.
269 */ 237 */
270static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1, 238static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
271 const struct sockaddr *sa2) 239 const struct sockaddr *sa2)
272{ 240{
273 const struct sockaddr_in6 *saddr1 = (const struct sockaddr_in6 *)sa1; 241 const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
274 const struct sockaddr_in6 *saddr2 = (const struct sockaddr_in6 *)sa2; 242 const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
275 243
276 if (!ipv6_addr_equal(&saddr1->sin6_addr, 244 if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
277 &saddr1->sin6_addr)) 245 sin1->sin6_scope_id != sin2->sin6_scope_id)
278 return 0; 246 return 0;
279 if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
280 saddr1->sin6_scope_id != saddr2->sin6_scope_id)
281 return 0;
282 return saddr1->sin6_port == saddr2->sin6_port;
283}
284#else
285static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
286 const struct sockaddr_in *sa2)
287{
288 return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;
289}
290 247
291static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, 248 return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr);
292 const struct sockaddr *sa2)
293{
294 if (unlikely(sa1->sa_family != AF_INET || sa2->sa_family != AF_INET))
295 return 0;
296 return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
297 (const struct sockaddr_in *)sa2);
298} 249}
299 250#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
300static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1, 251static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
301 const struct sockaddr * sa2) 252 const struct sockaddr *sa2)
302{ 253{
303 return 0; 254 return 0;
304} 255}
@@ -311,20 +262,57 @@ static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1,
311 * 262 *
312 * The caller should ensure both socket addresses are AF_INET. 263 * The caller should ensure both socket addresses are AF_INET.
313 */ 264 */
265static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
266 const struct sockaddr *sa2)
267{
268 const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
269 const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
270
271 return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
272}
273
274static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
275 const struct sockaddr *sa2)
276{
277 const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
278 const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
279
280 return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&
281 (sin1->sin6_port == sin2->sin6_port);
282}
283
314static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1, 284static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
315 const struct sockaddr *sa2) 285 const struct sockaddr *sa2)
316{ 286{
317 const struct sockaddr_in *saddr1 = (const struct sockaddr_in *)sa1; 287 const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
318 const struct sockaddr_in *saddr2 = (const struct sockaddr_in *)sa2; 288 const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
319 289
320 if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr) 290 return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&
291 (sin1->sin_port == sin2->sin_port);
292}
293
294/*
295 * Test if two socket addresses represent the same actual socket,
296 * by comparing (only) relevant fields, excluding the port number.
297 */
298static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
299 const struct sockaddr *sa2)
300{
301 if (sa1->sa_family != sa2->sa_family)
321 return 0; 302 return 0;
322 return saddr1->sin_port == saddr2->sin_port; 303
304 switch (sa1->sa_family) {
305 case AF_INET:
306 return nfs_sockaddr_match_ipaddr4(sa1, sa2);
307 case AF_INET6:
308 return nfs_sockaddr_match_ipaddr6(sa1, sa2);
309 }
310 return 0;
323} 311}
324 312
325/* 313/*
326 * Test if two socket addresses represent the same actual socket, 314 * Test if two socket addresses represent the same actual socket,
327 * by comparing (only) relevant fields. 315 * by comparing (only) relevant fields, including the port number.
328 */ 316 */
329static int nfs_sockaddr_cmp(const struct sockaddr *sa1, 317static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
330 const struct sockaddr *sa2) 318 const struct sockaddr *sa2)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 78bf72fc1db..370b190a09d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1624,8 +1624,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1624 } else if (atomic_read(&new_dentry->d_count) > 1) 1624 } else if (atomic_read(&new_dentry->d_count) > 1)
1625 /* dentry still busy? */ 1625 /* dentry still busy? */
1626 goto out; 1626 goto out;
1627 } else 1627 }
1628 nfs_drop_nlink(new_inode);
1629 1628
1630go_ahead: 1629go_ahead:
1631 /* 1630 /*
@@ -1638,10 +1637,8 @@ go_ahead:
1638 } 1637 }
1639 nfs_inode_return_delegation(old_inode); 1638 nfs_inode_return_delegation(old_inode);
1640 1639
1641 if (new_inode != NULL) { 1640 if (new_inode != NULL)
1642 nfs_inode_return_delegation(new_inode); 1641 nfs_inode_return_delegation(new_inode);
1643 d_delete(new_dentry);
1644 }
1645 1642
1646 error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, 1643 error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
1647 new_dir, &new_dentry->d_name); 1644 new_dir, &new_dentry->d_name);
@@ -1650,6 +1647,8 @@ out:
1650 if (rehash) 1647 if (rehash)
1651 d_rehash(rehash); 1648 d_rehash(rehash);
1652 if (!error) { 1649 if (!error) {
1650 if (new_inode != NULL)
1651 nfs_drop_nlink(new_inode);
1653 d_move(old_dentry, new_dentry); 1652 d_move(old_dentry, new_dentry);
1654 nfs_set_verifier(new_dentry, 1653 nfs_set_verifier(new_dentry,
1655 nfs_save_change_attribute(new_dir)); 1654 nfs_save_change_attribute(new_dir));
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 90f292b520d..0abf3f331f5 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -64,11 +64,7 @@ const struct file_operations nfs_file_operations = {
64 .write = do_sync_write, 64 .write = do_sync_write,
65 .aio_read = nfs_file_read, 65 .aio_read = nfs_file_read,
66 .aio_write = nfs_file_write, 66 .aio_write = nfs_file_write,
67#ifdef CONFIG_MMU
68 .mmap = nfs_file_mmap, 67 .mmap = nfs_file_mmap,
69#else
70 .mmap = generic_file_mmap,
71#endif
72 .open = nfs_file_open, 68 .open = nfs_file_open,
73 .flush = nfs_file_flush, 69 .flush = nfs_file_flush,
74 .release = nfs_file_release, 70 .release = nfs_file_release,
@@ -141,9 +137,6 @@ nfs_file_release(struct inode *inode, struct file *filp)
141 dentry->d_parent->d_name.name, 137 dentry->d_parent->d_name.name,
142 dentry->d_name.name); 138 dentry->d_name.name);
143 139
144 /* Ensure that dirty pages are flushed out with the right creds */
145 if (filp->f_mode & FMODE_WRITE)
146 nfs_wb_all(dentry->d_inode);
147 nfs_inc_stats(inode, NFSIOS_VFSRELEASE); 140 nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
148 return nfs_release(inode, filp); 141 return nfs_release(inode, filp);
149} 142}
@@ -235,7 +228,6 @@ nfs_file_flush(struct file *file, fl_owner_t id)
235 struct nfs_open_context *ctx = nfs_file_open_context(file); 228 struct nfs_open_context *ctx = nfs_file_open_context(file);
236 struct dentry *dentry = file->f_path.dentry; 229 struct dentry *dentry = file->f_path.dentry;
237 struct inode *inode = dentry->d_inode; 230 struct inode *inode = dentry->d_inode;
238 int status;
239 231
240 dprintk("NFS: flush(%s/%s)\n", 232 dprintk("NFS: flush(%s/%s)\n",
241 dentry->d_parent->d_name.name, 233 dentry->d_parent->d_name.name,
@@ -245,11 +237,8 @@ nfs_file_flush(struct file *file, fl_owner_t id)
245 return 0; 237 return 0;
246 nfs_inc_stats(inode, NFSIOS_VFSFLUSH); 238 nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
247 239
248 /* Ensure that data+attribute caches are up to date after close() */ 240 /* Flush writes to the server and return any errors */
249 status = nfs_do_fsync(ctx, inode); 241 return nfs_do_fsync(ctx, inode);
250 if (!status)
251 nfs_revalidate_inode(NFS_SERVER(inode), inode);
252 return status;
253} 242}
254 243
255static ssize_t 244static ssize_t
@@ -304,11 +293,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
304 dprintk("NFS: mmap(%s/%s)\n", 293 dprintk("NFS: mmap(%s/%s)\n",
305 dentry->d_parent->d_name.name, dentry->d_name.name); 294 dentry->d_parent->d_name.name, dentry->d_name.name);
306 295
307 status = nfs_revalidate_mapping(inode, file->f_mapping); 296 /* Note: generic_file_mmap() returns ENOSYS on nommu systems
297 * so we call that before revalidating the mapping
298 */
299 status = generic_file_mmap(file, vma);
308 if (!status) { 300 if (!status) {
309 vma->vm_ops = &nfs_file_vm_ops; 301 vma->vm_ops = &nfs_file_vm_ops;
310 vma->vm_flags |= VM_CAN_NONLINEAR; 302 status = nfs_revalidate_mapping(inode, file->f_mapping);
311 file_accessed(file);
312 } 303 }
313 return status; 304 return status;
314} 305}
@@ -354,6 +345,15 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
354 file->f_path.dentry->d_name.name, 345 file->f_path.dentry->d_name.name,
355 mapping->host->i_ino, len, (long long) pos); 346 mapping->host->i_ino, len, (long long) pos);
356 347
348 /*
349 * Prevent starvation issues if someone is doing a consistency
350 * sync-to-disk
351 */
352 ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
353 nfs_wait_bit_killable, TASK_KILLABLE);
354 if (ret)
355 return ret;
356
357 page = grab_cache_page_write_begin(mapping, index, flags); 357 page = grab_cache_page_write_begin(mapping, index, flags);
358 if (!page) 358 if (!page)
359 return -ENOMEM; 359 return -ENOMEM;
@@ -451,8 +451,9 @@ const struct address_space_operations nfs_file_aops = {
451 .launder_page = nfs_launder_page, 451 .launder_page = nfs_launder_page,
452}; 452};
453 453
454static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) 454static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
455{ 455{
456 struct page *page = vmf->page;
456 struct file *filp = vma->vm_file; 457 struct file *filp = vma->vm_file;
457 struct dentry *dentry = filp->f_path.dentry; 458 struct dentry *dentry = filp->f_path.dentry;
458 unsigned pagelen; 459 unsigned pagelen;
@@ -483,6 +484,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
483 ret = pagelen; 484 ret = pagelen;
484out_unlock: 485out_unlock:
485 unlock_page(page); 486 unlock_page(page);
487 if (ret)
488 ret = VM_FAULT_SIGBUS;
486 return ret; 489 return ret;
487} 490}
488 491
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b7c9b2df1f2..46177cb8706 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -156,7 +156,7 @@ int nfs4_path_walk(struct nfs_server *server,
156 return ret; 156 return ret;
157 } 157 }
158 158
159 if (fattr.type != NFDIR) { 159 if (!S_ISDIR(fattr.mode)) {
160 printk(KERN_ERR "nfs4_get_root:" 160 printk(KERN_ERR "nfs4_get_root:"
161 " getroot encountered non-directory\n"); 161 " getroot encountered non-directory\n");
162 return -ENOTDIR; 162 return -ENOTDIR;
@@ -213,7 +213,7 @@ eat_dot_dir:
213 return ret; 213 return ret;
214 } 214 }
215 215
216 if (fattr.type != NFDIR) { 216 if (!S_ISDIR(fattr.mode)) {
217 printk(KERN_ERR "nfs4_get_root:" 217 printk(KERN_ERR "nfs4_get_root:"
218 " lookupfh encountered non-directory\n"); 218 " lookupfh encountered non-directory\n");
219 return -ENOTDIR; 219 return -ENOTDIR;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0c381686171..a834d1d850b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -66,6 +66,18 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
66} 66}
67 67
68/** 68/**
69 * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks
70 * @word: long word containing the bit lock
71 */
72int nfs_wait_bit_killable(void *word)
73{
74 if (fatal_signal_pending(current))
75 return -ERESTARTSYS;
76 schedule();
77 return 0;
78}
79
80/**
69 * nfs_compat_user_ino64 - returns the user-visible inode number 81 * nfs_compat_user_ino64 - returns the user-visible inode number
70 * @fileid: 64-bit fileid 82 * @fileid: 64-bit fileid
71 * 83 *
@@ -249,13 +261,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
249 struct inode *inode = ERR_PTR(-ENOENT); 261 struct inode *inode = ERR_PTR(-ENOENT);
250 unsigned long hash; 262 unsigned long hash;
251 263
252 if ((fattr->valid & NFS_ATTR_FATTR) == 0) 264 if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0)
253 goto out_no_inode; 265 goto out_no_inode;
254 266 if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
255 if (!fattr->nlink) {
256 printk("NFS: Buggy server - nlink == 0!\n");
257 goto out_no_inode; 267 goto out_no_inode;
258 }
259 268
260 hash = nfs_fattr_to_ino_t(fattr); 269 hash = nfs_fattr_to_ino_t(fattr);
261 270
@@ -291,7 +300,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
291 && fattr->size <= NFS_LIMIT_READDIRPLUS) 300 && fattr->size <= NFS_LIMIT_READDIRPLUS)
292 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 301 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
293 /* Deal with crossing mountpoints */ 302 /* Deal with crossing mountpoints */
294 if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { 303 if ((fattr->valid & NFS_ATTR_FATTR_FSID)
304 && !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
295 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) 305 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
296 inode->i_op = &nfs_referral_inode_operations; 306 inode->i_op = &nfs_referral_inode_operations;
297 else 307 else
@@ -304,28 +314,45 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
304 else 314 else
305 init_special_inode(inode, inode->i_mode, fattr->rdev); 315 init_special_inode(inode, inode->i_mode, fattr->rdev);
306 316
317 memset(&inode->i_atime, 0, sizeof(inode->i_atime));
318 memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
319 memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
320 nfsi->change_attr = 0;
321 inode->i_size = 0;
322 inode->i_nlink = 0;
323 inode->i_uid = -2;
324 inode->i_gid = -2;
325 inode->i_blocks = 0;
326 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
327
307 nfsi->read_cache_jiffies = fattr->time_start; 328 nfsi->read_cache_jiffies = fattr->time_start;
308 nfsi->attr_gencount = fattr->gencount; 329 nfsi->attr_gencount = fattr->gencount;
309 inode->i_atime = fattr->atime; 330 if (fattr->valid & NFS_ATTR_FATTR_ATIME)
310 inode->i_mtime = fattr->mtime; 331 inode->i_atime = fattr->atime;
311 inode->i_ctime = fattr->ctime; 332 if (fattr->valid & NFS_ATTR_FATTR_MTIME)
312 if (fattr->valid & NFS_ATTR_FATTR_V4) 333 inode->i_mtime = fattr->mtime;
334 if (fattr->valid & NFS_ATTR_FATTR_CTIME)
335 inode->i_ctime = fattr->ctime;
336 if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
313 nfsi->change_attr = fattr->change_attr; 337 nfsi->change_attr = fattr->change_attr;
314 inode->i_size = nfs_size_to_loff_t(fattr->size); 338 if (fattr->valid & NFS_ATTR_FATTR_SIZE)
315 inode->i_nlink = fattr->nlink; 339 inode->i_size = nfs_size_to_loff_t(fattr->size);
316 inode->i_uid = fattr->uid; 340 if (fattr->valid & NFS_ATTR_FATTR_NLINK)
317 inode->i_gid = fattr->gid; 341 inode->i_nlink = fattr->nlink;
318 if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { 342 if (fattr->valid & NFS_ATTR_FATTR_OWNER)
343 inode->i_uid = fattr->uid;
344 if (fattr->valid & NFS_ATTR_FATTR_GROUP)
345 inode->i_gid = fattr->gid;
346 if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
347 inode->i_blocks = fattr->du.nfs2.blocks;
348 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
319 /* 349 /*
320 * report the blocks in 512byte units 350 * report the blocks in 512byte units
321 */ 351 */
322 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); 352 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
323 } else {
324 inode->i_blocks = fattr->du.nfs2.blocks;
325 } 353 }
326 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); 354 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
327 nfsi->attrtimeo_timestamp = now; 355 nfsi->attrtimeo_timestamp = now;
328 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
329 nfsi->access_cache = RB_ROOT; 356 nfsi->access_cache = RB_ROOT;
330 357
331 unlock_new_inode(inode); 358 unlock_new_inode(inode);
@@ -514,6 +541,32 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
514 return err; 541 return err;
515} 542}
516 543
544/**
545 * nfs_close_context - Common close_context() routine NFSv2/v3
546 * @ctx: pointer to context
547 * @is_sync: is this a synchronous close
548 *
549 * always ensure that the attributes are up to date if we're mounted
550 * with close-to-open semantics
551 */
552void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
553{
554 struct inode *inode;
555 struct nfs_server *server;
556
557 if (!(ctx->mode & FMODE_WRITE))
558 return;
559 if (!is_sync)
560 return;
561 inode = ctx->path.dentry->d_inode;
562 if (!list_empty(&NFS_I(inode)->open_files))
563 return;
564 server = NFS_SERVER(inode);
565 if (server->flags & NFS_MOUNT_NOCTO)
566 return;
567 nfs_revalidate_inode(server, inode);
568}
569
517static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred) 570static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred)
518{ 571{
519 struct nfs_open_context *ctx; 572 struct nfs_open_context *ctx;
@@ -540,24 +593,15 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
540 return ctx; 593 return ctx;
541} 594}
542 595
543static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait) 596static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
544{ 597{
545 struct inode *inode; 598 struct inode *inode = ctx->path.dentry->d_inode;
546
547 if (ctx == NULL)
548 return;
549 599
550 inode = ctx->path.dentry->d_inode;
551 if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) 600 if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock))
552 return; 601 return;
553 list_del(&ctx->list); 602 list_del(&ctx->list);
554 spin_unlock(&inode->i_lock); 603 spin_unlock(&inode->i_lock);
555 if (ctx->state != NULL) { 604 NFS_PROTO(inode)->close_context(ctx, is_sync);
556 if (wait)
557 nfs4_close_sync(&ctx->path, ctx->state, ctx->mode);
558 else
559 nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
560 }
561 if (ctx->cred != NULL) 605 if (ctx->cred != NULL)
562 put_rpccred(ctx->cred); 606 put_rpccred(ctx->cred);
563 path_put(&ctx->path); 607 path_put(&ctx->path);
@@ -670,9 +714,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
670 if (NFS_STALE(inode)) 714 if (NFS_STALE(inode))
671 goto out; 715 goto out;
672 716
673 if (NFS_STALE(inode))
674 goto out;
675
676 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 717 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
677 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); 718 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
678 if (status != 0) { 719 if (status != 0) {
@@ -815,25 +856,31 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
815{ 856{
816 struct nfs_inode *nfsi = NFS_I(inode); 857 struct nfs_inode *nfsi = NFS_I(inode);
817 858
818 if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 && 859 if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
819 nfsi->change_attr == fattr->pre_change_attr) { 860 && (fattr->valid & NFS_ATTR_FATTR_CHANGE)
861 && nfsi->change_attr == fattr->pre_change_attr) {
820 nfsi->change_attr = fattr->change_attr; 862 nfsi->change_attr = fattr->change_attr;
821 if (S_ISDIR(inode->i_mode)) 863 if (S_ISDIR(inode->i_mode))
822 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 864 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
823 } 865 }
824 /* If we have atomic WCC data, we may update some attributes */ 866 /* If we have atomic WCC data, we may update some attributes */
825 if ((fattr->valid & NFS_ATTR_WCC) != 0) { 867 if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
826 if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) 868 && (fattr->valid & NFS_ATTR_FATTR_CTIME)
869 && timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
827 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 870 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
828 if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { 871
872 if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
873 && (fattr->valid & NFS_ATTR_FATTR_MTIME)
874 && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
829 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 875 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
830 if (S_ISDIR(inode->i_mode)) 876 if (S_ISDIR(inode->i_mode))
831 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 877 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
832 }
833 if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) &&
834 nfsi->npages == 0)
835 i_size_write(inode, nfs_size_to_loff_t(fattr->size));
836 } 878 }
879 if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
880 && (fattr->valid & NFS_ATTR_FATTR_SIZE)
881 && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
882 && nfsi->npages == 0)
883 i_size_write(inode, nfs_size_to_loff_t(fattr->size));
837} 884}
838 885
839/** 886/**
@@ -853,35 +900,39 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
853 900
854 901
855 /* Has the inode gone and changed behind our back? */ 902 /* Has the inode gone and changed behind our back? */
856 if (nfsi->fileid != fattr->fileid 903 if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
857 || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { 904 return -EIO;
905 if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
858 return -EIO; 906 return -EIO;
859 }
860 907
861 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && 908 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
862 nfsi->change_attr != fattr->change_attr) 909 nfsi->change_attr != fattr->change_attr)
863 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 910 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
864 911
865 /* Verify a few of the more important attributes */ 912 /* Verify a few of the more important attributes */
866 if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) 913 if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
867 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 914 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
868 915
869 cur_size = i_size_read(inode); 916 if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
870 new_isize = nfs_size_to_loff_t(fattr->size); 917 cur_size = i_size_read(inode);
871 if (cur_size != new_isize && nfsi->npages == 0) 918 new_isize = nfs_size_to_loff_t(fattr->size);
872 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 919 if (cur_size != new_isize && nfsi->npages == 0)
920 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
921 }
873 922
874 /* Have any file permissions changed? */ 923 /* Have any file permissions changed? */
875 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) 924 if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
876 || inode->i_uid != fattr->uid 925 invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
877 || inode->i_gid != fattr->gid) 926 if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid)
927 invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
928 if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid)
878 invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; 929 invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
879 930
880 /* Has the link count changed? */ 931 /* Has the link count changed? */
881 if (inode->i_nlink != fattr->nlink) 932 if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
882 invalid |= NFS_INO_INVALID_ATTR; 933 invalid |= NFS_INO_INVALID_ATTR;
883 934
884 if (!timespec_equal(&inode->i_atime, &fattr->atime)) 935 if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime))
885 invalid |= NFS_INO_INVALID_ATIME; 936 invalid |= NFS_INO_INVALID_ATIME;
886 937
887 if (invalid != 0) 938 if (invalid != 0)
@@ -893,11 +944,15 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
893 944
894static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) 945static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
895{ 946{
947 if (!(fattr->valid & NFS_ATTR_FATTR_CTIME))
948 return 0;
896 return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; 949 return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
897} 950}
898 951
899static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) 952static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
900{ 953{
954 if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))
955 return 0;
901 return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); 956 return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
902} 957}
903 958
@@ -1033,20 +1088,31 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
1033 /* Don't do a WCC update if these attributes are already stale */ 1088 /* Don't do a WCC update if these attributes are already stale */
1034 if ((fattr->valid & NFS_ATTR_FATTR) == 0 || 1089 if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
1035 !nfs_inode_attrs_need_update(inode, fattr)) { 1090 !nfs_inode_attrs_need_update(inode, fattr)) {
1036 fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC); 1091 fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE
1092 | NFS_ATTR_FATTR_PRESIZE
1093 | NFS_ATTR_FATTR_PREMTIME
1094 | NFS_ATTR_FATTR_PRECTIME);
1037 goto out_noforce; 1095 goto out_noforce;
1038 } 1096 }
1039 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && 1097 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
1040 (fattr->valid & NFS_ATTR_WCC_V4) == 0) { 1098 (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) {
1041 fattr->pre_change_attr = NFS_I(inode)->change_attr; 1099 fattr->pre_change_attr = NFS_I(inode)->change_attr;
1042 fattr->valid |= NFS_ATTR_WCC_V4; 1100 fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
1043 } 1101 }
1044 if ((fattr->valid & NFS_ATTR_FATTR) != 0 && 1102 if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
1045 (fattr->valid & NFS_ATTR_WCC) == 0) { 1103 (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) {
1046 memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); 1104 memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
1105 fattr->valid |= NFS_ATTR_FATTR_PRECTIME;
1106 }
1107 if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 &&
1108 (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) {
1047 memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); 1109 memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
1110 fattr->valid |= NFS_ATTR_FATTR_PREMTIME;
1111 }
1112 if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 &&
1113 (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) {
1048 fattr->pre_size = i_size_read(inode); 1114 fattr->pre_size = i_size_read(inode);
1049 fattr->valid |= NFS_ATTR_WCC; 1115 fattr->valid |= NFS_ATTR_FATTR_PRESIZE;
1050 } 1116 }
1051out_noforce: 1117out_noforce:
1052 status = nfs_post_op_update_inode_locked(inode, fattr); 1118 status = nfs_post_op_update_inode_locked(inode, fattr);
@@ -1078,18 +1144,18 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1078 __func__, inode->i_sb->s_id, inode->i_ino, 1144 __func__, inode->i_sb->s_id, inode->i_ino,
1079 atomic_read(&inode->i_count), fattr->valid); 1145 atomic_read(&inode->i_count), fattr->valid);
1080 1146
1081 if (nfsi->fileid != fattr->fileid) 1147 if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
1082 goto out_fileid; 1148 goto out_fileid;
1083 1149
1084 /* 1150 /*
1085 * Make sure the inode's type hasn't changed. 1151 * Make sure the inode's type hasn't changed.
1086 */ 1152 */
1087 if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) 1153 if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
1088 goto out_changed; 1154 goto out_changed;
1089 1155
1090 server = NFS_SERVER(inode); 1156 server = NFS_SERVER(inode);
1091 /* Update the fsid? */ 1157 /* Update the fsid? */
1092 if (S_ISDIR(inode->i_mode) && 1158 if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
1093 !nfs_fsid_equal(&server->fsid, &fattr->fsid) && 1159 !nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
1094 !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) 1160 !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags))
1095 server->fsid = fattr->fsid; 1161 server->fsid = fattr->fsid;
@@ -1099,14 +1165,27 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1099 */ 1165 */
1100 nfsi->read_cache_jiffies = fattr->time_start; 1166 nfsi->read_cache_jiffies = fattr->time_start;
1101 1167
1102 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME 1168 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME)))
1103 | NFS_INO_REVAL_PAGECACHE); 1169 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
1170 | NFS_INO_INVALID_ATIME
1171 | NFS_INO_REVAL_PAGECACHE);
1104 1172
1105 /* Do atomic weak cache consistency updates */ 1173 /* Do atomic weak cache consistency updates */
1106 nfs_wcc_update_inode(inode, fattr); 1174 nfs_wcc_update_inode(inode, fattr);
1107 1175
1108 /* More cache consistency checks */ 1176 /* More cache consistency checks */
1109 if (!(fattr->valid & NFS_ATTR_FATTR_V4)) { 1177 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
1178 if (nfsi->change_attr != fattr->change_attr) {
1179 dprintk("NFS: change_attr change on server for file %s/%ld\n",
1180 inode->i_sb->s_id, inode->i_ino);
1181 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1182 if (S_ISDIR(inode->i_mode))
1183 nfs_force_lookup_revalidate(inode);
1184 nfsi->change_attr = fattr->change_attr;
1185 }
1186 }
1187
1188 if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
1110 /* NFSv2/v3: Check if the mtime agrees */ 1189 /* NFSv2/v3: Check if the mtime agrees */
1111 if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { 1190 if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
1112 dprintk("NFS: mtime change on server for file %s/%ld\n", 1191 dprintk("NFS: mtime change on server for file %s/%ld\n",
@@ -1114,59 +1193,80 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1114 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 1193 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
1115 if (S_ISDIR(inode->i_mode)) 1194 if (S_ISDIR(inode->i_mode))
1116 nfs_force_lookup_revalidate(inode); 1195 nfs_force_lookup_revalidate(inode);
1196 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
1117 } 1197 }
1198 }
1199 if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
1118 /* If ctime has changed we should definitely clear access+acl caches */ 1200 /* If ctime has changed we should definitely clear access+acl caches */
1119 if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) 1201 if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
1120 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1202 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1121 } else if (nfsi->change_attr != fattr->change_attr) { 1203 /* and probably clear data for a directory too as utimes can cause
1122 dprintk("NFS: change_attr change on server for file %s/%ld\n", 1204 * havoc with our cache.
1123 inode->i_sb->s_id, inode->i_ino); 1205 */
1124 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1206 if (S_ISDIR(inode->i_mode)) {
1125 if (S_ISDIR(inode->i_mode)) 1207 invalid |= NFS_INO_INVALID_DATA;
1126 nfs_force_lookup_revalidate(inode); 1208 nfs_force_lookup_revalidate(inode);
1209 }
1210 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
1211 }
1127 } 1212 }
1128 1213
1129 /* Check if our cached file size is stale */ 1214 /* Check if our cached file size is stale */
1130 new_isize = nfs_size_to_loff_t(fattr->size); 1215 if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
1131 cur_isize = i_size_read(inode); 1216 new_isize = nfs_size_to_loff_t(fattr->size);
1132 if (new_isize != cur_isize) { 1217 cur_isize = i_size_read(inode);
1133 /* Do we perhaps have any outstanding writes, or has 1218 if (new_isize != cur_isize) {
1134 * the file grown beyond our last write? */ 1219 /* Do we perhaps have any outstanding writes, or has
1135 if (nfsi->npages == 0 || new_isize > cur_isize) { 1220 * the file grown beyond our last write? */
1136 i_size_write(inode, new_isize); 1221 if (nfsi->npages == 0 || new_isize > cur_isize) {
1137 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 1222 i_size_write(inode, new_isize);
1223 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
1224 }
1225 dprintk("NFS: isize change on server for file %s/%ld\n",
1226 inode->i_sb->s_id, inode->i_ino);
1138 } 1227 }
1139 dprintk("NFS: isize change on server for file %s/%ld\n",
1140 inode->i_sb->s_id, inode->i_ino);
1141 } 1228 }
1142 1229
1143 1230
1144 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 1231 if (fattr->valid & NFS_ATTR_FATTR_ATIME)
1145 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 1232 memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
1146 memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
1147 nfsi->change_attr = fattr->change_attr;
1148
1149 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) ||
1150 inode->i_uid != fattr->uid ||
1151 inode->i_gid != fattr->gid)
1152 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1153 1233
1154 if (inode->i_nlink != fattr->nlink) 1234 if (fattr->valid & NFS_ATTR_FATTR_MODE) {
1155 invalid |= NFS_INO_INVALID_ATTR; 1235 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
1236 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1237 inode->i_mode = fattr->mode;
1238 }
1239 }
1240 if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
1241 if (inode->i_uid != fattr->uid) {
1242 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1243 inode->i_uid = fattr->uid;
1244 }
1245 }
1246 if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
1247 if (inode->i_gid != fattr->gid) {
1248 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1249 inode->i_gid = fattr->gid;
1250 }
1251 }
1156 1252
1157 inode->i_mode = fattr->mode; 1253 if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
1158 inode->i_nlink = fattr->nlink; 1254 if (inode->i_nlink != fattr->nlink) {
1159 inode->i_uid = fattr->uid; 1255 invalid |= NFS_INO_INVALID_ATTR;
1160 inode->i_gid = fattr->gid; 1256 if (S_ISDIR(inode->i_mode))
1257 invalid |= NFS_INO_INVALID_DATA;
1258 inode->i_nlink = fattr->nlink;
1259 }
1260 }
1161 1261
1162 if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { 1262 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
1163 /* 1263 /*
1164 * report the blocks in 512byte units 1264 * report the blocks in 512byte units
1165 */ 1265 */
1166 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); 1266 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
1167 } else {
1168 inode->i_blocks = fattr->du.nfs2.blocks;
1169 } 1267 }
1268 if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
1269 inode->i_blocks = fattr->du.nfs2.blocks;
1170 1270
1171 /* Update attrtimeo value if we're out of the unstable period */ 1271 /* Update attrtimeo value if we're out of the unstable period */
1172 if (invalid & NFS_INO_INVALID_ATTR) { 1272 if (invalid & NFS_INO_INVALID_ATTR) {
@@ -1274,7 +1374,6 @@ static void init_once(void *foo)
1274 INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); 1374 INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
1275 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); 1375 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
1276 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); 1376 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
1277 nfsi->ncommit = 0;
1278 nfsi->npages = 0; 1377 nfsi->npages = 0;
1279 atomic_set(&nfsi->silly_count, 1); 1378 atomic_set(&nfsi->silly_count, 1);
1280 INIT_HLIST_HEAD(&nfsi->silly_list); 1379 INIT_HLIST_HEAD(&nfsi->silly_list);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 340ede8f608..2041f68ff1c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -152,6 +152,9 @@ extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
152extern struct rpc_procinfo nfs4_procedures[]; 152extern struct rpc_procinfo nfs4_procedures[];
153#endif 153#endif
154 154
155/* proc.c */
156void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
157
155/* dir.c */ 158/* dir.c */
156extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); 159extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
157 160
@@ -165,6 +168,7 @@ extern void nfs_clear_inode(struct inode *);
165extern void nfs4_clear_inode(struct inode *); 168extern void nfs4_clear_inode(struct inode *);
166#endif 169#endif
167void nfs_zap_acl_cache(struct inode *inode); 170void nfs_zap_acl_cache(struct inode *inode);
171extern int nfs_wait_bit_killable(void *word);
168 172
169/* super.c */ 173/* super.c */
170void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *); 174void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 28bab67d151..c862c9340f9 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -120,8 +120,8 @@ xdr_decode_time(__be32 *p, struct timespec *timep)
120static __be32 * 120static __be32 *
121xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) 121xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
122{ 122{
123 u32 rdev; 123 u32 rdev, type;
124 fattr->type = (enum nfs_ftype) ntohl(*p++); 124 type = ntohl(*p++);
125 fattr->mode = ntohl(*p++); 125 fattr->mode = ntohl(*p++);
126 fattr->nlink = ntohl(*p++); 126 fattr->nlink = ntohl(*p++);
127 fattr->uid = ntohl(*p++); 127 fattr->uid = ntohl(*p++);
@@ -136,10 +136,9 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
136 p = xdr_decode_time(p, &fattr->atime); 136 p = xdr_decode_time(p, &fattr->atime);
137 p = xdr_decode_time(p, &fattr->mtime); 137 p = xdr_decode_time(p, &fattr->mtime);
138 p = xdr_decode_time(p, &fattr->ctime); 138 p = xdr_decode_time(p, &fattr->ctime);
139 fattr->valid |= NFS_ATTR_FATTR; 139 fattr->valid |= NFS_ATTR_FATTR_V2;
140 fattr->rdev = new_decode_dev(rdev); 140 fattr->rdev = new_decode_dev(rdev);
141 if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) { 141 if (type == NFCHR && rdev == NFS2_FIFO_DEV) {
142 fattr->type = NFFIFO;
143 fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; 142 fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
144 fattr->rdev = 0; 143 fattr->rdev = 0;
145 } 144 }
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c55be7a7679..b82fe6847f1 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -834,4 +834,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
834 .commit_done = nfs3_commit_done, 834 .commit_done = nfs3_commit_done,
835 .lock = nfs3_proc_lock, 835 .lock = nfs3_proc_lock,
836 .clear_acl_cache = nfs3_forget_cached_acls, 836 .clear_acl_cache = nfs3_forget_cached_acls,
837 .close_context = nfs_close_context,
837}; 838};
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 6cdeacffde4..e6a1932c711 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -91,19 +91,15 @@
91/* 91/*
92 * Map file type to S_IFMT bits 92 * Map file type to S_IFMT bits
93 */ 93 */
94static struct { 94static const umode_t nfs_type2fmt[] = {
95 unsigned int mode; 95 [NF3BAD] = 0,
96 unsigned int nfs2type; 96 [NF3REG] = S_IFREG,
97} nfs_type2fmt[] = { 97 [NF3DIR] = S_IFDIR,
98 { 0, NFNON }, 98 [NF3BLK] = S_IFBLK,
99 { S_IFREG, NFREG }, 99 [NF3CHR] = S_IFCHR,
100 { S_IFDIR, NFDIR }, 100 [NF3LNK] = S_IFLNK,
101 { S_IFBLK, NFBLK }, 101 [NF3SOCK] = S_IFSOCK,
102 { S_IFCHR, NFCHR }, 102 [NF3FIFO] = S_IFIFO,
103 { S_IFLNK, NFLNK },
104 { S_IFSOCK, NFSOCK },
105 { S_IFIFO, NFFIFO },
106 { 0, NFBAD }
107}; 103};
108 104
109/* 105/*
@@ -148,13 +144,12 @@ static __be32 *
148xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) 144xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
149{ 145{
150 unsigned int type, major, minor; 146 unsigned int type, major, minor;
151 int fmode; 147 umode_t fmode;
152 148
153 type = ntohl(*p++); 149 type = ntohl(*p++);
154 if (type >= NF3BAD) 150 if (type > NF3FIFO)
155 type = NF3BAD; 151 type = NF3NON;
156 fmode = nfs_type2fmt[type].mode; 152 fmode = nfs_type2fmt[type];
157 fattr->type = nfs_type2fmt[type].nfs2type;
158 fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode; 153 fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode;
159 fattr->nlink = ntohl(*p++); 154 fattr->nlink = ntohl(*p++);
160 fattr->uid = ntohl(*p++); 155 fattr->uid = ntohl(*p++);
@@ -177,7 +172,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
177 p = xdr_decode_time3(p, &fattr->ctime); 172 p = xdr_decode_time3(p, &fattr->ctime);
178 173
179 /* Update the mode bits */ 174 /* Update the mode bits */
180 fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3); 175 fattr->valid |= NFS_ATTR_FATTR_V3;
181 return p; 176 return p;
182} 177}
183 178
@@ -233,7 +228,9 @@ xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr)
233 p = xdr_decode_hyper(p, &fattr->pre_size); 228 p = xdr_decode_hyper(p, &fattr->pre_size);
234 p = xdr_decode_time3(p, &fattr->pre_mtime); 229 p = xdr_decode_time3(p, &fattr->pre_mtime);
235 p = xdr_decode_time3(p, &fattr->pre_ctime); 230 p = xdr_decode_time3(p, &fattr->pre_ctime);
236 fattr->valid |= NFS_ATTR_WCC; 231 fattr->valid |= NFS_ATTR_FATTR_PRESIZE
232 | NFS_ATTR_FATTR_PREMTIME
233 | NFS_ATTR_FATTR_PRECTIME;
237 return p; 234 return p;
238} 235}
239 236
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8dde84b988d..97bacccff57 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -193,14 +193,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
193 kunmap_atomic(start, KM_USER0); 193 kunmap_atomic(start, KM_USER0);
194} 194}
195 195
196static int nfs4_wait_bit_killable(void *word)
197{
198 if (fatal_signal_pending(current))
199 return -ERESTARTSYS;
200 schedule();
201 return 0;
202}
203
204static int nfs4_wait_clnt_recover(struct nfs_client *clp) 196static int nfs4_wait_clnt_recover(struct nfs_client *clp)
205{ 197{
206 int res; 198 int res;
@@ -208,7 +200,7 @@ static int nfs4_wait_clnt_recover(struct nfs_client *clp)
208 might_sleep(); 200 might_sleep();
209 201
210 res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, 202 res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
211 nfs4_wait_bit_killable, TASK_KILLABLE); 203 nfs_wait_bit_killable, TASK_KILLABLE);
212 return res; 204 return res;
213} 205}
214 206
@@ -1439,7 +1431,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1439 if (calldata->arg.seqid == NULL) 1431 if (calldata->arg.seqid == NULL)
1440 goto out_free_calldata; 1432 goto out_free_calldata;
1441 calldata->arg.fmode = 0; 1433 calldata->arg.fmode = 0;
1442 calldata->arg.bitmask = server->attr_bitmask; 1434 calldata->arg.bitmask = server->cache_consistency_bitmask;
1443 calldata->res.fattr = &calldata->fattr; 1435 calldata->res.fattr = &calldata->fattr;
1444 calldata->res.seqid = calldata->arg.seqid; 1436 calldata->res.seqid = calldata->arg.seqid;
1445 calldata->res.server = server; 1437 calldata->res.server = server;
@@ -1580,6 +1572,15 @@ out_drop:
1580 return 0; 1572 return 0;
1581} 1573}
1582 1574
1575void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
1576{
1577 if (ctx->state == NULL)
1578 return;
1579 if (is_sync)
1580 nfs4_close_sync(&ctx->path, ctx->state, ctx->mode);
1581 else
1582 nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
1583}
1583 1584
1584static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) 1585static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
1585{ 1586{
@@ -1600,6 +1601,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
1600 server->caps |= NFS_CAP_HARDLINKS; 1601 server->caps |= NFS_CAP_HARDLINKS;
1601 if (res.has_symlinks != 0) 1602 if (res.has_symlinks != 0)
1602 server->caps |= NFS_CAP_SYMLINKS; 1603 server->caps |= NFS_CAP_SYMLINKS;
1604 memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
1605 server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
1606 server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
1603 server->acl_bitmask = res.acl_bitmask; 1607 server->acl_bitmask = res.acl_bitmask;
1604 } 1608 }
1605 return status; 1609 return status;
@@ -2079,7 +2083,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
2079 struct nfs_removeargs *args = msg->rpc_argp; 2083 struct nfs_removeargs *args = msg->rpc_argp;
2080 struct nfs_removeres *res = msg->rpc_resp; 2084 struct nfs_removeres *res = msg->rpc_resp;
2081 2085
2082 args->bitmask = server->attr_bitmask; 2086 args->bitmask = server->cache_consistency_bitmask;
2083 res->server = server; 2087 res->server = server;
2084 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; 2088 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
2085} 2089}
@@ -2323,7 +2327,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2323 .pages = &page, 2327 .pages = &page,
2324 .pgbase = 0, 2328 .pgbase = 0,
2325 .count = count, 2329 .count = count,
2326 .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, 2330 .bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask,
2327 }; 2331 };
2328 struct nfs4_readdir_res res; 2332 struct nfs4_readdir_res res;
2329 struct rpc_message msg = { 2333 struct rpc_message msg = {
@@ -2552,7 +2556,7 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
2552{ 2556{
2553 struct nfs_server *server = NFS_SERVER(data->inode); 2557 struct nfs_server *server = NFS_SERVER(data->inode);
2554 2558
2555 data->args.bitmask = server->attr_bitmask; 2559 data->args.bitmask = server->cache_consistency_bitmask;
2556 data->res.server = server; 2560 data->res.server = server;
2557 data->timestamp = jiffies; 2561 data->timestamp = jiffies;
2558 2562
@@ -2575,7 +2579,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
2575{ 2579{
2576 struct nfs_server *server = NFS_SERVER(data->inode); 2580 struct nfs_server *server = NFS_SERVER(data->inode);
2577 2581
2578 data->args.bitmask = server->attr_bitmask; 2582 data->args.bitmask = server->cache_consistency_bitmask;
2579 data->res.server = server; 2583 data->res.server = server;
2580 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; 2584 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
2581} 2585}
@@ -3678,6 +3682,19 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
3678 return len; 3682 return len;
3679} 3683}
3680 3684
3685static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr)
3686{
3687 if (!((fattr->valid & NFS_ATTR_FATTR_FILEID) &&
3688 (fattr->valid & NFS_ATTR_FATTR_FSID) &&
3689 (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)))
3690 return;
3691
3692 fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
3693 NFS_ATTR_FATTR_NLINK;
3694 fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
3695 fattr->nlink = 2;
3696}
3697
3681int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 3698int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
3682 struct nfs4_fs_locations *fs_locations, struct page *page) 3699 struct nfs4_fs_locations *fs_locations, struct page *page)
3683{ 3700{
@@ -3704,6 +3721,7 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
3704 fs_locations->server = server; 3721 fs_locations->server = server;
3705 fs_locations->nlocations = 0; 3722 fs_locations->nlocations = 0;
3706 status = rpc_call_sync(server->client, &msg, 0); 3723 status = rpc_call_sync(server->client, &msg, 0);
3724 nfs_fixup_referral_attributes(&fs_locations->fattr);
3707 dprintk("%s: returned status = %d\n", __func__, status); 3725 dprintk("%s: returned status = %d\n", __func__, status);
3708 return status; 3726 return status;
3709} 3727}
@@ -3767,6 +3785,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
3767 .commit_done = nfs4_commit_done, 3785 .commit_done = nfs4_commit_done,
3768 .lock = nfs4_proc_lock, 3786 .lock = nfs4_proc_lock,
3769 .clear_acl_cache = nfs4_zap_acl_attr, 3787 .clear_acl_cache = nfs4_zap_acl_attr,
3788 .close_context = nfs4_close_context,
3770}; 3789};
3771 3790
3772/* 3791/*
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2022fe47966..0298e909559 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -62,8 +62,14 @@ static LIST_HEAD(nfs4_clientid_list);
62 62
63static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred) 63static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
64{ 64{
65 int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, 65 unsigned short port;
66 nfs_callback_tcpport, cred); 66 int status;
67
68 port = nfs_callback_tcpport;
69 if (clp->cl_addr.ss_family == AF_INET6)
70 port = nfs_callback_tcpport6;
71
72 status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred);
67 if (status == 0) 73 if (status == 0)
68 status = nfs4_proc_setclientid_confirm(clp, cred); 74 status = nfs4_proc_setclientid_confirm(clp, cred);
69 if (status == 0) 75 if (status == 0)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d1e4c8f8a0a..1690f0e44b9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -522,20 +522,17 @@ static int nfs4_stat_to_errno(int);
522 decode_lookup_maxsz + \ 522 decode_lookup_maxsz + \
523 decode_fs_locations_maxsz) 523 decode_fs_locations_maxsz)
524 524
525static struct { 525static const umode_t nfs_type2fmt[] = {
526 unsigned int mode; 526 [NF4BAD] = 0,
527 unsigned int nfs2type; 527 [NF4REG] = S_IFREG,
528} nfs_type2fmt[] = { 528 [NF4DIR] = S_IFDIR,
529 { 0, NFNON }, 529 [NF4BLK] = S_IFBLK,
530 { S_IFREG, NFREG }, 530 [NF4CHR] = S_IFCHR,
531 { S_IFDIR, NFDIR }, 531 [NF4LNK] = S_IFLNK,
532 { S_IFBLK, NFBLK }, 532 [NF4SOCK] = S_IFSOCK,
533 { S_IFCHR, NFCHR }, 533 [NF4FIFO] = S_IFIFO,
534 { S_IFLNK, NFLNK }, 534 [NF4ATTRDIR] = 0,
535 { S_IFSOCK, NFSOCK }, 535 [NF4NAMEDATTR] = 0,
536 { S_IFIFO, NFFIFO },
537 { 0, NFNON },
538 { 0, NFNON },
539}; 536};
540 537
541struct compound_hdr { 538struct compound_hdr {
@@ -2160,6 +2157,7 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
2160static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type) 2157static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type)
2161{ 2158{
2162 __be32 *p; 2159 __be32 *p;
2160 int ret = 0;
2163 2161
2164 *type = 0; 2162 *type = 0;
2165 if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) 2163 if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
@@ -2172,14 +2170,16 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
2172 return -EIO; 2170 return -EIO;
2173 } 2171 }
2174 bitmap[0] &= ~FATTR4_WORD0_TYPE; 2172 bitmap[0] &= ~FATTR4_WORD0_TYPE;
2173 ret = NFS_ATTR_FATTR_TYPE;
2175 } 2174 }
2176 dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type].nfs2type); 2175 dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
2177 return 0; 2176 return ret;
2178} 2177}
2179 2178
2180static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) 2179static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
2181{ 2180{
2182 __be32 *p; 2181 __be32 *p;
2182 int ret = 0;
2183 2183
2184 *change = 0; 2184 *change = 0;
2185 if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) 2185 if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
@@ -2188,15 +2188,17 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2188 READ_BUF(8); 2188 READ_BUF(8);
2189 READ64(*change); 2189 READ64(*change);
2190 bitmap[0] &= ~FATTR4_WORD0_CHANGE; 2190 bitmap[0] &= ~FATTR4_WORD0_CHANGE;
2191 ret = NFS_ATTR_FATTR_CHANGE;
2191 } 2192 }
2192 dprintk("%s: change attribute=%Lu\n", __func__, 2193 dprintk("%s: change attribute=%Lu\n", __func__,
2193 (unsigned long long)*change); 2194 (unsigned long long)*change);
2194 return 0; 2195 return ret;
2195} 2196}
2196 2197
2197static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) 2198static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
2198{ 2199{
2199 __be32 *p; 2200 __be32 *p;
2201 int ret = 0;
2200 2202
2201 *size = 0; 2203 *size = 0;
2202 if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) 2204 if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
@@ -2205,9 +2207,10 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
2205 READ_BUF(8); 2207 READ_BUF(8);
2206 READ64(*size); 2208 READ64(*size);
2207 bitmap[0] &= ~FATTR4_WORD0_SIZE; 2209 bitmap[0] &= ~FATTR4_WORD0_SIZE;
2210 ret = NFS_ATTR_FATTR_SIZE;
2208 } 2211 }
2209 dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); 2212 dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
2210 return 0; 2213 return ret;
2211} 2214}
2212 2215
2213static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2216static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2245,6 +2248,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
2245static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) 2248static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
2246{ 2249{
2247 __be32 *p; 2250 __be32 *p;
2251 int ret = 0;
2248 2252
2249 fsid->major = 0; 2253 fsid->major = 0;
2250 fsid->minor = 0; 2254 fsid->minor = 0;
@@ -2255,11 +2259,12 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
2255 READ64(fsid->major); 2259 READ64(fsid->major);
2256 READ64(fsid->minor); 2260 READ64(fsid->minor);
2257 bitmap[0] &= ~FATTR4_WORD0_FSID; 2261 bitmap[0] &= ~FATTR4_WORD0_FSID;
2262 ret = NFS_ATTR_FATTR_FSID;
2258 } 2263 }
2259 dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__, 2264 dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__,
2260 (unsigned long long)fsid->major, 2265 (unsigned long long)fsid->major,
2261 (unsigned long long)fsid->minor); 2266 (unsigned long long)fsid->minor);
2262 return 0; 2267 return ret;
2263} 2268}
2264 2269
2265static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2270static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2297,6 +2302,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
2297static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) 2302static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
2298{ 2303{
2299 __be32 *p; 2304 __be32 *p;
2305 int ret = 0;
2300 2306
2301 *fileid = 0; 2307 *fileid = 0;
2302 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) 2308 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
@@ -2305,14 +2311,16 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2305 READ_BUF(8); 2311 READ_BUF(8);
2306 READ64(*fileid); 2312 READ64(*fileid);
2307 bitmap[0] &= ~FATTR4_WORD0_FILEID; 2313 bitmap[0] &= ~FATTR4_WORD0_FILEID;
2314 ret = NFS_ATTR_FATTR_FILEID;
2308 } 2315 }
2309 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); 2316 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
2310 return 0; 2317 return ret;
2311} 2318}
2312 2319
2313static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) 2320static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
2314{ 2321{
2315 __be32 *p; 2322 __be32 *p;
2323 int ret = 0;
2316 2324
2317 *fileid = 0; 2325 *fileid = 0;
2318 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) 2326 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
@@ -2321,9 +2329,10 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
2321 READ_BUF(8); 2329 READ_BUF(8);
2322 READ64(*fileid); 2330 READ64(*fileid);
2323 bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; 2331 bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
2332 ret = NFS_ATTR_FATTR_FILEID;
2324 } 2333 }
2325 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); 2334 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
2326 return 0; 2335 return ret;
2327} 2336}
2328 2337
2329static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2338static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2479,6 +2488,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
2479 if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES) 2488 if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES)
2480 res->nlocations++; 2489 res->nlocations++;
2481 } 2490 }
2491 if (res->nlocations != 0)
2492 status = NFS_ATTR_FATTR_V4_REFERRAL;
2482out: 2493out:
2483 dprintk("%s: fs_locations done, error = %d\n", __func__, status); 2494 dprintk("%s: fs_locations done, error = %d\n", __func__, status);
2484 return status; 2495 return status;
@@ -2580,26 +2591,30 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
2580 return status; 2591 return status;
2581} 2592}
2582 2593
2583static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode) 2594static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
2584{ 2595{
2596 uint32_t tmp;
2585 __be32 *p; 2597 __be32 *p;
2598 int ret = 0;
2586 2599
2587 *mode = 0; 2600 *mode = 0;
2588 if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) 2601 if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
2589 return -EIO; 2602 return -EIO;
2590 if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { 2603 if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
2591 READ_BUF(4); 2604 READ_BUF(4);
2592 READ32(*mode); 2605 READ32(tmp);
2593 *mode &= ~S_IFMT; 2606 *mode = tmp & ~S_IFMT;
2594 bitmap[1] &= ~FATTR4_WORD1_MODE; 2607 bitmap[1] &= ~FATTR4_WORD1_MODE;
2608 ret = NFS_ATTR_FATTR_MODE;
2595 } 2609 }
2596 dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); 2610 dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
2597 return 0; 2611 return ret;
2598} 2612}
2599 2613
2600static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) 2614static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
2601{ 2615{
2602 __be32 *p; 2616 __be32 *p;
2617 int ret = 0;
2603 2618
2604 *nlink = 1; 2619 *nlink = 1;
2605 if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) 2620 if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
@@ -2608,15 +2623,17 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
2608 READ_BUF(4); 2623 READ_BUF(4);
2609 READ32(*nlink); 2624 READ32(*nlink);
2610 bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; 2625 bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
2626 ret = NFS_ATTR_FATTR_NLINK;
2611 } 2627 }
2612 dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); 2628 dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
2613 return 0; 2629 return ret;
2614} 2630}
2615 2631
2616static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid) 2632static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid)
2617{ 2633{
2618 uint32_t len; 2634 uint32_t len;
2619 __be32 *p; 2635 __be32 *p;
2636 int ret = 0;
2620 2637
2621 *uid = -2; 2638 *uid = -2;
2622 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) 2639 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
@@ -2626,7 +2643,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2626 READ32(len); 2643 READ32(len);
2627 READ_BUF(len); 2644 READ_BUF(len);
2628 if (len < XDR_MAX_NETOBJ) { 2645 if (len < XDR_MAX_NETOBJ) {
2629 if (nfs_map_name_to_uid(clp, (char *)p, len, uid) != 0) 2646 if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
2647 ret = NFS_ATTR_FATTR_OWNER;
2648 else
2630 dprintk("%s: nfs_map_name_to_uid failed!\n", 2649 dprintk("%s: nfs_map_name_to_uid failed!\n",
2631 __func__); 2650 __func__);
2632 } else 2651 } else
@@ -2635,13 +2654,14 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2635 bitmap[1] &= ~FATTR4_WORD1_OWNER; 2654 bitmap[1] &= ~FATTR4_WORD1_OWNER;
2636 } 2655 }
2637 dprintk("%s: uid=%d\n", __func__, (int)*uid); 2656 dprintk("%s: uid=%d\n", __func__, (int)*uid);
2638 return 0; 2657 return ret;
2639} 2658}
2640 2659
2641static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid) 2660static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid)
2642{ 2661{
2643 uint32_t len; 2662 uint32_t len;
2644 __be32 *p; 2663 __be32 *p;
2664 int ret = 0;
2645 2665
2646 *gid = -2; 2666 *gid = -2;
2647 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) 2667 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
@@ -2651,7 +2671,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2651 READ32(len); 2671 READ32(len);
2652 READ_BUF(len); 2672 READ_BUF(len);
2653 if (len < XDR_MAX_NETOBJ) { 2673 if (len < XDR_MAX_NETOBJ) {
2654 if (nfs_map_group_to_gid(clp, (char *)p, len, gid) != 0) 2674 if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
2675 ret = NFS_ATTR_FATTR_GROUP;
2676 else
2655 dprintk("%s: nfs_map_group_to_gid failed!\n", 2677 dprintk("%s: nfs_map_group_to_gid failed!\n",
2656 __func__); 2678 __func__);
2657 } else 2679 } else
@@ -2660,13 +2682,14 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2660 bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; 2682 bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
2661 } 2683 }
2662 dprintk("%s: gid=%d\n", __func__, (int)*gid); 2684 dprintk("%s: gid=%d\n", __func__, (int)*gid);
2663 return 0; 2685 return ret;
2664} 2686}
2665 2687
2666static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) 2688static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
2667{ 2689{
2668 uint32_t major = 0, minor = 0; 2690 uint32_t major = 0, minor = 0;
2669 __be32 *p; 2691 __be32 *p;
2692 int ret = 0;
2670 2693
2671 *rdev = MKDEV(0,0); 2694 *rdev = MKDEV(0,0);
2672 if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U))) 2695 if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U)))
@@ -2681,9 +2704,10 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
2681 if (MAJOR(tmp) == major && MINOR(tmp) == minor) 2704 if (MAJOR(tmp) == major && MINOR(tmp) == minor)
2682 *rdev = tmp; 2705 *rdev = tmp;
2683 bitmap[1] &= ~ FATTR4_WORD1_RAWDEV; 2706 bitmap[1] &= ~ FATTR4_WORD1_RAWDEV;
2707 ret = NFS_ATTR_FATTR_RDEV;
2684 } 2708 }
2685 dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); 2709 dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
2686 return 0; 2710 return ret;
2687} 2711}
2688 2712
2689static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2713static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2740,6 +2764,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
2740static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) 2764static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
2741{ 2765{
2742 __be32 *p; 2766 __be32 *p;
2767 int ret = 0;
2743 2768
2744 *used = 0; 2769 *used = 0;
2745 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) 2770 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
@@ -2748,10 +2773,11 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
2748 READ_BUF(8); 2773 READ_BUF(8);
2749 READ64(*used); 2774 READ64(*used);
2750 bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; 2775 bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
2776 ret = NFS_ATTR_FATTR_SPACE_USED;
2751 } 2777 }
2752 dprintk("%s: space used=%Lu\n", __func__, 2778 dprintk("%s: space used=%Lu\n", __func__,
2753 (unsigned long long)*used); 2779 (unsigned long long)*used);
2754 return 0; 2780 return ret;
2755} 2781}
2756 2782
2757static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) 2783static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
@@ -2778,6 +2804,8 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str
2778 return -EIO; 2804 return -EIO;
2779 if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) { 2805 if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) {
2780 status = decode_attr_time(xdr, time); 2806 status = decode_attr_time(xdr, time);
2807 if (status == 0)
2808 status = NFS_ATTR_FATTR_ATIME;
2781 bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS; 2809 bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS;
2782 } 2810 }
2783 dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec); 2811 dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec);
@@ -2794,6 +2822,8 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
2794 return -EIO; 2822 return -EIO;
2795 if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) { 2823 if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) {
2796 status = decode_attr_time(xdr, time); 2824 status = decode_attr_time(xdr, time);
2825 if (status == 0)
2826 status = NFS_ATTR_FATTR_CTIME;
2797 bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA; 2827 bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA;
2798 } 2828 }
2799 dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec); 2829 dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec);
@@ -2810,6 +2840,8 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str
2810 return -EIO; 2840 return -EIO;
2811 if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) { 2841 if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) {
2812 status = decode_attr_time(xdr, time); 2842 status = decode_attr_time(xdr, time);
2843 if (status == 0)
2844 status = NFS_ATTR_FATTR_MTIME;
2813 bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY; 2845 bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY;
2814 } 2846 }
2815 dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec); 2847 dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec);
@@ -2994,63 +3026,116 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
2994 uint32_t attrlen, 3026 uint32_t attrlen,
2995 bitmap[2] = {0}, 3027 bitmap[2] = {0},
2996 type; 3028 type;
2997 int status, fmode = 0; 3029 int status;
3030 umode_t fmode = 0;
2998 uint64_t fileid; 3031 uint64_t fileid;
2999 3032
3000 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 3033 status = decode_op_hdr(xdr, OP_GETATTR);
3001 goto xdr_error; 3034 if (status < 0)
3002 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
3003 goto xdr_error; 3035 goto xdr_error;
3004 3036
3005 fattr->bitmap[0] = bitmap[0]; 3037 status = decode_attr_bitmap(xdr, bitmap);
3006 fattr->bitmap[1] = bitmap[1]; 3038 if (status < 0)
3039 goto xdr_error;
3007 3040
3008 if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) 3041 status = decode_attr_length(xdr, &attrlen, &savep);
3042 if (status < 0)
3009 goto xdr_error; 3043 goto xdr_error;
3010 3044
3011 3045
3012 if ((status = decode_attr_type(xdr, bitmap, &type)) != 0) 3046 status = decode_attr_type(xdr, bitmap, &type);
3047 if (status < 0)
3013 goto xdr_error; 3048 goto xdr_error;
3014 fattr->type = nfs_type2fmt[type].nfs2type; 3049 fattr->mode = 0;
3015 fmode = nfs_type2fmt[type].mode; 3050 if (status != 0) {
3051 fattr->mode |= nfs_type2fmt[type];
3052 fattr->valid |= status;
3053 }
3016 3054
3017 if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0) 3055 status = decode_attr_change(xdr, bitmap, &fattr->change_attr);
3056 if (status < 0)
3018 goto xdr_error; 3057 goto xdr_error;
3019 if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0) 3058 fattr->valid |= status;
3059
3060 status = decode_attr_size(xdr, bitmap, &fattr->size);
3061 if (status < 0)
3020 goto xdr_error; 3062 goto xdr_error;
3021 if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0) 3063 fattr->valid |= status;
3064
3065 status = decode_attr_fsid(xdr, bitmap, &fattr->fsid);
3066 if (status < 0)
3022 goto xdr_error; 3067 goto xdr_error;
3023 if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0) 3068 fattr->valid |= status;
3069
3070 status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
3071 if (status < 0)
3024 goto xdr_error; 3072 goto xdr_error;
3025 if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, 3073 fattr->valid |= status;
3074
3075 status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
3026 struct nfs4_fs_locations, 3076 struct nfs4_fs_locations,
3027 fattr))) != 0) 3077 fattr));
3078 if (status < 0)
3028 goto xdr_error; 3079 goto xdr_error;
3029 if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0) 3080 fattr->valid |= status;
3081
3082 status = decode_attr_mode(xdr, bitmap, &fmode);
3083 if (status < 0)
3030 goto xdr_error; 3084 goto xdr_error;
3031 fattr->mode |= fmode; 3085 if (status != 0) {
3032 if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0) 3086 fattr->mode |= fmode;
3087 fattr->valid |= status;
3088 }
3089
3090 status = decode_attr_nlink(xdr, bitmap, &fattr->nlink);
3091 if (status < 0)
3033 goto xdr_error; 3092 goto xdr_error;
3034 if ((status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid)) != 0) 3093 fattr->valid |= status;
3094
3095 status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid);
3096 if (status < 0)
3035 goto xdr_error; 3097 goto xdr_error;
3036 if ((status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid)) != 0) 3098 fattr->valid |= status;
3099
3100 status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid);
3101 if (status < 0)
3037 goto xdr_error; 3102 goto xdr_error;
3038 if ((status = decode_attr_rdev(xdr, bitmap, &fattr->rdev)) != 0) 3103 fattr->valid |= status;
3104
3105 status = decode_attr_rdev(xdr, bitmap, &fattr->rdev);
3106 if (status < 0)
3039 goto xdr_error; 3107 goto xdr_error;
3040 if ((status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used)) != 0) 3108 fattr->valid |= status;
3109
3110 status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used);
3111 if (status < 0)
3041 goto xdr_error; 3112 goto xdr_error;
3042 if ((status = decode_attr_time_access(xdr, bitmap, &fattr->atime)) != 0) 3113 fattr->valid |= status;
3114
3115 status = decode_attr_time_access(xdr, bitmap, &fattr->atime);
3116 if (status < 0)
3043 goto xdr_error; 3117 goto xdr_error;
3044 if ((status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime)) != 0) 3118 fattr->valid |= status;
3119
3120 status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime);
3121 if (status < 0)
3045 goto xdr_error; 3122 goto xdr_error;
3046 if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0) 3123 fattr->valid |= status;
3124
3125 status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime);
3126 if (status < 0)
3047 goto xdr_error; 3127 goto xdr_error;
3048 if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0) 3128 fattr->valid |= status;
3129
3130 status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid);
3131 if (status < 0)
3049 goto xdr_error; 3132 goto xdr_error;
3050 if (fattr->fileid == 0 && fileid != 0) 3133 if (status != 0 && !(fattr->valid & status)) {
3051 fattr->fileid = fileid; 3134 fattr->fileid = fileid;
3052 if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) 3135 fattr->valid |= status;
3053 fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4; 3136 }
3137
3138 status = verify_attr_len(xdr, savep, attrlen);
3054xdr_error: 3139xdr_error:
3055 dprintk("%s: xdr returned %d\n", __func__, -status); 3140 dprintk("%s: xdr returned %d\n", __func__, -status);
3056 return status; 3141 return status;
@@ -4078,9 +4163,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se
4078 status = decode_setattr(&xdr, res); 4163 status = decode_setattr(&xdr, res);
4079 if (status) 4164 if (status)
4080 goto out; 4165 goto out;
4081 status = decode_getfattr(&xdr, res->fattr, res->server); 4166 decode_getfattr(&xdr, res->fattr, res->server);
4082 if (status == NFS4ERR_DELAY)
4083 status = 0;
4084out: 4167out:
4085 return status; 4168 return status;
4086} 4169}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 7f079209d70..e2975939126 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -176,17 +176,6 @@ void nfs_release_request(struct nfs_page *req)
176 kref_put(&req->wb_kref, nfs_free_request); 176 kref_put(&req->wb_kref, nfs_free_request);
177} 177}
178 178
179static int nfs_wait_bit_killable(void *word)
180{
181 int ret = 0;
182
183 if (fatal_signal_pending(current))
184 ret = -ERESTARTSYS;
185 else
186 schedule();
187 return ret;
188}
189
190/** 179/**
191 * nfs_wait_on_request - Wait for a request to complete. 180 * nfs_wait_on_request - Wait for a request to complete.
192 * @req: request to wait upon. 181 * @req: request to wait upon.
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 193465210d7..7be72d90d49 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -663,4 +663,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
663 .commit_setup = nfs_proc_commit_setup, 663 .commit_setup = nfs_proc_commit_setup,
664 .lock = nfs_proc_lock, 664 .lock = nfs_proc_lock,
665 .lock_check_bounds = nfs_lock_check_bounds, 665 .lock_check_bounds = nfs_lock_check_bounds,
666 .close_context = nfs_close_context,
666}; 667};
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d6686f4786d..0942fcbbad3 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1018,6 +1018,7 @@ static int nfs_parse_mount_options(char *raw,
1018 case Opt_rdma: 1018 case Opt_rdma:
1019 mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ 1019 mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
1020 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1020 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
1021 xprt_load_transport(p);
1021 break; 1022 break;
1022 case Opt_acl: 1023 case Opt_acl:
1023 mnt->flags &= ~NFS_MOUNT_NOACL; 1024 mnt->flags &= ~NFS_MOUNT_NOACL;
@@ -1205,12 +1206,14 @@ static int nfs_parse_mount_options(char *raw,
1205 /* vector side protocols to TCP */ 1206 /* vector side protocols to TCP */
1206 mnt->flags |= NFS_MOUNT_TCP; 1207 mnt->flags |= NFS_MOUNT_TCP;
1207 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1208 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
1209 xprt_load_transport(string);
1208 break; 1210 break;
1209 default: 1211 default:
1210 errors++; 1212 errors++;
1211 dfprintk(MOUNT, "NFS: unrecognized " 1213 dfprintk(MOUNT, "NFS: unrecognized "
1212 "transport protocol\n"); 1214 "transport protocol\n");
1213 } 1215 }
1216 kfree(string);
1214 break; 1217 break;
1215 case Opt_mountproto: 1218 case Opt_mountproto:
1216 string = match_strdup(args); 1219 string = match_strdup(args);
@@ -1218,7 +1221,6 @@ static int nfs_parse_mount_options(char *raw,
1218 goto out_nomem; 1221 goto out_nomem;
1219 token = match_token(string, 1222 token = match_token(string,
1220 nfs_xprt_protocol_tokens, args); 1223 nfs_xprt_protocol_tokens, args);
1221 kfree(string);
1222 1224
1223 switch (token) { 1225 switch (token) {
1224 case Opt_xprt_udp: 1226 case Opt_xprt_udp:
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9f9845859fc..e560a78995a 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -313,19 +313,34 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
313int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) 313int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
314{ 314{
315 struct inode *inode = mapping->host; 315 struct inode *inode = mapping->host;
316 unsigned long *bitlock = &NFS_I(inode)->flags;
316 struct nfs_pageio_descriptor pgio; 317 struct nfs_pageio_descriptor pgio;
317 int err; 318 int err;
318 319
320 /* Stop dirtying of new pages while we sync */
321 err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING,
322 nfs_wait_bit_killable, TASK_KILLABLE);
323 if (err)
324 goto out_err;
325
319 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); 326 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
320 327
321 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); 328 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
322 err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); 329 err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
323 nfs_pageio_complete(&pgio); 330 nfs_pageio_complete(&pgio);
331
332 clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
333 smp_mb__after_clear_bit();
334 wake_up_bit(bitlock, NFS_INO_FLUSHING);
335
324 if (err < 0) 336 if (err < 0)
325 return err; 337 goto out_err;
326 if (pgio.pg_error < 0) 338 err = pgio.pg_error;
327 return pgio.pg_error; 339 if (err < 0)
340 goto out_err;
328 return 0; 341 return 0;
342out_err:
343 return err;
329} 344}
330 345
331/* 346/*
@@ -404,7 +419,6 @@ nfs_mark_request_commit(struct nfs_page *req)
404 struct nfs_inode *nfsi = NFS_I(inode); 419 struct nfs_inode *nfsi = NFS_I(inode);
405 420
406 spin_lock(&inode->i_lock); 421 spin_lock(&inode->i_lock);
407 nfsi->ncommit++;
408 set_bit(PG_CLEAN, &(req)->wb_flags); 422 set_bit(PG_CLEAN, &(req)->wb_flags);
409 radix_tree_tag_set(&nfsi->nfs_page_tree, 423 radix_tree_tag_set(&nfsi->nfs_page_tree,
410 req->wb_index, 424 req->wb_index,
@@ -524,6 +538,12 @@ static void nfs_cancel_commit_list(struct list_head *head)
524} 538}
525 539
526#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 540#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
541static int
542nfs_need_commit(struct nfs_inode *nfsi)
543{
544 return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT);
545}
546
527/* 547/*
528 * nfs_scan_commit - Scan an inode for commit requests 548 * nfs_scan_commit - Scan an inode for commit requests
529 * @inode: NFS inode to scan 549 * @inode: NFS inode to scan
@@ -538,16 +558,18 @@ static int
538nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) 558nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
539{ 559{
540 struct nfs_inode *nfsi = NFS_I(inode); 560 struct nfs_inode *nfsi = NFS_I(inode);
541 int res = 0;
542 561
543 if (nfsi->ncommit != 0) { 562 if (!nfs_need_commit(nfsi))
544 res = nfs_scan_list(nfsi, dst, idx_start, npages, 563 return 0;
545 NFS_PAGE_TAG_COMMIT); 564
546 nfsi->ncommit -= res; 565 return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
547 }
548 return res;
549} 566}
550#else 567#else
568static inline int nfs_need_commit(struct nfs_inode *nfsi)
569{
570 return 0;
571}
572
551static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) 573static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
552{ 574{
553 return 0; 575 return 0;
@@ -820,7 +842,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
820 data->args.stable = NFS_UNSTABLE; 842 data->args.stable = NFS_UNSTABLE;
821 if (how & FLUSH_STABLE) { 843 if (how & FLUSH_STABLE) {
822 data->args.stable = NFS_DATA_SYNC; 844 data->args.stable = NFS_DATA_SYNC;
823 if (!NFS_I(inode)->ncommit) 845 if (!nfs_need_commit(NFS_I(inode)))
824 data->args.stable = NFS_FILE_SYNC; 846 data->args.stable = NFS_FILE_SYNC;
825 } 847 }
826 848
@@ -1425,18 +1447,13 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
1425{ 1447{
1426 struct writeback_control wbc = { 1448 struct writeback_control wbc = {
1427 .bdi = mapping->backing_dev_info, 1449 .bdi = mapping->backing_dev_info,
1428 .sync_mode = WB_SYNC_NONE, 1450 .sync_mode = WB_SYNC_ALL,
1429 .nr_to_write = LONG_MAX, 1451 .nr_to_write = LONG_MAX,
1430 .range_start = 0, 1452 .range_start = 0,
1431 .range_end = LLONG_MAX, 1453 .range_end = LLONG_MAX,
1432 .for_writepages = 1, 1454 .for_writepages = 1,
1433 }; 1455 };
1434 int ret;
1435 1456
1436 ret = __nfs_write_mapping(mapping, &wbc, how);
1437 if (ret < 0)
1438 return ret;
1439 wbc.sync_mode = WB_SYNC_ALL;
1440 return __nfs_write_mapping(mapping, &wbc, how); 1457 return __nfs_write_mapping(mapping, &wbc, how);
1441} 1458}
1442 1459
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3d93b2064ce..a4ed8644d69 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -938,10 +938,12 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
938 char transport[16]; 938 char transport[16];
939 int port; 939 int port;
940 if (sscanf(buf, "%15s %4d", transport, &port) == 2) { 940 if (sscanf(buf, "%15s %4d", transport, &port) == 2) {
941 if (port < 1 || port > 65535)
942 return -EINVAL;
941 err = nfsd_create_serv(); 943 err = nfsd_create_serv();
942 if (!err) { 944 if (!err) {
943 err = svc_create_xprt(nfsd_serv, 945 err = svc_create_xprt(nfsd_serv,
944 transport, port, 946 transport, PF_INET, port,
945 SVC_SOCK_ANONYMOUS); 947 SVC_SOCK_ANONYMOUS);
946 if (err == -ENOENT) 948 if (err == -ENOENT)
947 /* Give a reasonable perror msg for 949 /* Give a reasonable perror msg for
@@ -960,7 +962,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
960 char transport[16]; 962 char transport[16];
961 int port; 963 int port;
962 if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) { 964 if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
963 if (port == 0) 965 if (port < 1 || port > 65535)
964 return -EINVAL; 966 return -EINVAL;
965 if (nfsd_serv) { 967 if (nfsd_serv) {
966 xprt = svc_find_xprt(nfsd_serv, transport, 968 xprt = svc_find_xprt(nfsd_serv, transport,
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 07e4f5d7baa..bc3567bab8c 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -229,7 +229,6 @@ int nfsd_create_serv(void)
229 229
230 atomic_set(&nfsd_busy, 0); 230 atomic_set(&nfsd_busy, 0);
231 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, 231 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
232 AF_INET,
233 nfsd_last_thread, nfsd, THIS_MODULE); 232 nfsd_last_thread, nfsd, THIS_MODULE);
234 if (nfsd_serv == NULL) 233 if (nfsd_serv == NULL)
235 err = -ENOMEM; 234 err = -ENOMEM;
@@ -244,7 +243,7 @@ static int nfsd_init_socks(int port)
244 if (!list_empty(&nfsd_serv->sv_permsocks)) 243 if (!list_empty(&nfsd_serv->sv_permsocks))
245 return 0; 244 return 0;
246 245
247 error = svc_create_xprt(nfsd_serv, "udp", port, 246 error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
248 SVC_SOCK_DEFAULTS); 247 SVC_SOCK_DEFAULTS);
249 if (error < 0) 248 if (error < 0)
250 return error; 249 return error;
@@ -253,7 +252,7 @@ static int nfsd_init_socks(int port)
253 if (error < 0) 252 if (error < 0)
254 return error; 253 return error;
255 254
256 error = svc_create_xprt(nfsd_serv, "tcp", port, 255 error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
257 SVC_SOCK_DEFAULTS); 256 SVC_SOCK_DEFAULTS);
258 if (error < 0) 257 if (error < 0)
259 return error; 258 return error;
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 34314b33dbd..5a9e34475e3 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -32,8 +32,8 @@
32/** 32/**
33 * The little endian Unicode string $I30 as a global constant. 33 * The little endian Unicode string $I30 as a global constant.
34 */ 34 */
35ntfschar I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'), 35ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'),
36 const_cpu_to_le16('3'), const_cpu_to_le16('0'), 0 }; 36 cpu_to_le16('3'), cpu_to_le16('0'), 0 };
37 37
38/** 38/**
39 * ntfs_lookup_inode_by_name - find an inode in a directory given its name 39 * ntfs_lookup_inode_by_name - find an inode in a directory given its name
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 86bef156cf0..82c5085559c 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1975,8 +1975,7 @@ int ntfs_read_inode_mount(struct inode *vi)
1975 goto em_put_err_out; 1975 goto em_put_err_out;
1976 next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + 1976 next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
1977 le16_to_cpu(al_entry->length)); 1977 le16_to_cpu(al_entry->length));
1978 if (le32_to_cpu(al_entry->type) > 1978 if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA))
1979 const_le32_to_cpu(AT_DATA))
1980 goto em_put_err_out; 1979 goto em_put_err_out;
1981 if (AT_DATA != al_entry->type) 1980 if (AT_DATA != al_entry->type)
1982 continue; 1981 continue;
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 1e383328ece..50931b1ce4b 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -31,19 +31,8 @@
31 31
32#include "types.h" 32#include "types.h"
33 33
34/*
35 * Constant endianness conversion defines.
36 */
37#define const_le16_to_cpu(x) __constant_le16_to_cpu(x)
38#define const_le32_to_cpu(x) __constant_le32_to_cpu(x)
39#define const_le64_to_cpu(x) __constant_le64_to_cpu(x)
40
41#define const_cpu_to_le16(x) __constant_cpu_to_le16(x)
42#define const_cpu_to_le32(x) __constant_cpu_to_le32(x)
43#define const_cpu_to_le64(x) __constant_cpu_to_le64(x)
44
45/* The NTFS oem_id "NTFS " */ 34/* The NTFS oem_id "NTFS " */
46#define magicNTFS const_cpu_to_le64(0x202020205346544eULL) 35#define magicNTFS cpu_to_le64(0x202020205346544eULL)
47 36
48/* 37/*
49 * Location of bootsector on partition: 38 * Location of bootsector on partition:
@@ -114,25 +103,25 @@ typedef struct {
114 */ 103 */
115enum { 104enum {
116 /* Found in $MFT/$DATA. */ 105 /* Found in $MFT/$DATA. */
117 magic_FILE = const_cpu_to_le32(0x454c4946), /* Mft entry. */ 106 magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */
118 magic_INDX = const_cpu_to_le32(0x58444e49), /* Index buffer. */ 107 magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */
119 magic_HOLE = const_cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */ 108 magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
120 109
121 /* Found in $LogFile/$DATA. */ 110 /* Found in $LogFile/$DATA. */
122 magic_RSTR = const_cpu_to_le32(0x52545352), /* Restart page. */ 111 magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */
123 magic_RCRD = const_cpu_to_le32(0x44524352), /* Log record page. */ 112 magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */
124 113
125 /* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */ 114 /* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */
126 magic_CHKD = const_cpu_to_le32(0x444b4843), /* Modified by chkdsk. */ 115 magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
127 116
128 /* Found in all ntfs record containing records. */ 117 /* Found in all ntfs record containing records. */
129 magic_BAAD = const_cpu_to_le32(0x44414142), /* Failed multi sector 118 magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector
130 transfer was detected. */ 119 transfer was detected. */
131 /* 120 /*
132 * Found in $LogFile/$DATA when a page is full of 0xff bytes and is 121 * Found in $LogFile/$DATA when a page is full of 0xff bytes and is
133 * thus not initialized. Page must be initialized before using it. 122 * thus not initialized. Page must be initialized before using it.
134 */ 123 */
135 magic_empty = const_cpu_to_le32(0xffffffff) /* Record is empty. */ 124 magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */
136}; 125};
137 126
138typedef le32 NTFS_RECORD_TYPE; 127typedef le32 NTFS_RECORD_TYPE;
@@ -258,8 +247,8 @@ typedef enum {
258 * information about the mft record in which they are present. 247 * information about the mft record in which they are present.
259 */ 248 */
260enum { 249enum {
261 MFT_RECORD_IN_USE = const_cpu_to_le16(0x0001), 250 MFT_RECORD_IN_USE = cpu_to_le16(0x0001),
262 MFT_RECORD_IS_DIRECTORY = const_cpu_to_le16(0x0002), 251 MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002),
263} __attribute__ ((__packed__)); 252} __attribute__ ((__packed__));
264 253
265typedef le16 MFT_RECORD_FLAGS; 254typedef le16 MFT_RECORD_FLAGS;
@@ -309,7 +298,7 @@ typedef le16 MFT_RECORD_FLAGS;
309 * Note: The _LE versions will return a CPU endian formatted value! 298 * Note: The _LE versions will return a CPU endian formatted value!
310 */ 299 */
311#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL 300#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL
312#define MFT_REF_MASK_LE const_cpu_to_le64(MFT_REF_MASK_CPU) 301#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU)
313 302
314typedef u64 MFT_REF; 303typedef u64 MFT_REF;
315typedef le64 leMFT_REF; 304typedef le64 leMFT_REF;
@@ -477,25 +466,25 @@ typedef struct {
477 * a revealing choice of symbol I do not know what is... (-; 466 * a revealing choice of symbol I do not know what is... (-;
478 */ 467 */
479enum { 468enum {
480 AT_UNUSED = const_cpu_to_le32( 0), 469 AT_UNUSED = cpu_to_le32( 0),
481 AT_STANDARD_INFORMATION = const_cpu_to_le32( 0x10), 470 AT_STANDARD_INFORMATION = cpu_to_le32( 0x10),
482 AT_ATTRIBUTE_LIST = const_cpu_to_le32( 0x20), 471 AT_ATTRIBUTE_LIST = cpu_to_le32( 0x20),
483 AT_FILE_NAME = const_cpu_to_le32( 0x30), 472 AT_FILE_NAME = cpu_to_le32( 0x30),
484 AT_OBJECT_ID = const_cpu_to_le32( 0x40), 473 AT_OBJECT_ID = cpu_to_le32( 0x40),
485 AT_SECURITY_DESCRIPTOR = const_cpu_to_le32( 0x50), 474 AT_SECURITY_DESCRIPTOR = cpu_to_le32( 0x50),
486 AT_VOLUME_NAME = const_cpu_to_le32( 0x60), 475 AT_VOLUME_NAME = cpu_to_le32( 0x60),
487 AT_VOLUME_INFORMATION = const_cpu_to_le32( 0x70), 476 AT_VOLUME_INFORMATION = cpu_to_le32( 0x70),
488 AT_DATA = const_cpu_to_le32( 0x80), 477 AT_DATA = cpu_to_le32( 0x80),
489 AT_INDEX_ROOT = const_cpu_to_le32( 0x90), 478 AT_INDEX_ROOT = cpu_to_le32( 0x90),
490 AT_INDEX_ALLOCATION = const_cpu_to_le32( 0xa0), 479 AT_INDEX_ALLOCATION = cpu_to_le32( 0xa0),
491 AT_BITMAP = const_cpu_to_le32( 0xb0), 480 AT_BITMAP = cpu_to_le32( 0xb0),
492 AT_REPARSE_POINT = const_cpu_to_le32( 0xc0), 481 AT_REPARSE_POINT = cpu_to_le32( 0xc0),
493 AT_EA_INFORMATION = const_cpu_to_le32( 0xd0), 482 AT_EA_INFORMATION = cpu_to_le32( 0xd0),
494 AT_EA = const_cpu_to_le32( 0xe0), 483 AT_EA = cpu_to_le32( 0xe0),
495 AT_PROPERTY_SET = const_cpu_to_le32( 0xf0), 484 AT_PROPERTY_SET = cpu_to_le32( 0xf0),
496 AT_LOGGED_UTILITY_STREAM = const_cpu_to_le32( 0x100), 485 AT_LOGGED_UTILITY_STREAM = cpu_to_le32( 0x100),
497 AT_FIRST_USER_DEFINED_ATTRIBUTE = const_cpu_to_le32( 0x1000), 486 AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32( 0x1000),
498 AT_END = const_cpu_to_le32(0xffffffff) 487 AT_END = cpu_to_le32(0xffffffff)
499}; 488};
500 489
501typedef le32 ATTR_TYPE; 490typedef le32 ATTR_TYPE;
@@ -539,13 +528,13 @@ typedef le32 ATTR_TYPE;
539 * equal then the second le32 values would be compared, etc. 528 * equal then the second le32 values would be compared, etc.
540 */ 529 */
541enum { 530enum {
542 COLLATION_BINARY = const_cpu_to_le32(0x00), 531 COLLATION_BINARY = cpu_to_le32(0x00),
543 COLLATION_FILE_NAME = const_cpu_to_le32(0x01), 532 COLLATION_FILE_NAME = cpu_to_le32(0x01),
544 COLLATION_UNICODE_STRING = const_cpu_to_le32(0x02), 533 COLLATION_UNICODE_STRING = cpu_to_le32(0x02),
545 COLLATION_NTOFS_ULONG = const_cpu_to_le32(0x10), 534 COLLATION_NTOFS_ULONG = cpu_to_le32(0x10),
546 COLLATION_NTOFS_SID = const_cpu_to_le32(0x11), 535 COLLATION_NTOFS_SID = cpu_to_le32(0x11),
547 COLLATION_NTOFS_SECURITY_HASH = const_cpu_to_le32(0x12), 536 COLLATION_NTOFS_SECURITY_HASH = cpu_to_le32(0x12),
548 COLLATION_NTOFS_ULONGS = const_cpu_to_le32(0x13), 537 COLLATION_NTOFS_ULONGS = cpu_to_le32(0x13),
549}; 538};
550 539
551typedef le32 COLLATION_RULE; 540typedef le32 COLLATION_RULE;
@@ -559,25 +548,25 @@ typedef le32 COLLATION_RULE;
559 * NT4. 548 * NT4.
560 */ 549 */
561enum { 550enum {
562 ATTR_DEF_INDEXABLE = const_cpu_to_le32(0x02), /* Attribute can be 551 ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be
563 indexed. */ 552 indexed. */
564 ATTR_DEF_MULTIPLE = const_cpu_to_le32(0x04), /* Attribute type 553 ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* Attribute type
565 can be present multiple times in the 554 can be present multiple times in the
566 mft records of an inode. */ 555 mft records of an inode. */
567 ATTR_DEF_NOT_ZERO = const_cpu_to_le32(0x08), /* Attribute value 556 ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* Attribute value
568 must contain at least one non-zero 557 must contain at least one non-zero
569 byte. */ 558 byte. */
570 ATTR_DEF_INDEXED_UNIQUE = const_cpu_to_le32(0x10), /* Attribute must be 559 ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be
571 indexed and the attribute value must be 560 indexed and the attribute value must be
572 unique for the attribute type in all of 561 unique for the attribute type in all of
573 the mft records of an inode. */ 562 the mft records of an inode. */
574 ATTR_DEF_NAMED_UNIQUE = const_cpu_to_le32(0x20), /* Attribute must be 563 ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* Attribute must be
575 named and the name must be unique for 564 named and the name must be unique for
576 the attribute type in all of the mft 565 the attribute type in all of the mft
577 records of an inode. */ 566 records of an inode. */
578 ATTR_DEF_RESIDENT = const_cpu_to_le32(0x40), /* Attribute must be 567 ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be
579 resident. */ 568 resident. */
580 ATTR_DEF_ALWAYS_LOG = const_cpu_to_le32(0x80), /* Always log 569 ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* Always log
581 modifications to this attribute, 570 modifications to this attribute,
582 regardless of whether it is resident or 571 regardless of whether it is resident or
583 non-resident. Without this, only log 572 non-resident. Without this, only log
@@ -614,12 +603,12 @@ typedef struct {
614 * Attribute flags (16-bit). 603 * Attribute flags (16-bit).
615 */ 604 */
616enum { 605enum {
617 ATTR_IS_COMPRESSED = const_cpu_to_le16(0x0001), 606 ATTR_IS_COMPRESSED = cpu_to_le16(0x0001),
618 ATTR_COMPRESSION_MASK = const_cpu_to_le16(0x00ff), /* Compression method 607 ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method
619 mask. Also, first 608 mask. Also, first
620 illegal value. */ 609 illegal value. */
621 ATTR_IS_ENCRYPTED = const_cpu_to_le16(0x4000), 610 ATTR_IS_ENCRYPTED = cpu_to_le16(0x4000),
622 ATTR_IS_SPARSE = const_cpu_to_le16(0x8000), 611 ATTR_IS_SPARSE = cpu_to_le16(0x8000),
623} __attribute__ ((__packed__)); 612} __attribute__ ((__packed__));
624 613
625typedef le16 ATTR_FLAGS; 614typedef le16 ATTR_FLAGS;
@@ -811,32 +800,32 @@ typedef ATTR_RECORD ATTR_REC;
811 * flags appear in all of the above. 800 * flags appear in all of the above.
812 */ 801 */
813enum { 802enum {
814 FILE_ATTR_READONLY = const_cpu_to_le32(0x00000001), 803 FILE_ATTR_READONLY = cpu_to_le32(0x00000001),
815 FILE_ATTR_HIDDEN = const_cpu_to_le32(0x00000002), 804 FILE_ATTR_HIDDEN = cpu_to_le32(0x00000002),
816 FILE_ATTR_SYSTEM = const_cpu_to_le32(0x00000004), 805 FILE_ATTR_SYSTEM = cpu_to_le32(0x00000004),
817 /* Old DOS volid. Unused in NT. = const_cpu_to_le32(0x00000008), */ 806 /* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */
818 807
819 FILE_ATTR_DIRECTORY = const_cpu_to_le32(0x00000010), 808 FILE_ATTR_DIRECTORY = cpu_to_le32(0x00000010),
820 /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is 809 /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is
821 reserved for the DOS SUBDIRECTORY flag. */ 810 reserved for the DOS SUBDIRECTORY flag. */
822 FILE_ATTR_ARCHIVE = const_cpu_to_le32(0x00000020), 811 FILE_ATTR_ARCHIVE = cpu_to_le32(0x00000020),
823 FILE_ATTR_DEVICE = const_cpu_to_le32(0x00000040), 812 FILE_ATTR_DEVICE = cpu_to_le32(0x00000040),
824 FILE_ATTR_NORMAL = const_cpu_to_le32(0x00000080), 813 FILE_ATTR_NORMAL = cpu_to_le32(0x00000080),
825 814
826 FILE_ATTR_TEMPORARY = const_cpu_to_le32(0x00000100), 815 FILE_ATTR_TEMPORARY = cpu_to_le32(0x00000100),
827 FILE_ATTR_SPARSE_FILE = const_cpu_to_le32(0x00000200), 816 FILE_ATTR_SPARSE_FILE = cpu_to_le32(0x00000200),
828 FILE_ATTR_REPARSE_POINT = const_cpu_to_le32(0x00000400), 817 FILE_ATTR_REPARSE_POINT = cpu_to_le32(0x00000400),
829 FILE_ATTR_COMPRESSED = const_cpu_to_le32(0x00000800), 818 FILE_ATTR_COMPRESSED = cpu_to_le32(0x00000800),
830 819
831 FILE_ATTR_OFFLINE = const_cpu_to_le32(0x00001000), 820 FILE_ATTR_OFFLINE = cpu_to_le32(0x00001000),
832 FILE_ATTR_NOT_CONTENT_INDEXED = const_cpu_to_le32(0x00002000), 821 FILE_ATTR_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000),
833 FILE_ATTR_ENCRYPTED = const_cpu_to_le32(0x00004000), 822 FILE_ATTR_ENCRYPTED = cpu_to_le32(0x00004000),
834 823
835 FILE_ATTR_VALID_FLAGS = const_cpu_to_le32(0x00007fb7), 824 FILE_ATTR_VALID_FLAGS = cpu_to_le32(0x00007fb7),
836 /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the 825 /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the
837 FILE_ATTR_DEVICE and preserves everything else. This mask is used 826 FILE_ATTR_DEVICE and preserves everything else. This mask is used
838 to obtain all flags that are valid for reading. */ 827 to obtain all flags that are valid for reading. */
839 FILE_ATTR_VALID_SET_FLAGS = const_cpu_to_le32(0x000031a7), 828 FILE_ATTR_VALID_SET_FLAGS = cpu_to_le32(0x000031a7),
840 /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the 829 /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
841 F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, 830 F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
842 F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask 831 F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask
@@ -846,11 +835,11 @@ enum {
846 * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION 835 * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION
847 * attribute of an mft record. 836 * attribute of an mft record.
848 */ 837 */
849 FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = const_cpu_to_le32(0x10000000), 838 FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = cpu_to_le32(0x10000000),
850 /* Note, this is a copy of the corresponding bit from the mft record, 839 /* Note, this is a copy of the corresponding bit from the mft record,
851 telling us whether this is a directory or not, i.e. whether it has 840 telling us whether this is a directory or not, i.e. whether it has
852 an index root attribute or not. */ 841 an index root attribute or not. */
853 FILE_ATTR_DUP_VIEW_INDEX_PRESENT = const_cpu_to_le32(0x20000000), 842 FILE_ATTR_DUP_VIEW_INDEX_PRESENT = cpu_to_le32(0x20000000),
854 /* Note, this is a copy of the corresponding bit from the mft record, 843 /* Note, this is a copy of the corresponding bit from the mft record,
855 telling us whether this file has a view index present (eg. object id 844 telling us whether this file has a view index present (eg. object id
856 index, quota index, one of the security indexes or the encrypting 845 index, quota index, one of the security indexes or the encrypting
@@ -1446,42 +1435,42 @@ enum {
1446 /* Specific rights for files and directories are as follows: */ 1435 /* Specific rights for files and directories are as follows: */
1447 1436
1448 /* Right to read data from the file. (FILE) */ 1437 /* Right to read data from the file. (FILE) */
1449 FILE_READ_DATA = const_cpu_to_le32(0x00000001), 1438 FILE_READ_DATA = cpu_to_le32(0x00000001),
1450 /* Right to list contents of a directory. (DIRECTORY) */ 1439 /* Right to list contents of a directory. (DIRECTORY) */
1451 FILE_LIST_DIRECTORY = const_cpu_to_le32(0x00000001), 1440 FILE_LIST_DIRECTORY = cpu_to_le32(0x00000001),
1452 1441
1453 /* Right to write data to the file. (FILE) */ 1442 /* Right to write data to the file. (FILE) */
1454 FILE_WRITE_DATA = const_cpu_to_le32(0x00000002), 1443 FILE_WRITE_DATA = cpu_to_le32(0x00000002),
1455 /* Right to create a file in the directory. (DIRECTORY) */ 1444 /* Right to create a file in the directory. (DIRECTORY) */
1456 FILE_ADD_FILE = const_cpu_to_le32(0x00000002), 1445 FILE_ADD_FILE = cpu_to_le32(0x00000002),
1457 1446
1458 /* Right to append data to the file. (FILE) */ 1447 /* Right to append data to the file. (FILE) */
1459 FILE_APPEND_DATA = const_cpu_to_le32(0x00000004), 1448 FILE_APPEND_DATA = cpu_to_le32(0x00000004),
1460 /* Right to create a subdirectory. (DIRECTORY) */ 1449 /* Right to create a subdirectory. (DIRECTORY) */
1461 FILE_ADD_SUBDIRECTORY = const_cpu_to_le32(0x00000004), 1450 FILE_ADD_SUBDIRECTORY = cpu_to_le32(0x00000004),
1462 1451
1463 /* Right to read extended attributes. (FILE/DIRECTORY) */ 1452 /* Right to read extended attributes. (FILE/DIRECTORY) */
1464 FILE_READ_EA = const_cpu_to_le32(0x00000008), 1453 FILE_READ_EA = cpu_to_le32(0x00000008),
1465 1454
1466 /* Right to write extended attributes. (FILE/DIRECTORY) */ 1455 /* Right to write extended attributes. (FILE/DIRECTORY) */
1467 FILE_WRITE_EA = const_cpu_to_le32(0x00000010), 1456 FILE_WRITE_EA = cpu_to_le32(0x00000010),
1468 1457
1469 /* Right to execute a file. (FILE) */ 1458 /* Right to execute a file. (FILE) */
1470 FILE_EXECUTE = const_cpu_to_le32(0x00000020), 1459 FILE_EXECUTE = cpu_to_le32(0x00000020),
1471 /* Right to traverse the directory. (DIRECTORY) */ 1460 /* Right to traverse the directory. (DIRECTORY) */
1472 FILE_TRAVERSE = const_cpu_to_le32(0x00000020), 1461 FILE_TRAVERSE = cpu_to_le32(0x00000020),
1473 1462
1474 /* 1463 /*
1475 * Right to delete a directory and all the files it contains (its 1464 * Right to delete a directory and all the files it contains (its
1476 * children), even if the files are read-only. (DIRECTORY) 1465 * children), even if the files are read-only. (DIRECTORY)
1477 */ 1466 */
1478 FILE_DELETE_CHILD = const_cpu_to_le32(0x00000040), 1467 FILE_DELETE_CHILD = cpu_to_le32(0x00000040),
1479 1468
1480 /* Right to read file attributes. (FILE/DIRECTORY) */ 1469 /* Right to read file attributes. (FILE/DIRECTORY) */
1481 FILE_READ_ATTRIBUTES = const_cpu_to_le32(0x00000080), 1470 FILE_READ_ATTRIBUTES = cpu_to_le32(0x00000080),
1482 1471
1483 /* Right to change file attributes. (FILE/DIRECTORY) */ 1472 /* Right to change file attributes. (FILE/DIRECTORY) */
1484 FILE_WRITE_ATTRIBUTES = const_cpu_to_le32(0x00000100), 1473 FILE_WRITE_ATTRIBUTES = cpu_to_le32(0x00000100),
1485 1474
1486 /* 1475 /*
1487 * The standard rights (bits 16 to 23). These are independent of the 1476 * The standard rights (bits 16 to 23). These are independent of the
@@ -1489,27 +1478,27 @@ enum {
1489 */ 1478 */
1490 1479
1491 /* Right to delete the object. */ 1480 /* Right to delete the object. */
1492 DELETE = const_cpu_to_le32(0x00010000), 1481 DELETE = cpu_to_le32(0x00010000),
1493 1482
1494 /* 1483 /*
1495 * Right to read the information in the object's security descriptor, 1484 * Right to read the information in the object's security descriptor,
1496 * not including the information in the SACL, i.e. right to read the 1485 * not including the information in the SACL, i.e. right to read the
1497 * security descriptor and owner. 1486 * security descriptor and owner.
1498 */ 1487 */
1499 READ_CONTROL = const_cpu_to_le32(0x00020000), 1488 READ_CONTROL = cpu_to_le32(0x00020000),
1500 1489
1501 /* Right to modify the DACL in the object's security descriptor. */ 1490 /* Right to modify the DACL in the object's security descriptor. */
1502 WRITE_DAC = const_cpu_to_le32(0x00040000), 1491 WRITE_DAC = cpu_to_le32(0x00040000),
1503 1492
1504 /* Right to change the owner in the object's security descriptor. */ 1493 /* Right to change the owner in the object's security descriptor. */
1505 WRITE_OWNER = const_cpu_to_le32(0x00080000), 1494 WRITE_OWNER = cpu_to_le32(0x00080000),
1506 1495
1507 /* 1496 /*
1508 * Right to use the object for synchronization. Enables a process to 1497 * Right to use the object for synchronization. Enables a process to
1509 * wait until the object is in the signalled state. Some object types 1498 * wait until the object is in the signalled state. Some object types
1510 * do not support this access right. 1499 * do not support this access right.
1511 */ 1500 */
1512 SYNCHRONIZE = const_cpu_to_le32(0x00100000), 1501 SYNCHRONIZE = cpu_to_le32(0x00100000),
1513 1502
1514 /* 1503 /*
1515 * The following STANDARD_RIGHTS_* are combinations of the above for 1504 * The following STANDARD_RIGHTS_* are combinations of the above for
@@ -1517,25 +1506,25 @@ enum {
1517 */ 1506 */
1518 1507
1519 /* These are currently defined to READ_CONTROL. */ 1508 /* These are currently defined to READ_CONTROL. */
1520 STANDARD_RIGHTS_READ = const_cpu_to_le32(0x00020000), 1509 STANDARD_RIGHTS_READ = cpu_to_le32(0x00020000),
1521 STANDARD_RIGHTS_WRITE = const_cpu_to_le32(0x00020000), 1510 STANDARD_RIGHTS_WRITE = cpu_to_le32(0x00020000),
1522 STANDARD_RIGHTS_EXECUTE = const_cpu_to_le32(0x00020000), 1511 STANDARD_RIGHTS_EXECUTE = cpu_to_le32(0x00020000),
1523 1512
1524 /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */ 1513 /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */
1525 STANDARD_RIGHTS_REQUIRED = const_cpu_to_le32(0x000f0000), 1514 STANDARD_RIGHTS_REQUIRED = cpu_to_le32(0x000f0000),
1526 1515
1527 /* 1516 /*
1528 * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and 1517 * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and
1529 * SYNCHRONIZE access. 1518 * SYNCHRONIZE access.
1530 */ 1519 */
1531 STANDARD_RIGHTS_ALL = const_cpu_to_le32(0x001f0000), 1520 STANDARD_RIGHTS_ALL = cpu_to_le32(0x001f0000),
1532 1521
1533 /* 1522 /*
1534 * The access system ACL and maximum allowed access types (bits 24 to 1523 * The access system ACL and maximum allowed access types (bits 24 to
1535 * 25, bits 26 to 27 are reserved). 1524 * 25, bits 26 to 27 are reserved).
1536 */ 1525 */
1537 ACCESS_SYSTEM_SECURITY = const_cpu_to_le32(0x01000000), 1526 ACCESS_SYSTEM_SECURITY = cpu_to_le32(0x01000000),
1538 MAXIMUM_ALLOWED = const_cpu_to_le32(0x02000000), 1527 MAXIMUM_ALLOWED = cpu_to_le32(0x02000000),
1539 1528
1540 /* 1529 /*
1541 * The generic rights (bits 28 to 31). These map onto the standard and 1530 * The generic rights (bits 28 to 31). These map onto the standard and
@@ -1543,10 +1532,10 @@ enum {
1543 */ 1532 */
1544 1533
1545 /* Read, write, and execute access. */ 1534 /* Read, write, and execute access. */
1546 GENERIC_ALL = const_cpu_to_le32(0x10000000), 1535 GENERIC_ALL = cpu_to_le32(0x10000000),
1547 1536
1548 /* Execute access. */ 1537 /* Execute access. */
1549 GENERIC_EXECUTE = const_cpu_to_le32(0x20000000), 1538 GENERIC_EXECUTE = cpu_to_le32(0x20000000),
1550 1539
1551 /* 1540 /*
1552 * Write access. For files, this maps onto: 1541 * Write access. For files, this maps onto:
@@ -1555,7 +1544,7 @@ enum {
1555 * For directories, the mapping has the same numerical value. See 1544 * For directories, the mapping has the same numerical value. See
1556 * above for the descriptions of the rights granted. 1545 * above for the descriptions of the rights granted.
1557 */ 1546 */
1558 GENERIC_WRITE = const_cpu_to_le32(0x40000000), 1547 GENERIC_WRITE = cpu_to_le32(0x40000000),
1559 1548
1560 /* 1549 /*
1561 * Read access. For files, this maps onto: 1550 * Read access. For files, this maps onto:
@@ -1564,7 +1553,7 @@ enum {
1564 * For directories, the mapping has the same numberical value. See 1553 * For directories, the mapping has the same numberical value. See
1565 * above for the descriptions of the rights granted. 1554 * above for the descriptions of the rights granted.
1566 */ 1555 */
1567 GENERIC_READ = const_cpu_to_le32(0x80000000), 1556 GENERIC_READ = cpu_to_le32(0x80000000),
1568}; 1557};
1569 1558
1570typedef le32 ACCESS_MASK; 1559typedef le32 ACCESS_MASK;
@@ -1604,8 +1593,8 @@ typedef struct {
1604 * The object ACE flags (32-bit). 1593 * The object ACE flags (32-bit).
1605 */ 1594 */
1606enum { 1595enum {
1607 ACE_OBJECT_TYPE_PRESENT = const_cpu_to_le32(1), 1596 ACE_OBJECT_TYPE_PRESENT = cpu_to_le32(1),
1608 ACE_INHERITED_OBJECT_TYPE_PRESENT = const_cpu_to_le32(2), 1597 ACE_INHERITED_OBJECT_TYPE_PRESENT = cpu_to_le32(2),
1609}; 1598};
1610 1599
1611typedef le32 OBJECT_ACE_FLAGS; 1600typedef le32 OBJECT_ACE_FLAGS;
@@ -1706,23 +1695,23 @@ typedef enum {
1706 * expressed as offsets from the beginning of the security descriptor. 1695 * expressed as offsets from the beginning of the security descriptor.
1707 */ 1696 */
1708enum { 1697enum {
1709 SE_OWNER_DEFAULTED = const_cpu_to_le16(0x0001), 1698 SE_OWNER_DEFAULTED = cpu_to_le16(0x0001),
1710 SE_GROUP_DEFAULTED = const_cpu_to_le16(0x0002), 1699 SE_GROUP_DEFAULTED = cpu_to_le16(0x0002),
1711 SE_DACL_PRESENT = const_cpu_to_le16(0x0004), 1700 SE_DACL_PRESENT = cpu_to_le16(0x0004),
1712 SE_DACL_DEFAULTED = const_cpu_to_le16(0x0008), 1701 SE_DACL_DEFAULTED = cpu_to_le16(0x0008),
1713 1702
1714 SE_SACL_PRESENT = const_cpu_to_le16(0x0010), 1703 SE_SACL_PRESENT = cpu_to_le16(0x0010),
1715 SE_SACL_DEFAULTED = const_cpu_to_le16(0x0020), 1704 SE_SACL_DEFAULTED = cpu_to_le16(0x0020),
1716 1705
1717 SE_DACL_AUTO_INHERIT_REQ = const_cpu_to_le16(0x0100), 1706 SE_DACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0100),
1718 SE_SACL_AUTO_INHERIT_REQ = const_cpu_to_le16(0x0200), 1707 SE_SACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0200),
1719 SE_DACL_AUTO_INHERITED = const_cpu_to_le16(0x0400), 1708 SE_DACL_AUTO_INHERITED = cpu_to_le16(0x0400),
1720 SE_SACL_AUTO_INHERITED = const_cpu_to_le16(0x0800), 1709 SE_SACL_AUTO_INHERITED = cpu_to_le16(0x0800),
1721 1710
1722 SE_DACL_PROTECTED = const_cpu_to_le16(0x1000), 1711 SE_DACL_PROTECTED = cpu_to_le16(0x1000),
1723 SE_SACL_PROTECTED = const_cpu_to_le16(0x2000), 1712 SE_SACL_PROTECTED = cpu_to_le16(0x2000),
1724 SE_RM_CONTROL_VALID = const_cpu_to_le16(0x4000), 1713 SE_RM_CONTROL_VALID = cpu_to_le16(0x4000),
1725 SE_SELF_RELATIVE = const_cpu_to_le16(0x8000) 1714 SE_SELF_RELATIVE = cpu_to_le16(0x8000)
1726} __attribute__ ((__packed__)); 1715} __attribute__ ((__packed__));
1727 1716
1728typedef le16 SECURITY_DESCRIPTOR_CONTROL; 1717typedef le16 SECURITY_DESCRIPTOR_CONTROL;
@@ -1910,21 +1899,21 @@ typedef struct {
1910 * Possible flags for the volume (16-bit). 1899 * Possible flags for the volume (16-bit).
1911 */ 1900 */
1912enum { 1901enum {
1913 VOLUME_IS_DIRTY = const_cpu_to_le16(0x0001), 1902 VOLUME_IS_DIRTY = cpu_to_le16(0x0001),
1914 VOLUME_RESIZE_LOG_FILE = const_cpu_to_le16(0x0002), 1903 VOLUME_RESIZE_LOG_FILE = cpu_to_le16(0x0002),
1915 VOLUME_UPGRADE_ON_MOUNT = const_cpu_to_le16(0x0004), 1904 VOLUME_UPGRADE_ON_MOUNT = cpu_to_le16(0x0004),
1916 VOLUME_MOUNTED_ON_NT4 = const_cpu_to_le16(0x0008), 1905 VOLUME_MOUNTED_ON_NT4 = cpu_to_le16(0x0008),
1917 1906
1918 VOLUME_DELETE_USN_UNDERWAY = const_cpu_to_le16(0x0010), 1907 VOLUME_DELETE_USN_UNDERWAY = cpu_to_le16(0x0010),
1919 VOLUME_REPAIR_OBJECT_ID = const_cpu_to_le16(0x0020), 1908 VOLUME_REPAIR_OBJECT_ID = cpu_to_le16(0x0020),
1920 1909
1921 VOLUME_CHKDSK_UNDERWAY = const_cpu_to_le16(0x4000), 1910 VOLUME_CHKDSK_UNDERWAY = cpu_to_le16(0x4000),
1922 VOLUME_MODIFIED_BY_CHKDSK = const_cpu_to_le16(0x8000), 1911 VOLUME_MODIFIED_BY_CHKDSK = cpu_to_le16(0x8000),
1923 1912
1924 VOLUME_FLAGS_MASK = const_cpu_to_le16(0xc03f), 1913 VOLUME_FLAGS_MASK = cpu_to_le16(0xc03f),
1925 1914
1926 /* To make our life easier when checking if we must mount read-only. */ 1915 /* To make our life easier when checking if we must mount read-only. */
1927 VOLUME_MUST_MOUNT_RO_MASK = const_cpu_to_le16(0xc027), 1916 VOLUME_MUST_MOUNT_RO_MASK = cpu_to_le16(0xc027),
1928} __attribute__ ((__packed__)); 1917} __attribute__ ((__packed__));
1929 1918
1930typedef le16 VOLUME_FLAGS; 1919typedef le16 VOLUME_FLAGS;
@@ -2109,26 +2098,26 @@ typedef struct {
2109 * The user quota flags. Names explain meaning. 2098 * The user quota flags. Names explain meaning.
2110 */ 2099 */
2111enum { 2100enum {
2112 QUOTA_FLAG_DEFAULT_LIMITS = const_cpu_to_le32(0x00000001), 2101 QUOTA_FLAG_DEFAULT_LIMITS = cpu_to_le32(0x00000001),
2113 QUOTA_FLAG_LIMIT_REACHED = const_cpu_to_le32(0x00000002), 2102 QUOTA_FLAG_LIMIT_REACHED = cpu_to_le32(0x00000002),
2114 QUOTA_FLAG_ID_DELETED = const_cpu_to_le32(0x00000004), 2103 QUOTA_FLAG_ID_DELETED = cpu_to_le32(0x00000004),
2115 2104
2116 QUOTA_FLAG_USER_MASK = const_cpu_to_le32(0x00000007), 2105 QUOTA_FLAG_USER_MASK = cpu_to_le32(0x00000007),
2117 /* This is a bit mask for the user quota flags. */ 2106 /* This is a bit mask for the user quota flags. */
2118 2107
2119 /* 2108 /*
2120 * These flags are only present in the quota defaults index entry, i.e. 2109 * These flags are only present in the quota defaults index entry, i.e.
2121 * in the entry where owner_id = QUOTA_DEFAULTS_ID. 2110 * in the entry where owner_id = QUOTA_DEFAULTS_ID.
2122 */ 2111 */
2123 QUOTA_FLAG_TRACKING_ENABLED = const_cpu_to_le32(0x00000010), 2112 QUOTA_FLAG_TRACKING_ENABLED = cpu_to_le32(0x00000010),
2124 QUOTA_FLAG_ENFORCEMENT_ENABLED = const_cpu_to_le32(0x00000020), 2113 QUOTA_FLAG_ENFORCEMENT_ENABLED = cpu_to_le32(0x00000020),
2125 QUOTA_FLAG_TRACKING_REQUESTED = const_cpu_to_le32(0x00000040), 2114 QUOTA_FLAG_TRACKING_REQUESTED = cpu_to_le32(0x00000040),
2126 QUOTA_FLAG_LOG_THRESHOLD = const_cpu_to_le32(0x00000080), 2115 QUOTA_FLAG_LOG_THRESHOLD = cpu_to_le32(0x00000080),
2127 2116
2128 QUOTA_FLAG_LOG_LIMIT = const_cpu_to_le32(0x00000100), 2117 QUOTA_FLAG_LOG_LIMIT = cpu_to_le32(0x00000100),
2129 QUOTA_FLAG_OUT_OF_DATE = const_cpu_to_le32(0x00000200), 2118 QUOTA_FLAG_OUT_OF_DATE = cpu_to_le32(0x00000200),
2130 QUOTA_FLAG_CORRUPT = const_cpu_to_le32(0x00000400), 2119 QUOTA_FLAG_CORRUPT = cpu_to_le32(0x00000400),
2131 QUOTA_FLAG_PENDING_DELETES = const_cpu_to_le32(0x00000800), 2120 QUOTA_FLAG_PENDING_DELETES = cpu_to_le32(0x00000800),
2132}; 2121};
2133 2122
2134typedef le32 QUOTA_FLAGS; 2123typedef le32 QUOTA_FLAGS;
@@ -2172,9 +2161,9 @@ typedef struct {
2172 * Predefined owner_id values (32-bit). 2161 * Predefined owner_id values (32-bit).
2173 */ 2162 */
2174enum { 2163enum {
2175 QUOTA_INVALID_ID = const_cpu_to_le32(0x00000000), 2164 QUOTA_INVALID_ID = cpu_to_le32(0x00000000),
2176 QUOTA_DEFAULTS_ID = const_cpu_to_le32(0x00000001), 2165 QUOTA_DEFAULTS_ID = cpu_to_le32(0x00000001),
2177 QUOTA_FIRST_USER_ID = const_cpu_to_le32(0x00000100), 2166 QUOTA_FIRST_USER_ID = cpu_to_le32(0x00000100),
2178}; 2167};
2179 2168
2180/* 2169/*
@@ -2189,14 +2178,14 @@ typedef enum {
2189 * Index entry flags (16-bit). 2178 * Index entry flags (16-bit).
2190 */ 2179 */
2191enum { 2180enum {
2192 INDEX_ENTRY_NODE = const_cpu_to_le16(1), /* This entry contains a 2181 INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a
2193 sub-node, i.e. a reference to an index block in form of 2182 sub-node, i.e. a reference to an index block in form of
2194 a virtual cluster number (see below). */ 2183 a virtual cluster number (see below). */
2195 INDEX_ENTRY_END = const_cpu_to_le16(2), /* This signifies the last 2184 INDEX_ENTRY_END = cpu_to_le16(2), /* This signifies the last
2196 entry in an index block. The index entry does not 2185 entry in an index block. The index entry does not
2197 represent a file but it can point to a sub-node. */ 2186 represent a file but it can point to a sub-node. */
2198 2187
2199 INDEX_ENTRY_SPACE_FILLER = const_cpu_to_le16(0xffff), /* gcc: Force 2188 INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force
2200 enum bit width to 16-bit. */ 2189 enum bit width to 16-bit. */
2201} __attribute__ ((__packed__)); 2190} __attribute__ ((__packed__));
2202 2191
@@ -2334,26 +2323,26 @@ typedef struct {
2334 * These are the predefined reparse point tags: 2323 * These are the predefined reparse point tags:
2335 */ 2324 */
2336enum { 2325enum {
2337 IO_REPARSE_TAG_IS_ALIAS = const_cpu_to_le32(0x20000000), 2326 IO_REPARSE_TAG_IS_ALIAS = cpu_to_le32(0x20000000),
2338 IO_REPARSE_TAG_IS_HIGH_LATENCY = const_cpu_to_le32(0x40000000), 2327 IO_REPARSE_TAG_IS_HIGH_LATENCY = cpu_to_le32(0x40000000),
2339 IO_REPARSE_TAG_IS_MICROSOFT = const_cpu_to_le32(0x80000000), 2328 IO_REPARSE_TAG_IS_MICROSOFT = cpu_to_le32(0x80000000),
2340 2329
2341 IO_REPARSE_TAG_RESERVED_ZERO = const_cpu_to_le32(0x00000000), 2330 IO_REPARSE_TAG_RESERVED_ZERO = cpu_to_le32(0x00000000),
2342 IO_REPARSE_TAG_RESERVED_ONE = const_cpu_to_le32(0x00000001), 2331 IO_REPARSE_TAG_RESERVED_ONE = cpu_to_le32(0x00000001),
2343 IO_REPARSE_TAG_RESERVED_RANGE = const_cpu_to_le32(0x00000001), 2332 IO_REPARSE_TAG_RESERVED_RANGE = cpu_to_le32(0x00000001),
2344 2333
2345 IO_REPARSE_TAG_NSS = const_cpu_to_le32(0x68000005), 2334 IO_REPARSE_TAG_NSS = cpu_to_le32(0x68000005),
2346 IO_REPARSE_TAG_NSS_RECOVER = const_cpu_to_le32(0x68000006), 2335 IO_REPARSE_TAG_NSS_RECOVER = cpu_to_le32(0x68000006),
2347 IO_REPARSE_TAG_SIS = const_cpu_to_le32(0x68000007), 2336 IO_REPARSE_TAG_SIS = cpu_to_le32(0x68000007),
2348 IO_REPARSE_TAG_DFS = const_cpu_to_le32(0x68000008), 2337 IO_REPARSE_TAG_DFS = cpu_to_le32(0x68000008),
2349 2338
2350 IO_REPARSE_TAG_MOUNT_POINT = const_cpu_to_le32(0x88000003), 2339 IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0x88000003),
2351 2340
2352 IO_REPARSE_TAG_HSM = const_cpu_to_le32(0xa8000004), 2341 IO_REPARSE_TAG_HSM = cpu_to_le32(0xa8000004),
2353 2342
2354 IO_REPARSE_TAG_SYMBOLIC_LINK = const_cpu_to_le32(0xe8000000), 2343 IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0xe8000000),
2355 2344
2356 IO_REPARSE_TAG_VALID_VALUES = const_cpu_to_le32(0xe000ffff), 2345 IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xe000ffff),
2357}; 2346};
2358 2347
2359/* 2348/*
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
index 9468e1c45ae..b5a6f08bd35 100644
--- a/fs/ntfs/logfile.h
+++ b/fs/ntfs/logfile.h
@@ -104,7 +104,7 @@ typedef struct {
104 * in this particular client array. Also inside the client records themselves, 104 * in this particular client array. Also inside the client records themselves,
105 * this means that there are no client records preceding or following this one. 105 * this means that there are no client records preceding or following this one.
106 */ 106 */
107#define LOGFILE_NO_CLIENT const_cpu_to_le16(0xffff) 107#define LOGFILE_NO_CLIENT cpu_to_le16(0xffff)
108#define LOGFILE_NO_CLIENT_CPU 0xffff 108#define LOGFILE_NO_CLIENT_CPU 0xffff
109 109
110/* 110/*
@@ -112,8 +112,8 @@ typedef struct {
112 * information about the log file in which they are present. 112 * information about the log file in which they are present.
113 */ 113 */
114enum { 114enum {
115 RESTART_VOLUME_IS_CLEAN = const_cpu_to_le16(0x0002), 115 RESTART_VOLUME_IS_CLEAN = cpu_to_le16(0x0002),
116 RESTART_SPACE_FILLER = const_cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */ 116 RESTART_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
117} __attribute__ ((__packed__)); 117} __attribute__ ((__packed__));
118 118
119typedef le16 RESTART_AREA_FLAGS; 119typedef le16 RESTART_AREA_FLAGS;
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 17d32ca6bc3..23bf68453d7 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -2839,7 +2839,7 @@ int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
2839 */ 2839 */
2840 2840
2841 /* Mark the mft record as not in use. */ 2841 /* Mark the mft record as not in use. */
2842 m->flags &= const_cpu_to_le16(~const_le16_to_cpu(MFT_RECORD_IN_USE)); 2842 m->flags &= ~MFT_RECORD_IN_USE;
2843 2843
2844 /* Increment the sequence number, skipping zero, if it is not zero. */ 2844 /* Increment the sequence number, skipping zero, if it is not zero. */
2845 old_seq_no = m->sequence_number; 2845 old_seq_no = m->sequence_number;
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 4a46743b507..f76951dcd4a 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -618,7 +618,7 @@ static bool is_boot_sector_ntfs(const struct super_block *sb,
618 * many BIOSes will refuse to boot from a bootsector if the magic is 618 * many BIOSes will refuse to boot from a bootsector if the magic is
619 * incorrect, so we emit a warning. 619 * incorrect, so we emit a warning.
620 */ 620 */
621 if (!silent && b->end_of_sector_marker != const_cpu_to_le16(0xaa55)) 621 if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55))
622 ntfs_warning(sb, "Invalid end of sector marker."); 622 ntfs_warning(sb, "Invalid end of sector marker.");
623 return true; 623 return true;
624not_ntfs: 624not_ntfs:
@@ -1242,13 +1242,13 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
1242 u32 *kaddr, *kend; 1242 u32 *kaddr, *kend;
1243 ntfs_name *name = NULL; 1243 ntfs_name *name = NULL;
1244 int ret = 1; 1244 int ret = 1;
1245 static const ntfschar hiberfil[13] = { const_cpu_to_le16('h'), 1245 static const ntfschar hiberfil[13] = { cpu_to_le16('h'),
1246 const_cpu_to_le16('i'), const_cpu_to_le16('b'), 1246 cpu_to_le16('i'), cpu_to_le16('b'),
1247 const_cpu_to_le16('e'), const_cpu_to_le16('r'), 1247 cpu_to_le16('e'), cpu_to_le16('r'),
1248 const_cpu_to_le16('f'), const_cpu_to_le16('i'), 1248 cpu_to_le16('f'), cpu_to_le16('i'),
1249 const_cpu_to_le16('l'), const_cpu_to_le16('.'), 1249 cpu_to_le16('l'), cpu_to_le16('.'),
1250 const_cpu_to_le16('s'), const_cpu_to_le16('y'), 1250 cpu_to_le16('s'), cpu_to_le16('y'),
1251 const_cpu_to_le16('s'), 0 }; 1251 cpu_to_le16('s'), 0 };
1252 1252
1253 ntfs_debug("Entering."); 1253 ntfs_debug("Entering.");
1254 /* 1254 /*
@@ -1296,7 +1296,7 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
1296 goto iput_out; 1296 goto iput_out;
1297 } 1297 }
1298 kaddr = (u32*)page_address(page); 1298 kaddr = (u32*)page_address(page);
1299 if (*(le32*)kaddr == const_cpu_to_le32(0x72626968)/*'hibr'*/) { 1299 if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) {
1300 ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is " 1300 ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is "
1301 "hibernated on the volume. This is the " 1301 "hibernated on the volume. This is the "
1302 "system volume."); 1302 "system volume.");
@@ -1337,12 +1337,12 @@ static bool load_and_init_quota(ntfs_volume *vol)
1337 MFT_REF mref; 1337 MFT_REF mref;
1338 struct inode *tmp_ino; 1338 struct inode *tmp_ino;
1339 ntfs_name *name = NULL; 1339 ntfs_name *name = NULL;
1340 static const ntfschar Quota[7] = { const_cpu_to_le16('$'), 1340 static const ntfschar Quota[7] = { cpu_to_le16('$'),
1341 const_cpu_to_le16('Q'), const_cpu_to_le16('u'), 1341 cpu_to_le16('Q'), cpu_to_le16('u'),
1342 const_cpu_to_le16('o'), const_cpu_to_le16('t'), 1342 cpu_to_le16('o'), cpu_to_le16('t'),
1343 const_cpu_to_le16('a'), 0 }; 1343 cpu_to_le16('a'), 0 };
1344 static ntfschar Q[3] = { const_cpu_to_le16('$'), 1344 static ntfschar Q[3] = { cpu_to_le16('$'),
1345 const_cpu_to_le16('Q'), 0 }; 1345 cpu_to_le16('Q'), 0 };
1346 1346
1347 ntfs_debug("Entering."); 1347 ntfs_debug("Entering.");
1348 /* 1348 /*
@@ -1416,16 +1416,16 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol)
1416 struct page *page; 1416 struct page *page;
1417 ntfs_name *name = NULL; 1417 ntfs_name *name = NULL;
1418 USN_HEADER *uh; 1418 USN_HEADER *uh;
1419 static const ntfschar UsnJrnl[9] = { const_cpu_to_le16('$'), 1419 static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'),
1420 const_cpu_to_le16('U'), const_cpu_to_le16('s'), 1420 cpu_to_le16('U'), cpu_to_le16('s'),
1421 const_cpu_to_le16('n'), const_cpu_to_le16('J'), 1421 cpu_to_le16('n'), cpu_to_le16('J'),
1422 const_cpu_to_le16('r'), const_cpu_to_le16('n'), 1422 cpu_to_le16('r'), cpu_to_le16('n'),
1423 const_cpu_to_le16('l'), 0 }; 1423 cpu_to_le16('l'), 0 };
1424 static ntfschar Max[5] = { const_cpu_to_le16('$'), 1424 static ntfschar Max[5] = { cpu_to_le16('$'),
1425 const_cpu_to_le16('M'), const_cpu_to_le16('a'), 1425 cpu_to_le16('M'), cpu_to_le16('a'),
1426 const_cpu_to_le16('x'), 0 }; 1426 cpu_to_le16('x'), 0 };
1427 static ntfschar J[3] = { const_cpu_to_le16('$'), 1427 static ntfschar J[3] = { cpu_to_le16('$'),
1428 const_cpu_to_le16('J'), 0 }; 1428 cpu_to_le16('J'), 0 };
1429 1429
1430 ntfs_debug("Entering."); 1430 ntfs_debug("Entering.");
1431 /* 1431 /*
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
index 4087fbdac32..00d8e6bd7c3 100644
--- a/fs/ntfs/usnjrnl.h
+++ b/fs/ntfs/usnjrnl.h
@@ -116,27 +116,27 @@ typedef struct {
116 * documentation: http://www.linux-ntfs.org/ 116 * documentation: http://www.linux-ntfs.org/
117 */ 117 */
118enum { 118enum {
119 USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001), 119 USN_REASON_DATA_OVERWRITE = cpu_to_le32(0x00000001),
120 USN_REASON_DATA_EXTEND = const_cpu_to_le32(0x00000002), 120 USN_REASON_DATA_EXTEND = cpu_to_le32(0x00000002),
121 USN_REASON_DATA_TRUNCATION = const_cpu_to_le32(0x00000004), 121 USN_REASON_DATA_TRUNCATION = cpu_to_le32(0x00000004),
122 USN_REASON_NAMED_DATA_OVERWRITE = const_cpu_to_le32(0x00000010), 122 USN_REASON_NAMED_DATA_OVERWRITE = cpu_to_le32(0x00000010),
123 USN_REASON_NAMED_DATA_EXTEND = const_cpu_to_le32(0x00000020), 123 USN_REASON_NAMED_DATA_EXTEND = cpu_to_le32(0x00000020),
124 USN_REASON_NAMED_DATA_TRUNCATION= const_cpu_to_le32(0x00000040), 124 USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040),
125 USN_REASON_FILE_CREATE = const_cpu_to_le32(0x00000100), 125 USN_REASON_FILE_CREATE = cpu_to_le32(0x00000100),
126 USN_REASON_FILE_DELETE = const_cpu_to_le32(0x00000200), 126 USN_REASON_FILE_DELETE = cpu_to_le32(0x00000200),
127 USN_REASON_EA_CHANGE = const_cpu_to_le32(0x00000400), 127 USN_REASON_EA_CHANGE = cpu_to_le32(0x00000400),
128 USN_REASON_SECURITY_CHANGE = const_cpu_to_le32(0x00000800), 128 USN_REASON_SECURITY_CHANGE = cpu_to_le32(0x00000800),
129 USN_REASON_RENAME_OLD_NAME = const_cpu_to_le32(0x00001000), 129 USN_REASON_RENAME_OLD_NAME = cpu_to_le32(0x00001000),
130 USN_REASON_RENAME_NEW_NAME = const_cpu_to_le32(0x00002000), 130 USN_REASON_RENAME_NEW_NAME = cpu_to_le32(0x00002000),
131 USN_REASON_INDEXABLE_CHANGE = const_cpu_to_le32(0x00004000), 131 USN_REASON_INDEXABLE_CHANGE = cpu_to_le32(0x00004000),
132 USN_REASON_BASIC_INFO_CHANGE = const_cpu_to_le32(0x00008000), 132 USN_REASON_BASIC_INFO_CHANGE = cpu_to_le32(0x00008000),
133 USN_REASON_HARD_LINK_CHANGE = const_cpu_to_le32(0x00010000), 133 USN_REASON_HARD_LINK_CHANGE = cpu_to_le32(0x00010000),
134 USN_REASON_COMPRESSION_CHANGE = const_cpu_to_le32(0x00020000), 134 USN_REASON_COMPRESSION_CHANGE = cpu_to_le32(0x00020000),
135 USN_REASON_ENCRYPTION_CHANGE = const_cpu_to_le32(0x00040000), 135 USN_REASON_ENCRYPTION_CHANGE = cpu_to_le32(0x00040000),
136 USN_REASON_OBJECT_ID_CHANGE = const_cpu_to_le32(0x00080000), 136 USN_REASON_OBJECT_ID_CHANGE = cpu_to_le32(0x00080000),
137 USN_REASON_REPARSE_POINT_CHANGE = const_cpu_to_le32(0x00100000), 137 USN_REASON_REPARSE_POINT_CHANGE = cpu_to_le32(0x00100000),
138 USN_REASON_STREAM_CHANGE = const_cpu_to_le32(0x00200000), 138 USN_REASON_STREAM_CHANGE = cpu_to_le32(0x00200000),
139 USN_REASON_CLOSE = const_cpu_to_le32(0x80000000), 139 USN_REASON_CLOSE = cpu_to_le32(0x80000000),
140}; 140};
141 141
142typedef le32 USN_REASON_FLAGS; 142typedef le32 USN_REASON_FLAGS;
@@ -148,9 +148,9 @@ typedef le32 USN_REASON_FLAGS;
148 * http://www.linux-ntfs.org/ 148 * http://www.linux-ntfs.org/
149 */ 149 */
150enum { 150enum {
151 USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001), 151 USN_SOURCE_DATA_MANAGEMENT = cpu_to_le32(0x00000001),
152 USN_SOURCE_AUXILIARY_DATA = const_cpu_to_le32(0x00000002), 152 USN_SOURCE_AUXILIARY_DATA = cpu_to_le32(0x00000002),
153 USN_SOURCE_REPLICATION_MANAGEMENT = const_cpu_to_le32(0x00000004), 153 USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004),
154}; 154};
155 155
156typedef le32 USN_SOURCE_INFO_FLAGS; 156typedef le32 USN_SOURCE_INFO_FLAGS;
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index eea1d24713e..b606496b72e 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -154,8 +154,9 @@ out:
154 return ret; 154 return ret;
155} 155}
156 156
157static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) 157static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
158{ 158{
159 struct page *page = vmf->page;
159 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 160 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
160 struct buffer_head *di_bh = NULL; 161 struct buffer_head *di_bh = NULL;
161 sigset_t blocked, oldset; 162 sigset_t blocked, oldset;
@@ -196,7 +197,8 @@ out:
196 ret2 = ocfs2_vm_op_unblock_sigs(&oldset); 197 ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
197 if (ret2 < 0) 198 if (ret2 < 0)
198 mlog_errno(ret2); 199 mlog_errno(ret2);
199 200 if (ret)
201 ret = VM_FAULT_SIGBUS;
200 return ret; 202 return ret;
201} 203}
202 204
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 4a9e0f65ae6..83adcc86943 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -144,16 +144,12 @@ void proc_tty_register_driver(struct tty_driver *driver)
144{ 144{
145 struct proc_dir_entry *ent; 145 struct proc_dir_entry *ent;
146 146
147 if (!driver->ops->read_proc || !driver->driver_name || 147 if (!driver->driver_name || driver->proc_entry ||
148 driver->proc_entry) 148 !driver->ops->proc_fops)
149 return; 149 return;
150 150
151 ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver); 151 ent = proc_create_data(driver->driver_name, 0, proc_tty_driver,
152 if (!ent) 152 driver->ops->proc_fops, driver);
153 return;
154 ent->read_proc = driver->ops->read_proc;
155 ent->data = driver;
156
157 driver->proc_entry = ent; 153 driver->proc_entry = ent;
158} 154}
159 155
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 995ef1d6686..ebb2c417912 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -59,7 +59,6 @@ const struct inode_operations ramfs_file_inode_operations = {
59 */ 59 */
60int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) 60int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
61{ 61{
62 struct pagevec lru_pvec;
63 unsigned long npages, xpages, loop, limit; 62 unsigned long npages, xpages, loop, limit;
64 struct page *pages; 63 struct page *pages;
65 unsigned order; 64 unsigned order;
@@ -102,24 +101,20 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
102 memset(data, 0, newsize); 101 memset(data, 0, newsize);
103 102
104 /* attach all the pages to the inode's address space */ 103 /* attach all the pages to the inode's address space */
105 pagevec_init(&lru_pvec, 0);
106 for (loop = 0; loop < npages; loop++) { 104 for (loop = 0; loop < npages; loop++) {
107 struct page *page = pages + loop; 105 struct page *page = pages + loop;
108 106
109 ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL); 107 ret = add_to_page_cache_lru(page, inode->i_mapping, loop,
108 GFP_KERNEL);
110 if (ret < 0) 109 if (ret < 0)
111 goto add_error; 110 goto add_error;
112 111
113 if (!pagevec_add(&lru_pvec, page))
114 __pagevec_lru_add_file(&lru_pvec);
115
116 /* prevent the page from being discarded on memory pressure */ 112 /* prevent the page from being discarded on memory pressure */
117 SetPageDirty(page); 113 SetPageDirty(page);
118 114
119 unlock_page(page); 115 unlock_page(page);
120 } 116 }
121 117
122 pagevec_lru_add_file(&lru_pvec);
123 return 0; 118 return 0;
124 119
125 fsize_exceeded: 120 fsize_exceeded:
@@ -128,10 +123,8 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
128 return -EFBIG; 123 return -EFBIG;
129 124
130 add_error: 125 add_error:
131 pagevec_lru_add_file(&lru_pvec); 126 while (loop < npages)
132 page_cache_release(pages + loop); 127 __free_page(pages + loop++);
133 for (loop++; loop < npages; loop++)
134 __free_page(pages + loop);
135 return ret; 128 return ret;
136} 129}
137 130
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index b7e6ac706b8..a404fb88e45 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -33,12 +33,15 @@
33#include <linux/backing-dev.h> 33#include <linux/backing-dev.h>
34#include <linux/ramfs.h> 34#include <linux/ramfs.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/parser.h>
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
37#include "internal.h" 38#include "internal.h"
38 39
39/* some random number */ 40/* some random number */
40#define RAMFS_MAGIC 0x858458f6 41#define RAMFS_MAGIC 0x858458f6
41 42
43#define RAMFS_DEFAULT_MODE 0755
44
42static const struct super_operations ramfs_ops; 45static const struct super_operations ramfs_ops;
43static const struct inode_operations ramfs_dir_inode_operations; 46static const struct inode_operations ramfs_dir_inode_operations;
44 47
@@ -158,12 +161,75 @@ static const struct inode_operations ramfs_dir_inode_operations = {
158static const struct super_operations ramfs_ops = { 161static const struct super_operations ramfs_ops = {
159 .statfs = simple_statfs, 162 .statfs = simple_statfs,
160 .drop_inode = generic_delete_inode, 163 .drop_inode = generic_delete_inode,
164 .show_options = generic_show_options,
165};
166
167struct ramfs_mount_opts {
168 umode_t mode;
169};
170
171enum {
172 Opt_mode,
173 Opt_err
174};
175
176static const match_table_t tokens = {
177 {Opt_mode, "mode=%o"},
178 {Opt_err, NULL}
179};
180
181struct ramfs_fs_info {
182 struct ramfs_mount_opts mount_opts;
161}; 183};
162 184
185static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
186{
187 substring_t args[MAX_OPT_ARGS];
188 int option;
189 int token;
190 char *p;
191
192 opts->mode = RAMFS_DEFAULT_MODE;
193
194 while ((p = strsep(&data, ",")) != NULL) {
195 if (!*p)
196 continue;
197
198 token = match_token(p, tokens, args);
199 switch (token) {
200 case Opt_mode:
201 if (match_octal(&args[0], &option))
202 return -EINVAL;
203 opts->mode = option & S_IALLUGO;
204 break;
205 default:
206 printk(KERN_ERR "ramfs: bad mount option: %s\n", p);
207 return -EINVAL;
208 }
209 }
210
211 return 0;
212}
213
163static int ramfs_fill_super(struct super_block * sb, void * data, int silent) 214static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
164{ 215{
165 struct inode * inode; 216 struct ramfs_fs_info *fsi;
166 struct dentry * root; 217 struct inode *inode = NULL;
218 struct dentry *root;
219 int err;
220
221 save_mount_options(sb, data);
222
223 fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
224 if (!fsi) {
225 err = -ENOMEM;
226 goto fail;
227 }
228 sb->s_fs_info = fsi;
229
230 err = ramfs_parse_options(data, &fsi->mount_opts);
231 if (err)
232 goto fail;
167 233
168 sb->s_maxbytes = MAX_LFS_FILESIZE; 234 sb->s_maxbytes = MAX_LFS_FILESIZE;
169 sb->s_blocksize = PAGE_CACHE_SIZE; 235 sb->s_blocksize = PAGE_CACHE_SIZE;
@@ -171,17 +237,23 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
171 sb->s_magic = RAMFS_MAGIC; 237 sb->s_magic = RAMFS_MAGIC;
172 sb->s_op = &ramfs_ops; 238 sb->s_op = &ramfs_ops;
173 sb->s_time_gran = 1; 239 sb->s_time_gran = 1;
174 inode = ramfs_get_inode(sb, S_IFDIR | 0755, 0); 240 inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0);
175 if (!inode) 241 if (!inode) {
176 return -ENOMEM; 242 err = -ENOMEM;
243 goto fail;
244 }
177 245
178 root = d_alloc_root(inode); 246 root = d_alloc_root(inode);
179 if (!root) { 247 if (!root) {
180 iput(inode); 248 err = -ENOMEM;
181 return -ENOMEM; 249 goto fail;
182 } 250 }
183 sb->s_root = root; 251 sb->s_root = root;
184 return 0; 252 return 0;
253fail:
254 kfree(fsi);
255 iput(inode);
256 return err;
185} 257}
186 258
187int ramfs_get_sb(struct file_system_type *fs_type, 259int ramfs_get_sb(struct file_system_type *fs_type,
@@ -197,10 +269,16 @@ static int rootfs_get_sb(struct file_system_type *fs_type,
197 mnt); 269 mnt);
198} 270}
199 271
272static void ramfs_kill_sb(struct super_block *sb)
273{
274 kfree(sb->s_fs_info);
275 kill_litter_super(sb);
276}
277
200static struct file_system_type ramfs_fs_type = { 278static struct file_system_type ramfs_fs_type = {
201 .name = "ramfs", 279 .name = "ramfs",
202 .get_sb = ramfs_get_sb, 280 .get_sb = ramfs_get_sb,
203 .kill_sb = kill_litter_super, 281 .kill_sb = ramfs_kill_sb,
204}; 282};
205static struct file_system_type rootfs_fs_type = { 283static struct file_system_type rootfs_fs_type = {
206 .name = "rootfs", 284 .name = "rootfs",
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 07703d3ff4a..93e0c0281d4 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -234,7 +234,7 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
234 return ret; 234 return ret;
235} 235}
236 236
237static int bin_page_mkwrite(struct vm_area_struct *vma, struct page *page) 237static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
238{ 238{
239 struct file *file = vma->vm_file; 239 struct file *file = vma->vm_file;
240 struct bin_buffer *bb = file->private_data; 240 struct bin_buffer *bb = file->private_data;
@@ -242,15 +242,15 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct page *page)
242 int ret; 242 int ret;
243 243
244 if (!bb->vm_ops) 244 if (!bb->vm_ops)
245 return -EINVAL; 245 return VM_FAULT_SIGBUS;
246 246
247 if (!bb->vm_ops->page_mkwrite) 247 if (!bb->vm_ops->page_mkwrite)
248 return 0; 248 return 0;
249 249
250 if (!sysfs_get_active_two(attr_sd)) 250 if (!sysfs_get_active_two(attr_sd))
251 return -EINVAL; 251 return VM_FAULT_SIGBUS;
252 252
253 ret = bb->vm_ops->page_mkwrite(vma, page); 253 ret = bb->vm_ops->page_mkwrite(vma, vmf);
254 254
255 sysfs_put_active_two(attr_sd); 255 sysfs_put_active_two(attr_sd);
256 return ret; 256 return ret;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 93b6de51f26..0ff89fe71e5 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1434,8 +1434,9 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
1434 * mmap()d file has taken write protection fault and is being made 1434 * mmap()d file has taken write protection fault and is being made
1435 * writable. UBIFS must ensure page is budgeted for. 1435 * writable. UBIFS must ensure page is budgeted for.
1436 */ 1436 */
1437static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) 1437static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1438{ 1438{
1439 struct page *page = vmf->page;
1439 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1440 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1440 struct ubifs_info *c = inode->i_sb->s_fs_info; 1441 struct ubifs_info *c = inode->i_sb->s_fs_info;
1441 struct timespec now = ubifs_current_time(inode); 1442 struct timespec now = ubifs_current_time(inode);
@@ -1447,7 +1448,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
1447 ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); 1448 ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY));
1448 1449
1449 if (unlikely(c->ro_media)) 1450 if (unlikely(c->ro_media))
1450 return -EROFS; 1451 return VM_FAULT_SIGBUS; /* -EROFS */
1451 1452
1452 /* 1453 /*
1453 * We have not locked @page so far so we may budget for changing the 1454 * We have not locked @page so far so we may budget for changing the
@@ -1480,7 +1481,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
1480 if (err == -ENOSPC) 1481 if (err == -ENOSPC)
1481 ubifs_warn("out of space for mmapped file " 1482 ubifs_warn("out of space for mmapped file "
1482 "(inode number %lu)", inode->i_ino); 1483 "(inode number %lu)", inode->i_ino);
1483 return err; 1484 return VM_FAULT_SIGBUS;
1484 } 1485 }
1485 1486
1486 lock_page(page); 1487 lock_page(page);
@@ -1520,6 +1521,8 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
1520out_unlock: 1521out_unlock:
1521 unlock_page(page); 1522 unlock_page(page);
1522 ubifs_release_budget(c, &req); 1523 ubifs_release_budget(c, &req);
1524 if (err)
1525 err = VM_FAULT_SIGBUS;
1523 return err; 1526 return err;
1524} 1527}
1525 1528
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index e14c4e3aea0..f4e25544157 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -234,9 +234,9 @@ xfs_file_mmap(
234STATIC int 234STATIC int
235xfs_vm_page_mkwrite( 235xfs_vm_page_mkwrite(
236 struct vm_area_struct *vma, 236 struct vm_area_struct *vma,
237 struct page *page) 237 struct vm_fault *vmf)
238{ 238{
239 return block_page_mkwrite(vma, page, xfs_get_blocks); 239 return block_page_mkwrite(vma, vmf, xfs_get_blocks);
240} 240}
241 241
242const struct file_operations xfs_file_operations = { 242const struct file_operations xfs_file_operations = {