aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig10
-rw-r--r--fs/adfs/adfs.h4
-rw-r--r--fs/adfs/dir.c10
-rw-r--r--fs/adfs/dir_f.c17
-rw-r--r--fs/adfs/dir_fplus.c17
-rw-r--r--fs/adfs/file.c2
-rw-r--r--fs/adfs/inode.c4
-rw-r--r--fs/adfs/map.c2
-rw-r--r--fs/adfs/super.c4
-rw-r--r--fs/affs/affs.h1
-rw-r--r--fs/affs/dir.c2
-rw-r--r--fs/affs/file.c14
-rw-r--r--fs/affs/super.c54
-rw-r--r--fs/afs/mntpt.c2
-rw-r--r--fs/afs/super.c4
-rw-r--r--fs/autofs/dirhash.c5
-rw-r--r--fs/autofs4/autofs_i.h6
-rw-r--r--fs/autofs4/dev-ioctl.c195
-rw-r--r--fs/autofs4/expire.c15
-rw-r--r--fs/autofs4/root.c7
-rw-r--r--fs/befs/linuxvfs.c5
-rw-r--r--fs/bfs/dir.c8
-rw-r--r--fs/bfs/inode.c52
-rw-r--r--fs/bio.c29
-rw-r--r--fs/block_dev.c31
-rw-r--r--fs/btrfs/Makefile4
-rw-r--r--fs/btrfs/acl.c5
-rw-r--r--fs/btrfs/async-thread.c2
-rw-r--r--fs/btrfs/btrfs_inode.h4
-rw-r--r--fs/btrfs/compression.c6
-rw-r--r--fs/btrfs/crc32c.h29
-rw-r--r--fs/btrfs/ctree.c698
-rw-r--r--fs/btrfs/ctree.h330
-rw-r--r--fs/btrfs/delayed-ref.c509
-rw-r--r--fs/btrfs/delayed-ref.h85
-rw-r--r--fs/btrfs/disk-io.c164
-rw-r--r--fs/btrfs/export.c4
-rw-r--r--fs/btrfs/extent-tree.c2610
-rw-r--r--fs/btrfs/extent_io.c18
-rw-r--r--fs/btrfs/file.c78
-rw-r--r--fs/btrfs/free-space-cache.c10
-rw-r--r--fs/btrfs/free-space-cache.h1
-rw-r--r--fs/btrfs/hash.h4
-rw-r--r--fs/btrfs/inode.c166
-rw-r--r--fs/btrfs/ioctl.c197
-rw-r--r--fs/btrfs/print-tree.c155
-rw-r--r--fs/btrfs/relocation.c3711
-rw-r--r--fs/btrfs/root-tree.c17
-rw-r--r--fs/btrfs/super.c64
-rw-r--r--fs/btrfs/transaction.c410
-rw-r--r--fs/btrfs/transaction.h12
-rw-r--r--fs/btrfs/tree-log.c103
-rw-r--r--fs/btrfs/volumes.c69
-rw-r--r--fs/btrfs/volumes.h12
-rw-r--r--fs/buffer.c8
-rw-r--r--fs/cachefiles/interface.c4
-rw-r--r--fs/char_dev.c14
-rw-r--r--fs/cifs/CHANGES11
-rw-r--r--fs/cifs/README16
-rw-r--r--fs/cifs/cifs_dfs_ref.c2
-rw-r--r--fs/cifs/cifs_spnego.c6
-rw-r--r--fs/cifs/cifsacl.c178
-rw-r--r--fs/cifs/cifsfs.c8
-rw-r--r--fs/cifs/cifsfs.h4
-rw-r--r--fs/cifs/cifsproto.h10
-rw-r--r--fs/cifs/cifssmb.c7
-rw-r--r--fs/cifs/connect.c34
-rw-r--r--fs/cifs/file.c2
-rw-r--r--fs/cifs/inode.c19
-rw-r--r--fs/cifs/netmisc.c24
-rw-r--r--fs/cifs/readdir.c44
-rw-r--r--fs/coda/file.c9
-rw-r--r--fs/compat.c8
-rw-r--r--fs/configfs/configfs_internal.h3
-rw-r--r--fs/configfs/dir.c196
-rw-r--r--fs/configfs/inode.c38
-rw-r--r--fs/dcache.c7
-rw-r--r--fs/devpts/inode.c4
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/dlm/dir.c7
-rw-r--r--fs/dlm/lockspace.c17
-rw-r--r--fs/dlm/lowcomms.c22
-rw-r--r--fs/dlm/lowcomms.h3
-rw-r--r--fs/dlm/member.c19
-rw-r--r--fs/dlm/requestqueue.c2
-rw-r--r--fs/ecryptfs/super.c5
-rw-r--r--fs/eventfd.c3
-rw-r--r--fs/exec.c19
-rw-r--r--fs/exofs/common.h6
-rw-r--r--fs/exofs/inode.c8
-rw-r--r--fs/exofs/osd.c30
-rw-r--r--fs/exofs/super.c25
-rw-r--r--fs/ext2/Makefile2
-rw-r--r--fs/ext2/dir.c2
-rw-r--r--fs/ext2/ext2.h3
-rw-r--r--fs/ext2/file.c4
-rw-r--r--fs/ext2/fsync.c50
-rw-r--r--fs/ext2/inode.c11
-rw-r--r--fs/ext2/super.c61
-rw-r--r--fs/ext3/balloc.c3
-rw-r--r--fs/ext3/ialloc.c3
-rw-r--r--fs/ext3/inode.c1
-rw-r--r--fs/ext3/resize.c2
-rw-r--r--fs/ext3/super.c39
-rw-r--r--fs/ext3/xattr.c1
-rw-r--r--fs/ext4/Makefile4
-rw-r--r--fs/ext4/balloc.c28
-rw-r--r--fs/ext4/block_validity.c244
-rw-r--r--fs/ext4/dir.c3
-rw-r--r--fs/ext4/ext4.h354
-rw-r--r--fs/ext4/ext4_i.h140
-rw-r--r--fs/ext4/ext4_sb.h161
-rw-r--r--fs/ext4/extents.c85
-rw-r--r--fs/ext4/group.h29
-rw-r--r--fs/ext4/ialloc.c73
-rw-r--r--fs/ext4/inode.c593
-rw-r--r--fs/ext4/mballoc.c166
-rw-r--r--fs/ext4/mballoc.h1
-rw-r--r--fs/ext4/namei.c27
-rw-r--r--fs/ext4/namei.h8
-rw-r--r--fs/ext4/resize.c36
-rw-r--r--fs/ext4/super.c849
-rw-r--r--fs/fat/dir.c16
-rw-r--r--fs/fat/fat.h6
-rw-r--r--fs/fat/fatent.c13
-rw-r--r--fs/fat/file.c14
-rw-r--r--fs/fat/inode.c31
-rw-r--r--fs/fat/namei_msdos.c4
-rw-r--r--fs/fat/namei_vfat.c4
-rw-r--r--fs/file_table.c40
-rw-r--r--fs/freevxfs/vxfs_super.c4
-rw-r--r--fs/fs-writeback.c92
-rw-r--r--fs/fuse/Makefile1
-rw-r--r--fs/fuse/cuse.c610
-rw-r--r--fs/fuse/dev.c15
-rw-r--r--fs/fuse/dir.c33
-rw-r--r--fs/fuse/file.c346
-rw-r--r--fs/fuse/fuse_i.h47
-rw-r--r--fs/fuse/inode.c118
-rw-r--r--fs/gfs2/Kconfig1
-rw-r--r--fs/gfs2/Makefile5
-rw-r--r--fs/gfs2/aops.c (renamed from fs/gfs2/ops_address.c)21
-rw-r--r--fs/gfs2/bmap.c15
-rw-r--r--fs/gfs2/dentry.c (renamed from fs/gfs2/ops_dentry.c)0
-rw-r--r--fs/gfs2/dir.c11
-rw-r--r--fs/gfs2/eattr.c14
-rw-r--r--fs/gfs2/export.c (renamed from fs/gfs2/ops_export.c)0
-rw-r--r--fs/gfs2/file.c (renamed from fs/gfs2/ops_file.c)36
-rw-r--r--fs/gfs2/glock.c33
-rw-r--r--fs/gfs2/glops.c20
-rw-r--r--fs/gfs2/incore.h27
-rw-r--r--fs/gfs2/inode.c150
-rw-r--r--fs/gfs2/inode.h52
-rw-r--r--fs/gfs2/log.c17
-rw-r--r--fs/gfs2/lops.c17
-rw-r--r--fs/gfs2/main.c8
-rw-r--r--fs/gfs2/meta_io.c105
-rw-r--r--fs/gfs2/mount.c185
-rw-r--r--fs/gfs2/ops_address.h23
-rw-r--r--fs/gfs2/ops_fstype.c74
-rw-r--r--fs/gfs2/ops_inode.c146
-rw-r--r--fs/gfs2/ops_super.c723
-rw-r--r--fs/gfs2/quota.c1
-rw-r--r--fs/gfs2/recovery.c102
-rw-r--r--fs/gfs2/recovery.h2
-rw-r--r--fs/gfs2/rgrp.c152
-rw-r--r--fs/gfs2/rgrp.h47
-rw-r--r--fs/gfs2/super.c890
-rw-r--r--fs/gfs2/sys.c245
-rw-r--r--fs/gfs2/trace_gfs2.h407
-rw-r--r--fs/gfs2/trans.c9
-rw-r--r--fs/hfs/super.c23
-rw-r--r--fs/hfsplus/super.c25
-rw-r--r--fs/hpfs/super.c12
-rw-r--r--fs/hugetlbfs/inode.c2
-rw-r--r--fs/inode.c12
-rw-r--r--fs/internal.h17
-rw-r--r--fs/ioctl.c14
-rw-r--r--fs/isofs/inode.c5
-rw-r--r--fs/jbd2/journal.c8
-rw-r--r--fs/jffs2/fs.c18
-rw-r--r--fs/jffs2/os-linux.h1
-rw-r--r--fs/jffs2/super.c26
-rw-r--r--fs/jfs/jfs_imap.c1
-rw-r--r--fs/jfs/super.c31
-rw-r--r--fs/libfs.c25
-rw-r--r--fs/minix/dir.c2
-rw-r--r--fs/minix/file.c20
-rw-r--r--fs/minix/inode.c37
-rw-r--r--fs/minix/minix.h2
-rw-r--r--fs/mpage.c6
-rw-r--r--fs/namei.c135
-rw-r--r--fs/namespace.c327
-rw-r--r--fs/ncpfs/inode.c4
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/nfsd/export.c78
-rw-r--r--fs/nfsd/vfs.c68
-rw-r--r--fs/nilfs2/cpfile.c6
-rw-r--r--fs/nilfs2/sb.h1
-rw-r--r--fs/nilfs2/super.c256
-rw-r--r--fs/nilfs2/the_nilfs.c115
-rw-r--r--fs/nilfs2/the_nilfs.h23
-rw-r--r--fs/notify/Kconfig13
-rw-r--r--fs/notify/Makefile2
-rw-r--r--fs/notify/dnotify/Kconfig1
-rw-r--r--fs/notify/dnotify/dnotify.c464
-rw-r--r--fs/notify/fsnotify.c186
-rw-r--r--fs/notify/fsnotify.h34
-rw-r--r--fs/notify/group.c254
-rw-r--r--fs/notify/inode_mark.c426
-rw-r--r--fs/notify/inotify/Kconfig20
-rw-r--r--fs/notify/inotify/Makefile2
-rw-r--r--fs/notify/inotify/inotify.c20
-rw-r--r--fs/notify/inotify/inotify.h21
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c138
-rw-r--r--fs/notify/inotify/inotify_user.c837
-rw-r--r--fs/notify/notification.c411
-rw-r--r--fs/ntfs/super.c60
-rw-r--r--fs/ocfs2/cluster/heartbeat.c2
-rw-r--r--fs/ocfs2/super.c24
-rw-r--r--fs/omfs/file.c17
-rw-r--r--fs/open.c4
-rw-r--r--fs/partitions/check.c52
-rw-r--r--fs/partitions/ibm.c2
-rw-r--r--fs/partitions/msdos.c4
-rw-r--r--fs/pipe.c14
-rw-r--r--fs/proc/base.c6
-rw-r--r--fs/proc/internal.h25
-rw-r--r--fs/proc/loadavg.c18
-rw-r--r--fs/proc/proc_devtree.c1
-rw-r--r--fs/qnx4/Makefile2
-rw-r--r--fs/qnx4/bitmap.c7
-rw-r--r--fs/qnx4/dir.c9
-rw-r--r--fs/qnx4/file.c5
-rw-r--r--fs/qnx4/fsync.c169
-rw-r--r--fs/qnx4/inode.c58
-rw-r--r--fs/qnx4/namei.c13
-rw-r--r--fs/qnx4/qnx4.h57
-rw-r--r--fs/qnx4/truncate.c6
-rw-r--r--fs/quota/quota.c25
-rw-r--r--fs/read_write.c7
-rw-r--r--fs/reiserfs/dir.c10
-rw-r--r--fs/reiserfs/super.c33
-rw-r--r--fs/reiserfs/xattr.c3
-rw-r--r--fs/smbfs/inode.c4
-rw-r--r--fs/splice.c338
-rw-r--r--fs/squashfs/super.c4
-rw-r--r--fs/super.c192
-rw-r--r--fs/sync.c117
-rw-r--r--fs/sysv/dir.c2
-rw-r--r--fs/sysv/file.c17
-rw-r--r--fs/sysv/inode.c75
-rw-r--r--fs/sysv/sysv.h1
-rw-r--r--fs/ubifs/super.c17
-rw-r--r--fs/udf/Makefile2
-rw-r--r--fs/udf/dir.c2
-rw-r--r--fs/udf/file.c2
-rw-r--r--fs/udf/fsync.c52
-rw-r--r--fs/udf/super.c13
-rw-r--r--fs/udf/udfdecl.h3
-rw-r--r--fs/ufs/dir.c2
-rw-r--r--fs/ufs/file.c23
-rw-r--r--fs/ufs/super.c65
-rw-r--r--fs/ufs/ufs.h1
-rw-r--r--fs/xattr.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c12
-rw-r--r--fs/xfs/xfs_trans.c2
269 files changed, 16596 insertions, 7838 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 9f7270f36b2a..525da2e8f73b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -62,6 +62,16 @@ source "fs/autofs/Kconfig"
62source "fs/autofs4/Kconfig" 62source "fs/autofs4/Kconfig"
63source "fs/fuse/Kconfig" 63source "fs/fuse/Kconfig"
64 64
65config CUSE
66 tristate "Character device in Userpace support"
67 depends on FUSE_FS
68 help
69 This FUSE extension allows character devices to be
70 implemented in userspace.
71
72 If you want to develop or use userspace character device
73 based on CUSE, answer Y or M.
74
65config GENERIC_ACL 75config GENERIC_ACL
66 bool 76 bool
67 select FS_POSIX_ACL 77 select FS_POSIX_ACL
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index e0a85dbeeb88..a6665f37f456 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -53,6 +53,7 @@ struct adfs_dir_ops {
53 int (*update)(struct adfs_dir *dir, struct object_info *obj); 53 int (*update)(struct adfs_dir *dir, struct object_info *obj);
54 int (*create)(struct adfs_dir *dir, struct object_info *obj); 54 int (*create)(struct adfs_dir *dir, struct object_info *obj);
55 int (*remove)(struct adfs_dir *dir, struct object_info *obj); 55 int (*remove)(struct adfs_dir *dir, struct object_info *obj);
56 int (*sync)(struct adfs_dir *dir);
56 void (*free)(struct adfs_dir *dir); 57 void (*free)(struct adfs_dir *dir);
57}; 58};
58 59
@@ -90,7 +91,8 @@ extern const struct dentry_operations adfs_dentry_operations;
90extern struct adfs_dir_ops adfs_f_dir_ops; 91extern struct adfs_dir_ops adfs_f_dir_ops;
91extern struct adfs_dir_ops adfs_fplus_dir_ops; 92extern struct adfs_dir_ops adfs_fplus_dir_ops;
92 93
93extern int adfs_dir_update(struct super_block *sb, struct object_info *obj); 94extern int adfs_dir_update(struct super_block *sb, struct object_info *obj,
95 int wait);
94 96
95/* file.c */ 97/* file.c */
96extern const struct inode_operations adfs_file_inode_operations; 98extern const struct inode_operations adfs_file_inode_operations;
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index e867ccf37246..4d4073447d1a 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -83,7 +83,7 @@ out:
83} 83}
84 84
85int 85int
86adfs_dir_update(struct super_block *sb, struct object_info *obj) 86adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait)
87{ 87{
88 int ret = -EINVAL; 88 int ret = -EINVAL;
89#ifdef CONFIG_ADFS_FS_RW 89#ifdef CONFIG_ADFS_FS_RW
@@ -106,6 +106,12 @@ adfs_dir_update(struct super_block *sb, struct object_info *obj)
106 ret = ops->update(&dir, obj); 106 ret = ops->update(&dir, obj);
107 write_unlock(&adfs_dir_lock); 107 write_unlock(&adfs_dir_lock);
108 108
109 if (wait) {
110 int err = ops->sync(&dir);
111 if (!ret)
112 ret = err;
113 }
114
109 ops->free(&dir); 115 ops->free(&dir);
110out: 116out:
111#endif 117#endif
@@ -199,7 +205,7 @@ const struct file_operations adfs_dir_operations = {
199 .read = generic_read_dir, 205 .read = generic_read_dir,
200 .llseek = generic_file_llseek, 206 .llseek = generic_file_llseek,
201 .readdir = adfs_readdir, 207 .readdir = adfs_readdir,
202 .fsync = file_fsync, 208 .fsync = simple_fsync,
203}; 209};
204 210
205static int 211static int
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index ea7df2146921..31df6adf0de6 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -437,6 +437,22 @@ bad_dir:
437#endif 437#endif
438} 438}
439 439
440static int
441adfs_f_sync(struct adfs_dir *dir)
442{
443 int err = 0;
444 int i;
445
446 for (i = dir->nr_buffers - 1; i >= 0; i--) {
447 struct buffer_head *bh = dir->bh[i];
448 sync_dirty_buffer(bh);
449 if (buffer_req(bh) && !buffer_uptodate(bh))
450 err = -EIO;
451 }
452
453 return err;
454}
455
440static void 456static void
441adfs_f_free(struct adfs_dir *dir) 457adfs_f_free(struct adfs_dir *dir)
442{ 458{
@@ -456,5 +472,6 @@ struct adfs_dir_ops adfs_f_dir_ops = {
456 .setpos = adfs_f_setpos, 472 .setpos = adfs_f_setpos,
457 .getnext = adfs_f_getnext, 473 .getnext = adfs_f_getnext,
458 .update = adfs_f_update, 474 .update = adfs_f_update,
475 .sync = adfs_f_sync,
459 .free = adfs_f_free 476 .free = adfs_f_free
460}; 477};
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index 1ec644e32df9..139e0f345f18 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -161,6 +161,22 @@ out:
161 return ret; 161 return ret;
162} 162}
163 163
164static int
165adfs_fplus_sync(struct adfs_dir *dir)
166{
167 int err = 0;
168 int i;
169
170 for (i = dir->nr_buffers - 1; i >= 0; i--) {
171 struct buffer_head *bh = dir->bh[i];
172 sync_dirty_buffer(bh);
173 if (buffer_req(bh) && !buffer_uptodate(bh))
174 err = -EIO;
175 }
176
177 return err;
178}
179
164static void 180static void
165adfs_fplus_free(struct adfs_dir *dir) 181adfs_fplus_free(struct adfs_dir *dir)
166{ 182{
@@ -175,5 +191,6 @@ struct adfs_dir_ops adfs_fplus_dir_ops = {
175 .read = adfs_fplus_read, 191 .read = adfs_fplus_read,
176 .setpos = adfs_fplus_setpos, 192 .setpos = adfs_fplus_setpos,
177 .getnext = adfs_fplus_getnext, 193 .getnext = adfs_fplus_getnext,
194 .sync = adfs_fplus_sync,
178 .free = adfs_fplus_free 195 .free = adfs_fplus_free
179}; 196};
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 36e381c6a99a..8224d54a2afb 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -30,7 +30,7 @@ const struct file_operations adfs_file_operations = {
30 .read = do_sync_read, 30 .read = do_sync_read,
31 .aio_read = generic_file_aio_read, 31 .aio_read = generic_file_aio_read,
32 .mmap = generic_file_mmap, 32 .mmap = generic_file_mmap,
33 .fsync = file_fsync, 33 .fsync = simple_fsync,
34 .write = do_sync_write, 34 .write = do_sync_write,
35 .aio_write = generic_file_aio_write, 35 .aio_write = generic_file_aio_write,
36 .splice_read = generic_file_splice_read, 36 .splice_read = generic_file_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index e647200262a2..05b3a677201d 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -376,7 +376,7 @@ out:
376 * The adfs-specific inode data has already been updated by 376 * The adfs-specific inode data has already been updated by
377 * adfs_notify_change() 377 * adfs_notify_change()
378 */ 378 */
379int adfs_write_inode(struct inode *inode, int unused) 379int adfs_write_inode(struct inode *inode, int wait)
380{ 380{
381 struct super_block *sb = inode->i_sb; 381 struct super_block *sb = inode->i_sb;
382 struct object_info obj; 382 struct object_info obj;
@@ -391,7 +391,7 @@ int adfs_write_inode(struct inode *inode, int unused)
391 obj.attr = ADFS_I(inode)->attr; 391 obj.attr = ADFS_I(inode)->attr;
392 obj.size = inode->i_size; 392 obj.size = inode->i_size;
393 393
394 ret = adfs_dir_update(sb, &obj); 394 ret = adfs_dir_update(sb, &obj, wait);
395 unlock_kernel(); 395 unlock_kernel();
396 return ret; 396 return ret;
397} 397}
diff --git a/fs/adfs/map.c b/fs/adfs/map.c
index 92ab4fbc2031..568081b93f73 100644
--- a/fs/adfs/map.c
+++ b/fs/adfs/map.c
@@ -62,7 +62,7 @@ static DEFINE_RWLOCK(adfs_map_lock);
62#define GET_FRAG_ID(_map,_start,_idmask) \ 62#define GET_FRAG_ID(_map,_start,_idmask) \
63 ({ \ 63 ({ \
64 unsigned char *_m = _map + (_start >> 3); \ 64 unsigned char *_m = _map + (_start >> 3); \
65 u32 _frag = get_unaligned((u32 *)_m); \ 65 u32 _frag = get_unaligned_le32(_m); \
66 _frag >>= (_start & 7); \ 66 _frag >>= (_start & 7); \
67 _frag & _idmask; \ 67 _frag & _idmask; \
68 }) 68 })
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index dd9becca4241..0ec5aaf47aa7 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -132,11 +132,15 @@ static void adfs_put_super(struct super_block *sb)
132 int i; 132 int i;
133 struct adfs_sb_info *asb = ADFS_SB(sb); 133 struct adfs_sb_info *asb = ADFS_SB(sb);
134 134
135 lock_kernel();
136
135 for (i = 0; i < asb->s_map_size; i++) 137 for (i = 0; i < asb->s_map_size; i++)
136 brelse(asb->s_map[i].dm_bh); 138 brelse(asb->s_map[i].dm_bh);
137 kfree(asb->s_map); 139 kfree(asb->s_map);
138 kfree(asb); 140 kfree(asb);
139 sb->s_fs_info = NULL; 141 sb->s_fs_info = NULL;
142
143 unlock_kernel();
140} 144}
141 145
142static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) 146static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 1a2d5e3c7f4e..e511dc621a2e 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -182,6 +182,7 @@ extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dent
182 182
183void affs_free_prealloc(struct inode *inode); 183void affs_free_prealloc(struct inode *inode);
184extern void affs_truncate(struct inode *); 184extern void affs_truncate(struct inode *);
185int affs_file_fsync(struct file *, struct dentry *, int);
185 186
186/* dir.c */ 187/* dir.c */
187 188
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 7b36904dbeac..8ca8f3a55599 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -21,7 +21,7 @@ const struct file_operations affs_dir_operations = {
21 .read = generic_read_dir, 21 .read = generic_read_dir,
22 .llseek = generic_file_llseek, 22 .llseek = generic_file_llseek,
23 .readdir = affs_readdir, 23 .readdir = affs_readdir,
24 .fsync = file_fsync, 24 .fsync = affs_file_fsync,
25}; 25};
26 26
27/* 27/*
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 9246cb4aa018..184e55c1c9ba 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -34,7 +34,7 @@ const struct file_operations affs_file_operations = {
34 .mmap = generic_file_mmap, 34 .mmap = generic_file_mmap,
35 .open = affs_file_open, 35 .open = affs_file_open,
36 .release = affs_file_release, 36 .release = affs_file_release,
37 .fsync = file_fsync, 37 .fsync = affs_file_fsync,
38 .splice_read = generic_file_splice_read, 38 .splice_read = generic_file_splice_read,
39}; 39};
40 40
@@ -915,3 +915,15 @@ affs_truncate(struct inode *inode)
915 } 915 }
916 affs_free_prealloc(inode); 916 affs_free_prealloc(inode);
917} 917}
918
919int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
920{
921 struct inode * inode = dentry->d_inode;
922 int ret, err;
923
924 ret = write_inode_now(inode, 0);
925 err = sync_blockdev(inode->i_sb->s_bdev);
926 if (!ret)
927 ret = err;
928 return ret;
929}
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 63f5183f263b..104fdcb3a7fc 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -16,6 +16,7 @@
16#include <linux/parser.h> 16#include <linux/parser.h>
17#include <linux/magic.h> 17#include <linux/magic.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/smp_lock.h>
19#include "affs.h" 20#include "affs.h"
20 21
21extern struct timezone sys_tz; 22extern struct timezone sys_tz;
@@ -24,49 +25,67 @@ static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
24static int affs_remount (struct super_block *sb, int *flags, char *data); 25static int affs_remount (struct super_block *sb, int *flags, char *data);
25 26
26static void 27static void
28affs_commit_super(struct super_block *sb, int clean)
29{
30 struct affs_sb_info *sbi = AFFS_SB(sb);
31 struct buffer_head *bh = sbi->s_root_bh;
32 struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh);
33
34 tail->bm_flag = cpu_to_be32(clean);
35 secs_to_datestamp(get_seconds(), &tail->disk_change);
36 affs_fix_checksum(sb, bh);
37 mark_buffer_dirty(bh);
38}
39
40static void
27affs_put_super(struct super_block *sb) 41affs_put_super(struct super_block *sb)
28{ 42{
29 struct affs_sb_info *sbi = AFFS_SB(sb); 43 struct affs_sb_info *sbi = AFFS_SB(sb);
30 pr_debug("AFFS: put_super()\n"); 44 pr_debug("AFFS: put_super()\n");
31 45
32 if (!(sb->s_flags & MS_RDONLY)) { 46 lock_kernel();
33 AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(1); 47
34 secs_to_datestamp(get_seconds(), 48 if (!(sb->s_flags & MS_RDONLY))
35 &AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->disk_change); 49 affs_commit_super(sb, 1);
36 affs_fix_checksum(sb, sbi->s_root_bh);
37 mark_buffer_dirty(sbi->s_root_bh);
38 }
39 50
40 kfree(sbi->s_prefix); 51 kfree(sbi->s_prefix);
41 affs_free_bitmap(sb); 52 affs_free_bitmap(sb);
42 affs_brelse(sbi->s_root_bh); 53 affs_brelse(sbi->s_root_bh);
43 kfree(sbi); 54 kfree(sbi);
44 sb->s_fs_info = NULL; 55 sb->s_fs_info = NULL;
45 return; 56
57 unlock_kernel();
46} 58}
47 59
48static void 60static void
49affs_write_super(struct super_block *sb) 61affs_write_super(struct super_block *sb)
50{ 62{
51 int clean = 2; 63 int clean = 2;
52 struct affs_sb_info *sbi = AFFS_SB(sb);
53 64
65 lock_super(sb);
54 if (!(sb->s_flags & MS_RDONLY)) { 66 if (!(sb->s_flags & MS_RDONLY)) {
55 // if (sbi->s_bitmap[i].bm_bh) { 67 // if (sbi->s_bitmap[i].bm_bh) {
56 // if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) { 68 // if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) {
57 // clean = 0; 69 // clean = 0;
58 AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(clean); 70 affs_commit_super(sb, clean);
59 secs_to_datestamp(get_seconds(),
60 &AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->disk_change);
61 affs_fix_checksum(sb, sbi->s_root_bh);
62 mark_buffer_dirty(sbi->s_root_bh);
63 sb->s_dirt = !clean; /* redo until bitmap synced */ 71 sb->s_dirt = !clean; /* redo until bitmap synced */
64 } else 72 } else
65 sb->s_dirt = 0; 73 sb->s_dirt = 0;
74 unlock_super(sb);
66 75
67 pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean); 76 pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean);
68} 77}
69 78
79static int
80affs_sync_fs(struct super_block *sb, int wait)
81{
82 lock_super(sb);
83 affs_commit_super(sb, 2);
84 sb->s_dirt = 0;
85 unlock_super(sb);
86 return 0;
87}
88
70static struct kmem_cache * affs_inode_cachep; 89static struct kmem_cache * affs_inode_cachep;
71 90
72static struct inode *affs_alloc_inode(struct super_block *sb) 91static struct inode *affs_alloc_inode(struct super_block *sb)
@@ -124,6 +143,7 @@ static const struct super_operations affs_sops = {
124 .clear_inode = affs_clear_inode, 143 .clear_inode = affs_clear_inode,
125 .put_super = affs_put_super, 144 .put_super = affs_put_super,
126 .write_super = affs_write_super, 145 .write_super = affs_write_super,
146 .sync_fs = affs_sync_fs,
127 .statfs = affs_statfs, 147 .statfs = affs_statfs,
128 .remount_fs = affs_remount, 148 .remount_fs = affs_remount,
129 .show_options = generic_show_options, 149 .show_options = generic_show_options,
@@ -507,6 +527,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
507 kfree(new_opts); 527 kfree(new_opts);
508 return -EINVAL; 528 return -EINVAL;
509 } 529 }
530 lock_kernel();
510 replace_mount_options(sb, new_opts); 531 replace_mount_options(sb, new_opts);
511 532
512 sbi->s_flags = mount_flags; 533 sbi->s_flags = mount_flags;
@@ -514,8 +535,10 @@ affs_remount(struct super_block *sb, int *flags, char *data)
514 sbi->s_uid = uid; 535 sbi->s_uid = uid;
515 sbi->s_gid = gid; 536 sbi->s_gid = gid;
516 537
517 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 538 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
539 unlock_kernel();
518 return 0; 540 return 0;
541 }
519 if (*flags & MS_RDONLY) { 542 if (*flags & MS_RDONLY) {
520 sb->s_dirt = 1; 543 sb->s_dirt = 1;
521 while (sb->s_dirt) 544 while (sb->s_dirt)
@@ -524,6 +547,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
524 } else 547 } else
525 res = affs_init_bitmap(sb, flags); 548 res = affs_init_bitmap(sb, flags);
526 549
550 unlock_kernel();
527 return res; 551 return res;
528} 552}
529 553
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 2b9e2d03a390..c52be53f6946 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -244,7 +244,7 @@ static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
244 case -EBUSY: 244 case -EBUSY:
245 /* someone else made a mount here whilst we were busy */ 245 /* someone else made a mount here whilst we were busy */
246 while (d_mountpoint(nd->path.dentry) && 246 while (d_mountpoint(nd->path.dentry) &&
247 follow_down(&nd->path.mnt, &nd->path.dentry)) 247 follow_down(&nd->path))
248 ; 248 ;
249 err = 0; 249 err = 0;
250 default: 250 default:
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 76828e5f8a39..ad0514d0115f 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -440,8 +440,12 @@ static void afs_put_super(struct super_block *sb)
440 440
441 _enter(""); 441 _enter("");
442 442
443 lock_kernel();
444
443 afs_put_volume(as->volume); 445 afs_put_volume(as->volume);
444 446
447 unlock_kernel();
448
445 _leave(""); 449 _leave("");
446} 450}
447 451
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index 4eb4d8dfb2f1..2316e944a109 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -85,13 +85,12 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
85 } 85 }
86 path.mnt = mnt; 86 path.mnt = mnt;
87 path_get(&path); 87 path_get(&path);
88 if (!follow_down(&path.mnt, &path.dentry)) { 88 if (!follow_down(&path)) {
89 path_put(&path); 89 path_put(&path);
90 DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); 90 DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
91 continue; 91 continue;
92 } 92 }
93 while (d_mountpoint(path.dentry) && 93 while (d_mountpoint(path.dentry) && follow_down(&path));
94 follow_down(&path.mnt, &path.dentry))
95 ; 94 ;
96 umount_ok = may_umount(path.mnt); 95 umount_ok = may_umount(path.mnt);
97 path_put(&path); 96 path_put(&path);
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index b7ff33c63101..8f7cdde41733 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -223,12 +223,12 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
223int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); 223int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
224void autofs4_catatonic_mode(struct autofs_sb_info *); 224void autofs4_catatonic_mode(struct autofs_sb_info *);
225 225
226static inline int autofs4_follow_mount(struct vfsmount **mnt, struct dentry **dentry) 226static inline int autofs4_follow_mount(struct path *path)
227{ 227{
228 int res = 0; 228 int res = 0;
229 229
230 while (d_mountpoint(*dentry)) { 230 while (d_mountpoint(path->dentry)) {
231 int followed = follow_down(mnt, dentry); 231 int followed = follow_down(path);
232 if (!followed) 232 if (!followed)
233 break; 233 break;
234 res = 1; 234 res = 1;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 84168c0dcc2d..f3da2eb51f56 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -192,77 +192,42 @@ static int autofs_dev_ioctl_protosubver(struct file *fp,
192 return 0; 192 return 0;
193} 193}
194 194
195/* 195static int find_autofs_mount(const char *pathname,
196 * Walk down the mount stack looking for an autofs mount that 196 struct path *res,
197 * has the requested device number (aka. new_encode_dev(sb->s_dev). 197 int test(struct path *path, void *data),
198 */ 198 void *data)
199static int autofs_dev_ioctl_find_super(struct nameidata *nd, dev_t devno)
200{ 199{
201 struct dentry *dentry; 200 struct path path;
202 struct inode *inode; 201 int err = kern_path(pathname, 0, &path);
203 struct super_block *sb; 202 if (err)
204 dev_t s_dev; 203 return err;
205 unsigned int err;
206
207 err = -ENOENT; 204 err = -ENOENT;
208 205 while (path.dentry == path.mnt->mnt_root) {
209 /* Lookup the dentry name at the base of our mount point */ 206 if (path.mnt->mnt_sb->s_magic == AUTOFS_SUPER_MAGIC) {
210 dentry = d_lookup(nd->path.dentry, &nd->last); 207 if (test(&path, data)) {
211 if (!dentry) 208 path_get(&path);
212 goto out; 209 if (!err) /* already found some */
213 210 path_put(res);
214 dput(nd->path.dentry); 211 *res = path;
215 nd->path.dentry = dentry;
216
217 /* And follow the mount stack looking for our autofs mount */
218 while (follow_down(&nd->path.mnt, &nd->path.dentry)) {
219 inode = nd->path.dentry->d_inode;
220 if (!inode)
221 break;
222
223 sb = inode->i_sb;
224 s_dev = new_encode_dev(sb->s_dev);
225 if (devno == s_dev) {
226 if (sb->s_magic == AUTOFS_SUPER_MAGIC) {
227 err = 0; 212 err = 0;
228 break;
229 } 213 }
230 } 214 }
215 if (!follow_up(&path))
216 break;
231 } 217 }
232out: 218 path_put(&path);
233 return err; 219 return err;
234} 220}
235 221
236/* 222static int test_by_dev(struct path *path, void *p)
237 * Walk down the mount stack looking for an autofs mount that
238 * has the requested mount type (ie. indirect, direct or offset).
239 */
240static int autofs_dev_ioctl_find_sbi_type(struct nameidata *nd, unsigned int type)
241{ 223{
242 struct dentry *dentry; 224 return path->mnt->mnt_sb->s_dev == *(dev_t *)p;
243 struct autofs_info *ino; 225}
244 unsigned int err;
245
246 err = -ENOENT;
247
248 /* Lookup the dentry name at the base of our mount point */
249 dentry = d_lookup(nd->path.dentry, &nd->last);
250 if (!dentry)
251 goto out;
252
253 dput(nd->path.dentry);
254 nd->path.dentry = dentry;
255 226
256 /* And follow the mount stack looking for our autofs mount */ 227static int test_by_type(struct path *path, void *p)
257 while (follow_down(&nd->path.mnt, &nd->path.dentry)) { 228{
258 ino = autofs4_dentry_ino(nd->path.dentry); 229 struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
259 if (ino && ino->sbi->type & type) { 230 return ino && ino->sbi->type & *(unsigned *)p;
260 err = 0;
261 break;
262 }
263 }
264out:
265 return err;
266} 231}
267 232
268static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) 233static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file)
@@ -283,31 +248,25 @@ static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file)
283 * Open a file descriptor on the autofs mount point corresponding 248 * Open a file descriptor on the autofs mount point corresponding
284 * to the given path and device number (aka. new_encode_dev(sb->s_dev)). 249 * to the given path and device number (aka. new_encode_dev(sb->s_dev)).
285 */ 250 */
286static int autofs_dev_ioctl_open_mountpoint(const char *path, dev_t devid) 251static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)
287{ 252{
288 struct file *filp;
289 struct nameidata nd;
290 int err, fd; 253 int err, fd;
291 254
292 fd = get_unused_fd(); 255 fd = get_unused_fd();
293 if (likely(fd >= 0)) { 256 if (likely(fd >= 0)) {
294 /* Get nameidata of the parent directory */ 257 struct file *filp;
295 err = path_lookup(path, LOOKUP_PARENT, &nd); 258 struct path path;
259
260 err = find_autofs_mount(name, &path, test_by_dev, &devid);
296 if (err) 261 if (err)
297 goto out; 262 goto out;
298 263
299 /* 264 /*
300 * Search down, within the parent, looking for an 265 * Find autofs super block that has the device number
301 * autofs super block that has the device number
302 * corresponding to the autofs fs we want to open. 266 * corresponding to the autofs fs we want to open.
303 */ 267 */
304 err = autofs_dev_ioctl_find_super(&nd, devid);
305 if (err) {
306 path_put(&nd.path);
307 goto out;
308 }
309 268
310 filp = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY, 269 filp = dentry_open(path.dentry, path.mnt, O_RDONLY,
311 current_cred()); 270 current_cred());
312 if (IS_ERR(filp)) { 271 if (IS_ERR(filp)) {
313 err = PTR_ERR(filp); 272 err = PTR_ERR(filp);
@@ -340,7 +299,7 @@ static int autofs_dev_ioctl_openmount(struct file *fp,
340 param->ioctlfd = -1; 299 param->ioctlfd = -1;
341 300
342 path = param->path; 301 path = param->path;
343 devid = param->openmount.devid; 302 devid = new_decode_dev(param->openmount.devid);
344 303
345 err = 0; 304 err = 0;
346 fd = autofs_dev_ioctl_open_mountpoint(path, devid); 305 fd = autofs_dev_ioctl_open_mountpoint(path, devid);
@@ -475,8 +434,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
475 struct autofs_dev_ioctl *param) 434 struct autofs_dev_ioctl *param)
476{ 435{
477 struct autofs_info *ino; 436 struct autofs_info *ino;
478 struct nameidata nd; 437 struct path path;
479 const char *path;
480 dev_t devid; 438 dev_t devid;
481 int err = -ENOENT; 439 int err = -ENOENT;
482 440
@@ -485,32 +443,24 @@ static int autofs_dev_ioctl_requester(struct file *fp,
485 goto out; 443 goto out;
486 } 444 }
487 445
488 path = param->path; 446 devid = sbi->sb->s_dev;
489 devid = new_encode_dev(sbi->sb->s_dev);
490 447
491 param->requester.uid = param->requester.gid = -1; 448 param->requester.uid = param->requester.gid = -1;
492 449
493 /* Get nameidata of the parent directory */ 450 err = find_autofs_mount(param->path, &path, test_by_dev, &devid);
494 err = path_lookup(path, LOOKUP_PARENT, &nd);
495 if (err) 451 if (err)
496 goto out; 452 goto out;
497 453
498 err = autofs_dev_ioctl_find_super(&nd, devid); 454 ino = autofs4_dentry_ino(path.dentry);
499 if (err)
500 goto out_release;
501
502 ino = autofs4_dentry_ino(nd.path.dentry);
503 if (ino) { 455 if (ino) {
504 err = 0; 456 err = 0;
505 autofs4_expire_wait(nd.path.dentry); 457 autofs4_expire_wait(path.dentry);
506 spin_lock(&sbi->fs_lock); 458 spin_lock(&sbi->fs_lock);
507 param->requester.uid = ino->uid; 459 param->requester.uid = ino->uid;
508 param->requester.gid = ino->gid; 460 param->requester.gid = ino->gid;
509 spin_unlock(&sbi->fs_lock); 461 spin_unlock(&sbi->fs_lock);
510 } 462 }
511 463 path_put(&path);
512out_release:
513 path_put(&nd.path);
514out: 464out:
515 return err; 465 return err;
516} 466}
@@ -569,8 +519,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
569 struct autofs_sb_info *sbi, 519 struct autofs_sb_info *sbi,
570 struct autofs_dev_ioctl *param) 520 struct autofs_dev_ioctl *param)
571{ 521{
572 struct nameidata nd; 522 struct path path;
573 const char *path; 523 const char *name;
574 unsigned int type; 524 unsigned int type;
575 unsigned int devid, magic; 525 unsigned int devid, magic;
576 int err = -ENOENT; 526 int err = -ENOENT;
@@ -580,71 +530,46 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
580 goto out; 530 goto out;
581 } 531 }
582 532
583 path = param->path; 533 name = param->path;
584 type = param->ismountpoint.in.type; 534 type = param->ismountpoint.in.type;
585 535
586 param->ismountpoint.out.devid = devid = 0; 536 param->ismountpoint.out.devid = devid = 0;
587 param->ismountpoint.out.magic = magic = 0; 537 param->ismountpoint.out.magic = magic = 0;
588 538
589 if (!fp || param->ioctlfd == -1) { 539 if (!fp || param->ioctlfd == -1) {
590 if (autofs_type_any(type)) { 540 if (autofs_type_any(type))
591 struct super_block *sb; 541 err = kern_path(name, LOOKUP_FOLLOW, &path);
592 542 else
593 err = path_lookup(path, LOOKUP_FOLLOW, &nd); 543 err = find_autofs_mount(name, &path, test_by_type, &type);
594 if (err) 544 if (err)
595 goto out; 545 goto out;
596 546 devid = new_encode_dev(path.mnt->mnt_sb->s_dev);
597 sb = nd.path.dentry->d_sb;
598 devid = new_encode_dev(sb->s_dev);
599 } else {
600 struct autofs_info *ino;
601
602 err = path_lookup(path, LOOKUP_PARENT, &nd);
603 if (err)
604 goto out;
605
606 err = autofs_dev_ioctl_find_sbi_type(&nd, type);
607 if (err)
608 goto out_release;
609
610 ino = autofs4_dentry_ino(nd.path.dentry);
611 devid = autofs4_get_dev(ino->sbi);
612 }
613
614 err = 0; 547 err = 0;
615 if (nd.path.dentry->d_inode && 548 if (path.dentry->d_inode &&
616 nd.path.mnt->mnt_root == nd.path.dentry) { 549 path.mnt->mnt_root == path.dentry) {
617 err = 1; 550 err = 1;
618 magic = nd.path.dentry->d_inode->i_sb->s_magic; 551 magic = path.dentry->d_inode->i_sb->s_magic;
619 } 552 }
620 } else { 553 } else {
621 dev_t dev = autofs4_get_dev(sbi); 554 dev_t dev = sbi->sb->s_dev;
622 555
623 err = path_lookup(path, LOOKUP_PARENT, &nd); 556 err = find_autofs_mount(name, &path, test_by_dev, &dev);
624 if (err) 557 if (err)
625 goto out; 558 goto out;
626 559
627 err = autofs_dev_ioctl_find_super(&nd, dev); 560 devid = new_encode_dev(dev);
628 if (err)
629 goto out_release;
630
631 devid = dev;
632 561
633 err = have_submounts(nd.path.dentry); 562 err = have_submounts(path.dentry);
634 563
635 if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) { 564 if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) {
636 if (follow_down(&nd.path.mnt, &nd.path.dentry)) { 565 if (follow_down(&path))
637 struct inode *inode = nd.path.dentry->d_inode; 566 magic = path.mnt->mnt_sb->s_magic;
638 magic = inode->i_sb->s_magic;
639 }
640 } 567 }
641 } 568 }
642 569
643 param->ismountpoint.out.devid = devid; 570 param->ismountpoint.out.devid = devid;
644 param->ismountpoint.out.magic = magic; 571 param->ismountpoint.out.magic = magic;
645 572 path_put(&path);
646out_release:
647 path_put(&nd.path);
648out: 573out:
649 return err; 574 return err;
650} 575}
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 3077d8f16523..aa39ae83f019 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -48,19 +48,19 @@ static inline int autofs4_can_expire(struct dentry *dentry,
48static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) 48static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
49{ 49{
50 struct dentry *top = dentry; 50 struct dentry *top = dentry;
51 struct path path = {.mnt = mnt, .dentry = dentry};
51 int status = 1; 52 int status = 1;
52 53
53 DPRINTK("dentry %p %.*s", 54 DPRINTK("dentry %p %.*s",
54 dentry, (int)dentry->d_name.len, dentry->d_name.name); 55 dentry, (int)dentry->d_name.len, dentry->d_name.name);
55 56
56 mntget(mnt); 57 path_get(&path);
57 dget(dentry);
58 58
59 if (!follow_down(&mnt, &dentry)) 59 if (!follow_down(&path))
60 goto done; 60 goto done;
61 61
62 if (is_autofs4_dentry(dentry)) { 62 if (is_autofs4_dentry(path.dentry)) {
63 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 63 struct autofs_sb_info *sbi = autofs4_sbi(path.dentry->d_sb);
64 64
65 /* This is an autofs submount, we can't expire it */ 65 /* This is an autofs submount, we can't expire it */
66 if (autofs_type_indirect(sbi->type)) 66 if (autofs_type_indirect(sbi->type))
@@ -70,7 +70,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
70 * Otherwise it's an offset mount and we need to check 70 * Otherwise it's an offset mount and we need to check
71 * if we can umount its mount, if there is one. 71 * if we can umount its mount, if there is one.
72 */ 72 */
73 if (!d_mountpoint(dentry)) { 73 if (!d_mountpoint(path.dentry)) {
74 status = 0; 74 status = 0;
75 goto done; 75 goto done;
76 } 76 }
@@ -86,8 +86,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
86 status = 0; 86 status = 0;
87done: 87done:
88 DPRINTK("returning = %d", status); 88 DPRINTK("returning = %d", status);
89 dput(dentry); 89 path_put(&path);
90 mntput(mnt);
91 return status; 90 return status;
92} 91}
93 92
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e383bf0334f1..b96a3c57359d 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -181,7 +181,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
181 nd->flags); 181 nd->flags);
182 /* 182 /*
183 * For an expire of a covered direct or offset mount we need 183 * For an expire of a covered direct or offset mount we need
184 * to beeak out of follow_down() at the autofs mount trigger 184 * to break out of follow_down() at the autofs mount trigger
185 * (d_mounted--), so we can see the expiring flag, and manage 185 * (d_mounted--), so we can see the expiring flag, and manage
186 * the blocking and following here until the expire is completed. 186 * the blocking and following here until the expire is completed.
187 */ 187 */
@@ -190,7 +190,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
190 if (ino->flags & AUTOFS_INF_EXPIRING) { 190 if (ino->flags & AUTOFS_INF_EXPIRING) {
191 spin_unlock(&sbi->fs_lock); 191 spin_unlock(&sbi->fs_lock);
192 /* Follow down to our covering mount. */ 192 /* Follow down to our covering mount. */
193 if (!follow_down(&nd->path.mnt, &nd->path.dentry)) 193 if (!follow_down(&nd->path))
194 goto done; 194 goto done;
195 goto follow; 195 goto follow;
196 } 196 }
@@ -230,8 +230,7 @@ follow:
230 * to follow it. 230 * to follow it.
231 */ 231 */
232 if (d_mountpoint(dentry)) { 232 if (d_mountpoint(dentry)) {
233 if (!autofs4_follow_mount(&nd->path.mnt, 233 if (!autofs4_follow_mount(&nd->path)) {
234 &nd->path.dentry)) {
235 status = -ENOENT; 234 status = -ENOENT;
236 goto out_error; 235 goto out_error;
237 } 236 }
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 76afd0d6b86c..9367b6297d84 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -737,6 +737,8 @@ parse_options(char *options, befs_mount_options * opts)
737static void 737static void
738befs_put_super(struct super_block *sb) 738befs_put_super(struct super_block *sb)
739{ 739{
740 lock_kernel();
741
740 kfree(BEFS_SB(sb)->mount_opts.iocharset); 742 kfree(BEFS_SB(sb)->mount_opts.iocharset);
741 BEFS_SB(sb)->mount_opts.iocharset = NULL; 743 BEFS_SB(sb)->mount_opts.iocharset = NULL;
742 744
@@ -747,7 +749,8 @@ befs_put_super(struct super_block *sb)
747 749
748 kfree(sb->s_fs_info); 750 kfree(sb->s_fs_info);
749 sb->s_fs_info = NULL; 751 sb->s_fs_info = NULL;
750 return; 752
753 unlock_kernel();
751} 754}
752 755
753/* Allocate private field of the superblock, fill it. 756/* Allocate private field of the superblock, fill it.
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 4dd1b623f937..54bd07d44e68 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -79,7 +79,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
79const struct file_operations bfs_dir_operations = { 79const struct file_operations bfs_dir_operations = {
80 .read = generic_read_dir, 80 .read = generic_read_dir,
81 .readdir = bfs_readdir, 81 .readdir = bfs_readdir,
82 .fsync = file_fsync, 82 .fsync = simple_fsync,
83 .llseek = generic_file_llseek, 83 .llseek = generic_file_llseek,
84}; 84};
85 85
@@ -205,7 +205,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
205 inode->i_nlink = 1; 205 inode->i_nlink = 1;
206 } 206 }
207 de->ino = 0; 207 de->ino = 0;
208 mark_buffer_dirty(bh); 208 mark_buffer_dirty_inode(bh, dir);
209 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; 209 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
210 mark_inode_dirty(dir); 210 mark_inode_dirty(dir);
211 inode->i_ctime = dir->i_ctime; 211 inode->i_ctime = dir->i_ctime;
@@ -267,7 +267,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
267 new_inode->i_ctime = CURRENT_TIME_SEC; 267 new_inode->i_ctime = CURRENT_TIME_SEC;
268 inode_dec_link_count(new_inode); 268 inode_dec_link_count(new_inode);
269 } 269 }
270 mark_buffer_dirty(old_bh); 270 mark_buffer_dirty_inode(old_bh, old_dir);
271 error = 0; 271 error = 0;
272 272
273end_rename: 273end_rename:
@@ -320,7 +320,7 @@ static int bfs_add_entry(struct inode *dir, const unsigned char *name,
320 for (i = 0; i < BFS_NAMELEN; i++) 320 for (i = 0; i < BFS_NAMELEN; i++)
321 de->name[i] = 321 de->name[i] =
322 (i < namelen) ? name[i] : 0; 322 (i < namelen) ? name[i] : 0;
323 mark_buffer_dirty(bh); 323 mark_buffer_dirty_inode(bh, dir);
324 brelse(bh); 324 brelse(bh);
325 return 0; 325 return 0;
326 } 326 }
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index cc4062d12ca2..6f60336c6628 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -30,6 +30,7 @@ MODULE_LICENSE("GPL");
30#define dprintf(x...) 30#define dprintf(x...)
31#endif 31#endif
32 32
33static void bfs_write_super(struct super_block *s);
33void dump_imap(const char *prefix, struct super_block *s); 34void dump_imap(const char *prefix, struct super_block *s);
34 35
35struct inode *bfs_iget(struct super_block *sb, unsigned long ino) 36struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
@@ -97,14 +98,15 @@ error:
97 return ERR_PTR(-EIO); 98 return ERR_PTR(-EIO);
98} 99}
99 100
100static int bfs_write_inode(struct inode *inode, int unused) 101static int bfs_write_inode(struct inode *inode, int wait)
101{ 102{
103 struct bfs_sb_info *info = BFS_SB(inode->i_sb);
102 unsigned int ino = (u16)inode->i_ino; 104 unsigned int ino = (u16)inode->i_ino;
103 unsigned long i_sblock; 105 unsigned long i_sblock;
104 struct bfs_inode *di; 106 struct bfs_inode *di;
105 struct buffer_head *bh; 107 struct buffer_head *bh;
106 int block, off; 108 int block, off;
107 struct bfs_sb_info *info = BFS_SB(inode->i_sb); 109 int err = 0;
108 110
109 dprintf("ino=%08x\n", ino); 111 dprintf("ino=%08x\n", ino);
110 112
@@ -145,9 +147,14 @@ static int bfs_write_inode(struct inode *inode, int unused)
145 di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); 147 di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
146 148
147 mark_buffer_dirty(bh); 149 mark_buffer_dirty(bh);
150 if (wait) {
151 sync_dirty_buffer(bh);
152 if (buffer_req(bh) && !buffer_uptodate(bh))
153 err = -EIO;
154 }
148 brelse(bh); 155 brelse(bh);
149 mutex_unlock(&info->bfs_lock); 156 mutex_unlock(&info->bfs_lock);
150 return 0; 157 return err;
151} 158}
152 159
153static void bfs_delete_inode(struct inode *inode) 160static void bfs_delete_inode(struct inode *inode)
@@ -209,6 +216,26 @@ static void bfs_delete_inode(struct inode *inode)
209 clear_inode(inode); 216 clear_inode(inode);
210} 217}
211 218
219static int bfs_sync_fs(struct super_block *sb, int wait)
220{
221 struct bfs_sb_info *info = BFS_SB(sb);
222
223 mutex_lock(&info->bfs_lock);
224 mark_buffer_dirty(info->si_sbh);
225 sb->s_dirt = 0;
226 mutex_unlock(&info->bfs_lock);
227
228 return 0;
229}
230
231static void bfs_write_super(struct super_block *sb)
232{
233 if (!(sb->s_flags & MS_RDONLY))
234 bfs_sync_fs(sb, 1);
235 else
236 sb->s_dirt = 0;
237}
238
212static void bfs_put_super(struct super_block *s) 239static void bfs_put_super(struct super_block *s)
213{ 240{
214 struct bfs_sb_info *info = BFS_SB(s); 241 struct bfs_sb_info *info = BFS_SB(s);
@@ -216,11 +243,18 @@ static void bfs_put_super(struct super_block *s)
216 if (!info) 243 if (!info)
217 return; 244 return;
218 245
246 lock_kernel();
247
248 if (s->s_dirt)
249 bfs_write_super(s);
250
219 brelse(info->si_sbh); 251 brelse(info->si_sbh);
220 mutex_destroy(&info->bfs_lock); 252 mutex_destroy(&info->bfs_lock);
221 kfree(info->si_imap); 253 kfree(info->si_imap);
222 kfree(info); 254 kfree(info);
223 s->s_fs_info = NULL; 255 s->s_fs_info = NULL;
256
257 unlock_kernel();
224} 258}
225 259
226static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) 260static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -240,17 +274,6 @@ static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
240 return 0; 274 return 0;
241} 275}
242 276
243static void bfs_write_super(struct super_block *s)
244{
245 struct bfs_sb_info *info = BFS_SB(s);
246
247 mutex_lock(&info->bfs_lock);
248 if (!(s->s_flags & MS_RDONLY))
249 mark_buffer_dirty(info->si_sbh);
250 s->s_dirt = 0;
251 mutex_unlock(&info->bfs_lock);
252}
253
254static struct kmem_cache *bfs_inode_cachep; 277static struct kmem_cache *bfs_inode_cachep;
255 278
256static struct inode *bfs_alloc_inode(struct super_block *sb) 279static struct inode *bfs_alloc_inode(struct super_block *sb)
@@ -298,6 +321,7 @@ static const struct super_operations bfs_sops = {
298 .delete_inode = bfs_delete_inode, 321 .delete_inode = bfs_delete_inode,
299 .put_super = bfs_put_super, 322 .put_super = bfs_put_super,
300 .write_super = bfs_write_super, 323 .write_super = bfs_write_super,
324 .sync_fs = bfs_sync_fs,
301 .statfs = bfs_statfs, 325 .statfs = bfs_statfs,
302}; 326};
303 327
diff --git a/fs/bio.c b/fs/bio.c
index 98711647ece4..59000215e59b 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -26,10 +26,9 @@
26#include <linux/mempool.h> 26#include <linux/mempool.h>
27#include <linux/workqueue.h> 27#include <linux/workqueue.h>
28#include <linux/blktrace_api.h> 28#include <linux/blktrace_api.h>
29#include <trace/block.h>
30#include <scsi/sg.h> /* for struct sg_iovec */ 29#include <scsi/sg.h> /* for struct sg_iovec */
31 30
32DEFINE_TRACE(block_split); 31#include <trace/events/block.h>
33 32
34/* 33/*
35 * Test patch to inline a certain number of bi_io_vec's inside the bio 34 * Test patch to inline a certain number of bi_io_vec's inside the bio
@@ -499,11 +498,11 @@ int bio_get_nr_vecs(struct block_device *bdev)
499 struct request_queue *q = bdev_get_queue(bdev); 498 struct request_queue *q = bdev_get_queue(bdev);
500 int nr_pages; 499 int nr_pages;
501 500
502 nr_pages = ((q->max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; 501 nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
503 if (nr_pages > q->max_phys_segments) 502 if (nr_pages > queue_max_phys_segments(q))
504 nr_pages = q->max_phys_segments; 503 nr_pages = queue_max_phys_segments(q);
505 if (nr_pages > q->max_hw_segments) 504 if (nr_pages > queue_max_hw_segments(q))
506 nr_pages = q->max_hw_segments; 505 nr_pages = queue_max_hw_segments(q);
507 506
508 return nr_pages; 507 return nr_pages;
509} 508}
@@ -562,8 +561,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
562 * make this too complex. 561 * make this too complex.
563 */ 562 */
564 563
565 while (bio->bi_phys_segments >= q->max_phys_segments 564 while (bio->bi_phys_segments >= queue_max_phys_segments(q)
566 || bio->bi_phys_segments >= q->max_hw_segments) { 565 || bio->bi_phys_segments >= queue_max_hw_segments(q)) {
567 566
568 if (retried_segments) 567 if (retried_segments)
569 return 0; 568 return 0;
@@ -634,7 +633,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
634int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, 633int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
635 unsigned int len, unsigned int offset) 634 unsigned int len, unsigned int offset)
636{ 635{
637 return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors); 636 return __bio_add_page(q, bio, page, len, offset,
637 queue_max_hw_sectors(q));
638} 638}
639 639
640/** 640/**
@@ -654,7 +654,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
654 unsigned int offset) 654 unsigned int offset)
655{ 655{
656 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 656 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
657 return __bio_add_page(q, bio, page, len, offset, q->max_sectors); 657 return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
658} 658}
659 659
660struct bio_map_data { 660struct bio_map_data {
@@ -721,7 +721,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
721 721
722 while (bv_len && iov_idx < iov_count) { 722 while (bv_len && iov_idx < iov_count) {
723 unsigned int bytes; 723 unsigned int bytes;
724 char *iov_addr; 724 char __user *iov_addr;
725 725
726 bytes = min_t(unsigned int, 726 bytes = min_t(unsigned int,
727 iov[iov_idx].iov_len - iov_off, bv_len); 727 iov[iov_idx].iov_len - iov_off, bv_len);
@@ -1201,7 +1201,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
1201 char *addr = page_address(bvec->bv_page); 1201 char *addr = page_address(bvec->bv_page);
1202 int len = bmd->iovecs[i].bv_len; 1202 int len = bmd->iovecs[i].bv_len;
1203 1203
1204 if (read && !err) 1204 if (read)
1205 memcpy(p, addr, len); 1205 memcpy(p, addr, len);
1206 1206
1207 __free_page(bvec->bv_page); 1207 __free_page(bvec->bv_page);
@@ -1490,11 +1490,12 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
1490sector_t bio_sector_offset(struct bio *bio, unsigned short index, 1490sector_t bio_sector_offset(struct bio *bio, unsigned short index,
1491 unsigned int offset) 1491 unsigned int offset)
1492{ 1492{
1493 unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue); 1493 unsigned int sector_sz;
1494 struct bio_vec *bv; 1494 struct bio_vec *bv;
1495 sector_t sectors; 1495 sector_t sectors;
1496 int i; 1496 int i;
1497 1497
1498 sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue);
1498 sectors = 0; 1499 sectors = 0;
1499 1500
1500 if (index >= bio->bi_idx) 1501 if (index >= bio->bi_idx)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f45dbc18dd17..3a6d4fb2a329 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -25,6 +25,7 @@
25#include <linux/uio.h> 25#include <linux/uio.h>
26#include <linux/namei.h> 26#include <linux/namei.h>
27#include <linux/log2.h> 27#include <linux/log2.h>
28#include <linux/kmemleak.h>
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
29#include "internal.h" 30#include "internal.h"
30 31
@@ -76,7 +77,7 @@ int set_blocksize(struct block_device *bdev, int size)
76 return -EINVAL; 77 return -EINVAL;
77 78
78 /* Size cannot be smaller than the size supported by the device */ 79 /* Size cannot be smaller than the size supported by the device */
79 if (size < bdev_hardsect_size(bdev)) 80 if (size < bdev_logical_block_size(bdev))
80 return -EINVAL; 81 return -EINVAL;
81 82
82 /* Don't change the size if it is same as current */ 83 /* Don't change the size if it is same as current */
@@ -106,7 +107,7 @@ EXPORT_SYMBOL(sb_set_blocksize);
106 107
107int sb_min_blocksize(struct super_block *sb, int size) 108int sb_min_blocksize(struct super_block *sb, int size)
108{ 109{
109 int minsize = bdev_hardsect_size(sb->s_bdev); 110 int minsize = bdev_logical_block_size(sb->s_bdev);
110 if (size < minsize) 111 if (size < minsize)
111 size = minsize; 112 size = minsize;
112 return sb_set_blocksize(sb, size); 113 return sb_set_blocksize(sb, size);
@@ -175,17 +176,22 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
175 iov, offset, nr_segs, blkdev_get_blocks, NULL); 176 iov, offset, nr_segs, blkdev_get_blocks, NULL);
176} 177}
177 178
179int __sync_blockdev(struct block_device *bdev, int wait)
180{
181 if (!bdev)
182 return 0;
183 if (!wait)
184 return filemap_flush(bdev->bd_inode->i_mapping);
185 return filemap_write_and_wait(bdev->bd_inode->i_mapping);
186}
187
178/* 188/*
179 * Write out and wait upon all the dirty data associated with a block 189 * Write out and wait upon all the dirty data associated with a block
180 * device via its mapping. Does not take the superblock lock. 190 * device via its mapping. Does not take the superblock lock.
181 */ 191 */
182int sync_blockdev(struct block_device *bdev) 192int sync_blockdev(struct block_device *bdev)
183{ 193{
184 int ret = 0; 194 return __sync_blockdev(bdev, 1);
185
186 if (bdev)
187 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
188 return ret;
189} 195}
190EXPORT_SYMBOL(sync_blockdev); 196EXPORT_SYMBOL(sync_blockdev);
191 197
@@ -198,7 +204,7 @@ int fsync_bdev(struct block_device *bdev)
198{ 204{
199 struct super_block *sb = get_super(bdev); 205 struct super_block *sb = get_super(bdev);
200 if (sb) { 206 if (sb) {
201 int res = fsync_super(sb); 207 int res = sync_filesystem(sb);
202 drop_super(sb); 208 drop_super(sb);
203 return res; 209 return res;
204 } 210 }
@@ -240,7 +246,7 @@ struct super_block *freeze_bdev(struct block_device *bdev)
240 sb->s_frozen = SB_FREEZE_WRITE; 246 sb->s_frozen = SB_FREEZE_WRITE;
241 smp_wmb(); 247 smp_wmb();
242 248
243 __fsync_super(sb); 249 sync_filesystem(sb);
244 250
245 sb->s_frozen = SB_FREEZE_TRANS; 251 sb->s_frozen = SB_FREEZE_TRANS;
246 smp_wmb(); 252 smp_wmb();
@@ -492,6 +498,11 @@ void __init bdev_cache_init(void)
492 bd_mnt = kern_mount(&bd_type); 498 bd_mnt = kern_mount(&bd_type);
493 if (IS_ERR(bd_mnt)) 499 if (IS_ERR(bd_mnt))
494 panic("Cannot create bdev pseudo-fs"); 500 panic("Cannot create bdev pseudo-fs");
501 /*
502 * This vfsmount structure is only used to obtain the
503 * blockdev_superblock, so tell kmemleak not to report it.
504 */
505 kmemleak_not_leak(bd_mnt);
495 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ 506 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
496} 507}
497 508
@@ -1111,7 +1122,7 @@ EXPORT_SYMBOL(check_disk_change);
1111 1122
1112void bd_set_size(struct block_device *bdev, loff_t size) 1123void bd_set_size(struct block_device *bdev, loff_t size)
1113{ 1124{
1114 unsigned bsize = bdev_hardsect_size(bdev); 1125 unsigned bsize = bdev_logical_block_size(bdev);
1115 1126
1116 bdev->bd_inode->i_size = size; 1127 bdev->bd_inode->i_size = size;
1117 while (bsize < PAGE_CACHE_SIZE) { 1128 while (bsize < PAGE_CACHE_SIZE) {
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 94212844a9bc..a35eb36b32fd 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
6 transaction.o inode.o file.o tree-defrag.o \ 6 transaction.o inode.o file.o tree-defrag.o \
7 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ 7 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ 9 export.o tree-log.o acl.o free-space-cache.o zlib.o \
10 compression.o delayed-ref.o 10 compression.o delayed-ref.o relocation.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index cbba000dccbe..603972576f0f 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -351,9 +351,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
351 return 0; 351 return 0;
352} 352}
353 353
354int btrfs_check_acl(struct inode *inode, int mask)
355{
356 return 0;
357}
358
359#endif /* CONFIG_FS_POSIX_ACL */ 354#endif /* CONFIG_FS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 502c3d61de62..7f88628a1a72 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -294,10 +294,10 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
294 INIT_LIST_HEAD(&worker->worker_list); 294 INIT_LIST_HEAD(&worker->worker_list);
295 spin_lock_init(&worker->lock); 295 spin_lock_init(&worker->lock);
296 atomic_set(&worker->num_pending, 0); 296 atomic_set(&worker->num_pending, 0);
297 worker->workers = workers;
297 worker->task = kthread_run(worker_loop, worker, 298 worker->task = kthread_run(worker_loop, worker,
298 "btrfs-%s-%d", workers->name, 299 "btrfs-%s-%d", workers->name,
299 workers->num_workers + i); 300 workers->num_workers + i);
300 worker->workers = workers;
301 if (IS_ERR(worker->task)) { 301 if (IS_ERR(worker->task)) {
302 kfree(worker); 302 kfree(worker);
303 ret = PTR_ERR(worker->task); 303 ret = PTR_ERR(worker->task);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index b30986f00b9d..acb4f3517582 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -72,6 +72,9 @@ struct btrfs_inode {
72 */ 72 */
73 struct list_head ordered_operations; 73 struct list_head ordered_operations;
74 74
75 /* node for the red-black tree that links inodes in subvolume root */
76 struct rb_node rb_node;
77
75 /* the space_info for where this inode's data allocations are done */ 78 /* the space_info for where this inode's data allocations are done */
76 struct btrfs_space_info *space_info; 79 struct btrfs_space_info *space_info;
77 80
@@ -154,5 +157,4 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size)
154 BTRFS_I(inode)->disk_i_size = size; 157 BTRFS_I(inode)->disk_i_size = size;
155} 158}
156 159
157
158#endif 160#endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ab07627084f1..de1e2fd32080 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -123,7 +123,7 @@ static int check_compressed_csum(struct inode *inode,
123 u32 csum; 123 u32 csum;
124 u32 *cb_sum = &cb->sums; 124 u32 *cb_sum = &cb->sums;
125 125
126 if (btrfs_test_flag(inode, NODATASUM)) 126 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
127 return 0; 127 return 0;
128 128
129 for (i = 0; i < cb->nr_pages; i++) { 129 for (i = 0; i < cb->nr_pages; i++) {
@@ -670,7 +670,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
670 */ 670 */
671 atomic_inc(&cb->pending_bios); 671 atomic_inc(&cb->pending_bios);
672 672
673 if (!btrfs_test_flag(inode, NODATASUM)) { 673 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
674 btrfs_lookup_bio_sums(root, inode, comp_bio, 674 btrfs_lookup_bio_sums(root, inode, comp_bio,
675 sums); 675 sums);
676 } 676 }
@@ -697,7 +697,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
697 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); 697 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
698 BUG_ON(ret); 698 BUG_ON(ret);
699 699
700 if (!btrfs_test_flag(inode, NODATASUM)) 700 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
701 btrfs_lookup_bio_sums(root, inode, comp_bio, sums); 701 btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
702 702
703 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); 703 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
deleted file mode 100644
index 6e1b3de36700..000000000000
--- a/fs/btrfs/crc32c.h
+++ /dev/null
@@ -1,29 +0,0 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_CRC32C__
20#define __BTRFS_CRC32C__
21#include <linux/crc32c.h>
22
23/*
24 * this file used to do more for selecting the HW version of crc32c,
25 * perhaps it will one day again soon.
26 */
27#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length)
28#endif
29
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index fedf8b9f03a2..60a45f3a4e91 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -197,14 +197,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
197 u32 nritems; 197 u32 nritems;
198 int ret = 0; 198 int ret = 0;
199 int level; 199 int level;
200 struct btrfs_root *new_root; 200 struct btrfs_disk_key disk_key;
201
202 new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
203 if (!new_root)
204 return -ENOMEM;
205
206 memcpy(new_root, root, sizeof(*new_root));
207 new_root->root_key.objectid = new_root_objectid;
208 201
209 WARN_ON(root->ref_cows && trans->transid != 202 WARN_ON(root->ref_cows && trans->transid !=
210 root->fs_info->running_transaction->transid); 203 root->fs_info->running_transaction->transid);
@@ -212,28 +205,37 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
212 205
213 level = btrfs_header_level(buf); 206 level = btrfs_header_level(buf);
214 nritems = btrfs_header_nritems(buf); 207 nritems = btrfs_header_nritems(buf);
208 if (level == 0)
209 btrfs_item_key(buf, &disk_key, 0);
210 else
211 btrfs_node_key(buf, &disk_key, 0);
215 212
216 cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0, 213 cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
217 new_root_objectid, trans->transid, 214 new_root_objectid, &disk_key, level,
218 level, buf->start, 0); 215 buf->start, 0);
219 if (IS_ERR(cow)) { 216 if (IS_ERR(cow))
220 kfree(new_root);
221 return PTR_ERR(cow); 217 return PTR_ERR(cow);
222 }
223 218
224 copy_extent_buffer(cow, buf, 0, 0, cow->len); 219 copy_extent_buffer(cow, buf, 0, 0, cow->len);
225 btrfs_set_header_bytenr(cow, cow->start); 220 btrfs_set_header_bytenr(cow, cow->start);
226 btrfs_set_header_generation(cow, trans->transid); 221 btrfs_set_header_generation(cow, trans->transid);
227 btrfs_set_header_owner(cow, new_root_objectid); 222 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
228 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); 223 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
224 BTRFS_HEADER_FLAG_RELOC);
225 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
226 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
227 else
228 btrfs_set_header_owner(cow, new_root_objectid);
229 229
230 write_extent_buffer(cow, root->fs_info->fsid, 230 write_extent_buffer(cow, root->fs_info->fsid,
231 (unsigned long)btrfs_header_fsid(cow), 231 (unsigned long)btrfs_header_fsid(cow),
232 BTRFS_FSID_SIZE); 232 BTRFS_FSID_SIZE);
233 233
234 WARN_ON(btrfs_header_generation(buf) > trans->transid); 234 WARN_ON(btrfs_header_generation(buf) > trans->transid);
235 ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL); 235 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
236 kfree(new_root); 236 ret = btrfs_inc_ref(trans, root, cow, 1);
237 else
238 ret = btrfs_inc_ref(trans, root, cow, 0);
237 239
238 if (ret) 240 if (ret)
239 return ret; 241 return ret;
@@ -244,6 +246,125 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
244} 246}
245 247
246/* 248/*
249 * check if the tree block can be shared by multiple trees
250 */
251int btrfs_block_can_be_shared(struct btrfs_root *root,
252 struct extent_buffer *buf)
253{
254 /*
255 * Tree blocks not in refernece counted trees and tree roots
256 * are never shared. If a block was allocated after the last
257 * snapshot and the block was not allocated by tree relocation,
258 * we know the block is not shared.
259 */
260 if (root->ref_cows &&
261 buf != root->node && buf != root->commit_root &&
262 (btrfs_header_generation(buf) <=
263 btrfs_root_last_snapshot(&root->root_item) ||
264 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
265 return 1;
266#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
267 if (root->ref_cows &&
268 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
269 return 1;
270#endif
271 return 0;
272}
273
274static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
275 struct btrfs_root *root,
276 struct extent_buffer *buf,
277 struct extent_buffer *cow)
278{
279 u64 refs;
280 u64 owner;
281 u64 flags;
282 u64 new_flags = 0;
283 int ret;
284
285 /*
286 * Backrefs update rules:
287 *
288 * Always use full backrefs for extent pointers in tree block
289 * allocated by tree relocation.
290 *
291 * If a shared tree block is no longer referenced by its owner
292 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
293 * use full backrefs for extent pointers in tree block.
294 *
295 * If a tree block is been relocating
296 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
297 * use full backrefs for extent pointers in tree block.
298 * The reason for this is some operations (such as drop tree)
299 * are only allowed for blocks use full backrefs.
300 */
301
302 if (btrfs_block_can_be_shared(root, buf)) {
303 ret = btrfs_lookup_extent_info(trans, root, buf->start,
304 buf->len, &refs, &flags);
305 BUG_ON(ret);
306 BUG_ON(refs == 0);
307 } else {
308 refs = 1;
309 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
310 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
311 flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
312 else
313 flags = 0;
314 }
315
316 owner = btrfs_header_owner(buf);
317 BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
318 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
319
320 if (refs > 1) {
321 if ((owner == root->root_key.objectid ||
322 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
323 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
324 ret = btrfs_inc_ref(trans, root, buf, 1);
325 BUG_ON(ret);
326
327 if (root->root_key.objectid ==
328 BTRFS_TREE_RELOC_OBJECTID) {
329 ret = btrfs_dec_ref(trans, root, buf, 0);
330 BUG_ON(ret);
331 ret = btrfs_inc_ref(trans, root, cow, 1);
332 BUG_ON(ret);
333 }
334 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
335 } else {
336
337 if (root->root_key.objectid ==
338 BTRFS_TREE_RELOC_OBJECTID)
339 ret = btrfs_inc_ref(trans, root, cow, 1);
340 else
341 ret = btrfs_inc_ref(trans, root, cow, 0);
342 BUG_ON(ret);
343 }
344 if (new_flags != 0) {
345 ret = btrfs_set_disk_extent_flags(trans, root,
346 buf->start,
347 buf->len,
348 new_flags, 0);
349 BUG_ON(ret);
350 }
351 } else {
352 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
353 if (root->root_key.objectid ==
354 BTRFS_TREE_RELOC_OBJECTID)
355 ret = btrfs_inc_ref(trans, root, cow, 1);
356 else
357 ret = btrfs_inc_ref(trans, root, cow, 0);
358 BUG_ON(ret);
359 ret = btrfs_dec_ref(trans, root, buf, 1);
360 BUG_ON(ret);
361 }
362 clean_tree_block(trans, root, buf);
363 }
364 return 0;
365}
366
367/*
247 * does the dirty work in cow of a single block. The parent block (if 368 * does the dirty work in cow of a single block. The parent block (if
248 * supplied) is updated to point to the new cow copy. The new buffer is marked 369 * supplied) is updated to point to the new cow copy. The new buffer is marked
249 * dirty and returned locked. If you modify the block it needs to be marked 370 * dirty and returned locked. If you modify the block it needs to be marked
@@ -262,34 +383,39 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
262 struct extent_buffer **cow_ret, 383 struct extent_buffer **cow_ret,
263 u64 search_start, u64 empty_size) 384 u64 search_start, u64 empty_size)
264{ 385{
265 u64 parent_start; 386 struct btrfs_disk_key disk_key;
266 struct extent_buffer *cow; 387 struct extent_buffer *cow;
267 u32 nritems;
268 int ret = 0;
269 int level; 388 int level;
270 int unlock_orig = 0; 389 int unlock_orig = 0;
390 u64 parent_start;
271 391
272 if (*cow_ret == buf) 392 if (*cow_ret == buf)
273 unlock_orig = 1; 393 unlock_orig = 1;
274 394
275 btrfs_assert_tree_locked(buf); 395 btrfs_assert_tree_locked(buf);
276 396
277 if (parent)
278 parent_start = parent->start;
279 else
280 parent_start = 0;
281
282 WARN_ON(root->ref_cows && trans->transid != 397 WARN_ON(root->ref_cows && trans->transid !=
283 root->fs_info->running_transaction->transid); 398 root->fs_info->running_transaction->transid);
284 WARN_ON(root->ref_cows && trans->transid != root->last_trans); 399 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
285 400
286 level = btrfs_header_level(buf); 401 level = btrfs_header_level(buf);
287 nritems = btrfs_header_nritems(buf);
288 402
289 cow = btrfs_alloc_free_block(trans, root, buf->len, 403 if (level == 0)
290 parent_start, root->root_key.objectid, 404 btrfs_item_key(buf, &disk_key, 0);
291 trans->transid, level, 405 else
292 search_start, empty_size); 406 btrfs_node_key(buf, &disk_key, 0);
407
408 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
409 if (parent)
410 parent_start = parent->start;
411 else
412 parent_start = 0;
413 } else
414 parent_start = 0;
415
416 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
417 root->root_key.objectid, &disk_key,
418 level, search_start, empty_size);
293 if (IS_ERR(cow)) 419 if (IS_ERR(cow))
294 return PTR_ERR(cow); 420 return PTR_ERR(cow);
295 421
@@ -298,83 +424,53 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
298 copy_extent_buffer(cow, buf, 0, 0, cow->len); 424 copy_extent_buffer(cow, buf, 0, 0, cow->len);
299 btrfs_set_header_bytenr(cow, cow->start); 425 btrfs_set_header_bytenr(cow, cow->start);
300 btrfs_set_header_generation(cow, trans->transid); 426 btrfs_set_header_generation(cow, trans->transid);
301 btrfs_set_header_owner(cow, root->root_key.objectid); 427 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
302 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); 428 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
429 BTRFS_HEADER_FLAG_RELOC);
430 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
431 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
432 else
433 btrfs_set_header_owner(cow, root->root_key.objectid);
303 434
304 write_extent_buffer(cow, root->fs_info->fsid, 435 write_extent_buffer(cow, root->fs_info->fsid,
305 (unsigned long)btrfs_header_fsid(cow), 436 (unsigned long)btrfs_header_fsid(cow),
306 BTRFS_FSID_SIZE); 437 BTRFS_FSID_SIZE);
307 438
308 WARN_ON(btrfs_header_generation(buf) > trans->transid); 439 update_ref_for_cow(trans, root, buf, cow);
309 if (btrfs_header_generation(buf) != trans->transid) {
310 u32 nr_extents;
311 ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
312 if (ret)
313 return ret;
314
315 ret = btrfs_cache_ref(trans, root, buf, nr_extents);
316 WARN_ON(ret);
317 } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
318 /*
319 * There are only two places that can drop reference to
320 * tree blocks owned by living reloc trees, one is here,
321 * the other place is btrfs_drop_subtree. In both places,
322 * we check reference count while tree block is locked.
323 * Furthermore, if reference count is one, it won't get
324 * increased by someone else.
325 */
326 u32 refs;
327 ret = btrfs_lookup_extent_ref(trans, root, buf->start,
328 buf->len, &refs);
329 BUG_ON(ret);
330 if (refs == 1) {
331 ret = btrfs_update_ref(trans, root, buf, cow,
332 0, nritems);
333 clean_tree_block(trans, root, buf);
334 } else {
335 ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
336 }
337 BUG_ON(ret);
338 } else {
339 ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
340 if (ret)
341 return ret;
342 clean_tree_block(trans, root, buf);
343 }
344
345 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
346 ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
347 WARN_ON(ret);
348 }
349 440
350 if (buf == root->node) { 441 if (buf == root->node) {
351 WARN_ON(parent && parent != buf); 442 WARN_ON(parent && parent != buf);
443 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
444 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
445 parent_start = buf->start;
446 else
447 parent_start = 0;
352 448
353 spin_lock(&root->node_lock); 449 spin_lock(&root->node_lock);
354 root->node = cow; 450 root->node = cow;
355 extent_buffer_get(cow); 451 extent_buffer_get(cow);
356 spin_unlock(&root->node_lock); 452 spin_unlock(&root->node_lock);
357 453
358 if (buf != root->commit_root) { 454 btrfs_free_extent(trans, root, buf->start, buf->len,
359 btrfs_free_extent(trans, root, buf->start, 455 parent_start, root->root_key.objectid,
360 buf->len, buf->start, 456 level, 0);
361 root->root_key.objectid,
362 btrfs_header_generation(buf),
363 level, 1);
364 }
365 free_extent_buffer(buf); 457 free_extent_buffer(buf);
366 add_root_to_dirty_list(root); 458 add_root_to_dirty_list(root);
367 } else { 459 } else {
460 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
461 parent_start = parent->start;
462 else
463 parent_start = 0;
464
465 WARN_ON(trans->transid != btrfs_header_generation(parent));
368 btrfs_set_node_blockptr(parent, parent_slot, 466 btrfs_set_node_blockptr(parent, parent_slot,
369 cow->start); 467 cow->start);
370 WARN_ON(trans->transid == 0);
371 btrfs_set_node_ptr_generation(parent, parent_slot, 468 btrfs_set_node_ptr_generation(parent, parent_slot,
372 trans->transid); 469 trans->transid);
373 btrfs_mark_buffer_dirty(parent); 470 btrfs_mark_buffer_dirty(parent);
374 WARN_ON(btrfs_header_generation(parent) != trans->transid);
375 btrfs_free_extent(trans, root, buf->start, buf->len, 471 btrfs_free_extent(trans, root, buf->start, buf->len,
376 parent_start, btrfs_header_owner(parent), 472 parent_start, root->root_key.objectid,
377 btrfs_header_generation(parent), level, 1); 473 level, 0);
378 } 474 }
379 if (unlock_orig) 475 if (unlock_orig)
380 btrfs_tree_unlock(buf); 476 btrfs_tree_unlock(buf);
@@ -384,6 +480,18 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
384 return 0; 480 return 0;
385} 481}
386 482
483static inline int should_cow_block(struct btrfs_trans_handle *trans,
484 struct btrfs_root *root,
485 struct extent_buffer *buf)
486{
487 if (btrfs_header_generation(buf) == trans->transid &&
488 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
489 !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
490 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
491 return 0;
492 return 1;
493}
494
387/* 495/*
388 * cows a single block, see __btrfs_cow_block for the real work. 496 * cows a single block, see __btrfs_cow_block for the real work.
389 * This version of it has extra checks so that a block isn't cow'd more than 497 * This version of it has extra checks so that a block isn't cow'd more than
@@ -411,9 +519,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
411 WARN_ON(1); 519 WARN_ON(1);
412 } 520 }
413 521
414 if (btrfs_header_generation(buf) == trans->transid && 522 if (!should_cow_block(trans, root, buf)) {
415 btrfs_header_owner(buf) == root->root_key.objectid &&
416 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
417 *cow_ret = buf; 523 *cow_ret = buf;
418 return 0; 524 return 0;
419 } 525 }
@@ -469,7 +575,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
469/* 575/*
470 * same as comp_keys only with two btrfs_key's 576 * same as comp_keys only with two btrfs_key's
471 */ 577 */
472static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) 578int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
473{ 579{
474 if (k1->objectid > k2->objectid) 580 if (k1->objectid > k2->objectid)
475 return 1; 581 return 1;
@@ -845,6 +951,12 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
845 return -1; 951 return -1;
846} 952}
847 953
954int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
955 int level, int *slot)
956{
957 return bin_search(eb, key, level, slot);
958}
959
848/* given a node and slot number, this reads the blocks it points to. The 960/* given a node and slot number, this reads the blocks it points to. The
849 * extent buffer is returned with a reference taken (but unlocked). 961 * extent buffer is returned with a reference taken (but unlocked).
850 * NULL is returned on error. 962 * NULL is returned on error.
@@ -921,13 +1033,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
921 root->node = child; 1033 root->node = child;
922 spin_unlock(&root->node_lock); 1034 spin_unlock(&root->node_lock);
923 1035
924 ret = btrfs_update_extent_ref(trans, root, child->start,
925 child->len,
926 mid->start, child->start,
927 root->root_key.objectid,
928 trans->transid, level - 1);
929 BUG_ON(ret);
930
931 add_root_to_dirty_list(root); 1036 add_root_to_dirty_list(root);
932 btrfs_tree_unlock(child); 1037 btrfs_tree_unlock(child);
933 1038
@@ -938,9 +1043,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
938 /* once for the path */ 1043 /* once for the path */
939 free_extent_buffer(mid); 1044 free_extent_buffer(mid);
940 ret = btrfs_free_extent(trans, root, mid->start, mid->len, 1045 ret = btrfs_free_extent(trans, root, mid->start, mid->len,
941 mid->start, root->root_key.objectid, 1046 0, root->root_key.objectid, level, 1);
942 btrfs_header_generation(mid),
943 level, 1);
944 /* once for the root ptr */ 1047 /* once for the root ptr */
945 free_extent_buffer(mid); 1048 free_extent_buffer(mid);
946 return ret; 1049 return ret;
@@ -949,8 +1052,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
949 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 1052 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
950 return 0; 1053 return 0;
951 1054
952 if (trans->transaction->delayed_refs.flushing && 1055 if (btrfs_header_nritems(mid) > 2)
953 btrfs_header_nritems(mid) > 2)
954 return 0; 1056 return 0;
955 1057
956 if (btrfs_header_nritems(mid) < 2) 1058 if (btrfs_header_nritems(mid) < 2)
@@ -998,7 +1100,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
998 ret = wret; 1100 ret = wret;
999 if (btrfs_header_nritems(right) == 0) { 1101 if (btrfs_header_nritems(right) == 0) {
1000 u64 bytenr = right->start; 1102 u64 bytenr = right->start;
1001 u64 generation = btrfs_header_generation(parent);
1002 u32 blocksize = right->len; 1103 u32 blocksize = right->len;
1003 1104
1004 clean_tree_block(trans, root, right); 1105 clean_tree_block(trans, root, right);
@@ -1010,9 +1111,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1010 if (wret) 1111 if (wret)
1011 ret = wret; 1112 ret = wret;
1012 wret = btrfs_free_extent(trans, root, bytenr, 1113 wret = btrfs_free_extent(trans, root, bytenr,
1013 blocksize, parent->start, 1114 blocksize, 0,
1014 btrfs_header_owner(parent), 1115 root->root_key.objectid,
1015 generation, level, 1); 1116 level, 0);
1016 if (wret) 1117 if (wret)
1017 ret = wret; 1118 ret = wret;
1018 } else { 1119 } else {
@@ -1047,7 +1148,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1047 } 1148 }
1048 if (btrfs_header_nritems(mid) == 0) { 1149 if (btrfs_header_nritems(mid) == 0) {
1049 /* we've managed to empty the middle node, drop it */ 1150 /* we've managed to empty the middle node, drop it */
1050 u64 root_gen = btrfs_header_generation(parent);
1051 u64 bytenr = mid->start; 1151 u64 bytenr = mid->start;
1052 u32 blocksize = mid->len; 1152 u32 blocksize = mid->len;
1053 1153
@@ -1059,9 +1159,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1059 if (wret) 1159 if (wret)
1060 ret = wret; 1160 ret = wret;
1061 wret = btrfs_free_extent(trans, root, bytenr, blocksize, 1161 wret = btrfs_free_extent(trans, root, bytenr, blocksize,
1062 parent->start, 1162 0, root->root_key.objectid,
1063 btrfs_header_owner(parent), 1163 level, 0);
1064 root_gen, level, 1);
1065 if (wret) 1164 if (wret)
1066 ret = wret; 1165 ret = wret;
1067 } else { 1166 } else {
@@ -1437,7 +1536,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1437{ 1536{
1438 int i; 1537 int i;
1439 1538
1440 if (path->keep_locks || path->lowest_level) 1539 if (path->keep_locks)
1441 return; 1540 return;
1442 1541
1443 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 1542 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -1552,7 +1651,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
1552 } 1651 }
1553 b = p->nodes[level]; 1652 b = p->nodes[level];
1554 } else if (ins_len < 0 && btrfs_header_nritems(b) < 1653 } else if (ins_len < 0 && btrfs_header_nritems(b) <
1555 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { 1654 BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
1556 int sret; 1655 int sret;
1557 1656
1558 sret = reada_for_balance(root, p, level); 1657 sret = reada_for_balance(root, p, level);
@@ -1614,10 +1713,17 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1614 lowest_unlock = 2; 1713 lowest_unlock = 2;
1615 1714
1616again: 1715again:
1617 if (p->skip_locking) 1716 if (p->search_commit_root) {
1618 b = btrfs_root_node(root); 1717 b = root->commit_root;
1619 else 1718 extent_buffer_get(b);
1620 b = btrfs_lock_root_node(root); 1719 if (!p->skip_locking)
1720 btrfs_tree_lock(b);
1721 } else {
1722 if (p->skip_locking)
1723 b = btrfs_root_node(root);
1724 else
1725 b = btrfs_lock_root_node(root);
1726 }
1621 1727
1622 while (b) { 1728 while (b) {
1623 level = btrfs_header_level(b); 1729 level = btrfs_header_level(b);
@@ -1638,11 +1744,9 @@ again:
1638 * then we don't want to set the path blocking, 1744 * then we don't want to set the path blocking,
1639 * so we test it here 1745 * so we test it here
1640 */ 1746 */
1641 if (btrfs_header_generation(b) == trans->transid && 1747 if (!should_cow_block(trans, root, b))
1642 btrfs_header_owner(b) == root->root_key.objectid &&
1643 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1644 goto cow_done; 1748 goto cow_done;
1645 } 1749
1646 btrfs_set_path_blocking(p); 1750 btrfs_set_path_blocking(p);
1647 1751
1648 wret = btrfs_cow_block(trans, root, b, 1752 wret = btrfs_cow_block(trans, root, b,
@@ -1764,138 +1868,6 @@ done:
1764 return ret; 1868 return ret;
1765} 1869}
1766 1870
1767int btrfs_merge_path(struct btrfs_trans_handle *trans,
1768 struct btrfs_root *root,
1769 struct btrfs_key *node_keys,
1770 u64 *nodes, int lowest_level)
1771{
1772 struct extent_buffer *eb;
1773 struct extent_buffer *parent;
1774 struct btrfs_key key;
1775 u64 bytenr;
1776 u64 generation;
1777 u32 blocksize;
1778 int level;
1779 int slot;
1780 int key_match;
1781 int ret;
1782
1783 eb = btrfs_lock_root_node(root);
1784 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
1785 BUG_ON(ret);
1786
1787 btrfs_set_lock_blocking(eb);
1788
1789 parent = eb;
1790 while (1) {
1791 level = btrfs_header_level(parent);
1792 if (level == 0 || level <= lowest_level)
1793 break;
1794
1795 ret = bin_search(parent, &node_keys[lowest_level], level,
1796 &slot);
1797 if (ret && slot > 0)
1798 slot--;
1799
1800 bytenr = btrfs_node_blockptr(parent, slot);
1801 if (nodes[level - 1] == bytenr)
1802 break;
1803
1804 blocksize = btrfs_level_size(root, level - 1);
1805 generation = btrfs_node_ptr_generation(parent, slot);
1806 btrfs_node_key_to_cpu(eb, &key, slot);
1807 key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
1808
1809 if (generation == trans->transid) {
1810 eb = read_tree_block(root, bytenr, blocksize,
1811 generation);
1812 btrfs_tree_lock(eb);
1813 btrfs_set_lock_blocking(eb);
1814 }
1815
1816 /*
1817 * if node keys match and node pointer hasn't been modified
1818 * in the running transaction, we can merge the path. for
1819 * blocks owened by reloc trees, the node pointer check is
1820 * skipped, this is because these blocks are fully controlled
1821 * by the space balance code, no one else can modify them.
1822 */
1823 if (!nodes[level - 1] || !key_match ||
1824 (generation == trans->transid &&
1825 btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
1826 if (level == 1 || level == lowest_level + 1) {
1827 if (generation == trans->transid) {
1828 btrfs_tree_unlock(eb);
1829 free_extent_buffer(eb);
1830 }
1831 break;
1832 }
1833
1834 if (generation != trans->transid) {
1835 eb = read_tree_block(root, bytenr, blocksize,
1836 generation);
1837 btrfs_tree_lock(eb);
1838 btrfs_set_lock_blocking(eb);
1839 }
1840
1841 ret = btrfs_cow_block(trans, root, eb, parent, slot,
1842 &eb);
1843 BUG_ON(ret);
1844
1845 if (root->root_key.objectid ==
1846 BTRFS_TREE_RELOC_OBJECTID) {
1847 if (!nodes[level - 1]) {
1848 nodes[level - 1] = eb->start;
1849 memcpy(&node_keys[level - 1], &key,
1850 sizeof(node_keys[0]));
1851 } else {
1852 WARN_ON(1);
1853 }
1854 }
1855
1856 btrfs_tree_unlock(parent);
1857 free_extent_buffer(parent);
1858 parent = eb;
1859 continue;
1860 }
1861
1862 btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
1863 btrfs_set_node_ptr_generation(parent, slot, trans->transid);
1864 btrfs_mark_buffer_dirty(parent);
1865
1866 ret = btrfs_inc_extent_ref(trans, root,
1867 nodes[level - 1],
1868 blocksize, parent->start,
1869 btrfs_header_owner(parent),
1870 btrfs_header_generation(parent),
1871 level - 1);
1872 BUG_ON(ret);
1873
1874 /*
1875 * If the block was created in the running transaction,
1876 * it's possible this is the last reference to it, so we
1877 * should drop the subtree.
1878 */
1879 if (generation == trans->transid) {
1880 ret = btrfs_drop_subtree(trans, root, eb, parent);
1881 BUG_ON(ret);
1882 btrfs_tree_unlock(eb);
1883 free_extent_buffer(eb);
1884 } else {
1885 ret = btrfs_free_extent(trans, root, bytenr,
1886 blocksize, parent->start,
1887 btrfs_header_owner(parent),
1888 btrfs_header_generation(parent),
1889 level - 1, 1);
1890 BUG_ON(ret);
1891 }
1892 break;
1893 }
1894 btrfs_tree_unlock(parent);
1895 free_extent_buffer(parent);
1896 return 0;
1897}
1898
1899/* 1871/*
1900 * adjust the pointers going up the tree, starting at level 1872 * adjust the pointers going up the tree, starting at level
1901 * making sure the right key of each node is points to 'key'. 1873 * making sure the right key of each node is points to 'key'.
@@ -2021,9 +1993,6 @@ static int push_node_left(struct btrfs_trans_handle *trans,
2021 btrfs_mark_buffer_dirty(src); 1993 btrfs_mark_buffer_dirty(src);
2022 btrfs_mark_buffer_dirty(dst); 1994 btrfs_mark_buffer_dirty(dst);
2023 1995
2024 ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
2025 BUG_ON(ret);
2026
2027 return ret; 1996 return ret;
2028} 1997}
2029 1998
@@ -2083,9 +2052,6 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
2083 btrfs_mark_buffer_dirty(src); 2052 btrfs_mark_buffer_dirty(src);
2084 btrfs_mark_buffer_dirty(dst); 2053 btrfs_mark_buffer_dirty(dst);
2085 2054
2086 ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
2087 BUG_ON(ret);
2088
2089 return ret; 2055 return ret;
2090} 2056}
2091 2057
@@ -2105,7 +2071,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2105 struct extent_buffer *c; 2071 struct extent_buffer *c;
2106 struct extent_buffer *old; 2072 struct extent_buffer *old;
2107 struct btrfs_disk_key lower_key; 2073 struct btrfs_disk_key lower_key;
2108 int ret;
2109 2074
2110 BUG_ON(path->nodes[level]); 2075 BUG_ON(path->nodes[level]);
2111 BUG_ON(path->nodes[level-1] != root->node); 2076 BUG_ON(path->nodes[level-1] != root->node);
@@ -2117,16 +2082,17 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2117 btrfs_node_key(lower, &lower_key, 0); 2082 btrfs_node_key(lower, &lower_key, 0);
2118 2083
2119 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 2084 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2120 root->root_key.objectid, trans->transid, 2085 root->root_key.objectid, &lower_key,
2121 level, root->node->start, 0); 2086 level, root->node->start, 0);
2122 if (IS_ERR(c)) 2087 if (IS_ERR(c))
2123 return PTR_ERR(c); 2088 return PTR_ERR(c);
2124 2089
2125 memset_extent_buffer(c, 0, 0, root->nodesize); 2090 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
2126 btrfs_set_header_nritems(c, 1); 2091 btrfs_set_header_nritems(c, 1);
2127 btrfs_set_header_level(c, level); 2092 btrfs_set_header_level(c, level);
2128 btrfs_set_header_bytenr(c, c->start); 2093 btrfs_set_header_bytenr(c, c->start);
2129 btrfs_set_header_generation(c, trans->transid); 2094 btrfs_set_header_generation(c, trans->transid);
2095 btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
2130 btrfs_set_header_owner(c, root->root_key.objectid); 2096 btrfs_set_header_owner(c, root->root_key.objectid);
2131 2097
2132 write_extent_buffer(c, root->fs_info->fsid, 2098 write_extent_buffer(c, root->fs_info->fsid,
@@ -2151,12 +2117,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2151 root->node = c; 2117 root->node = c;
2152 spin_unlock(&root->node_lock); 2118 spin_unlock(&root->node_lock);
2153 2119
2154 ret = btrfs_update_extent_ref(trans, root, lower->start,
2155 lower->len, lower->start, c->start,
2156 root->root_key.objectid,
2157 trans->transid, level - 1);
2158 BUG_ON(ret);
2159
2160 /* the super has an extra ref to root->node */ 2120 /* the super has an extra ref to root->node */
2161 free_extent_buffer(old); 2121 free_extent_buffer(old);
2162 2122
@@ -2233,7 +2193,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2233 ret = insert_new_root(trans, root, path, level + 1); 2193 ret = insert_new_root(trans, root, path, level + 1);
2234 if (ret) 2194 if (ret)
2235 return ret; 2195 return ret;
2236 } else if (!trans->transaction->delayed_refs.flushing) { 2196 } else {
2237 ret = push_nodes_for_insert(trans, root, path, level); 2197 ret = push_nodes_for_insert(trans, root, path, level);
2238 c = path->nodes[level]; 2198 c = path->nodes[level];
2239 if (!ret && btrfs_header_nritems(c) < 2199 if (!ret && btrfs_header_nritems(c) <
@@ -2244,20 +2204,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2244 } 2204 }
2245 2205
2246 c_nritems = btrfs_header_nritems(c); 2206 c_nritems = btrfs_header_nritems(c);
2207 mid = (c_nritems + 1) / 2;
2208 btrfs_node_key(c, &disk_key, mid);
2247 2209
2248 split = btrfs_alloc_free_block(trans, root, root->nodesize, 2210 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2249 path->nodes[level + 1]->start,
2250 root->root_key.objectid, 2211 root->root_key.objectid,
2251 trans->transid, level, c->start, 0); 2212 &disk_key, level, c->start, 0);
2252 if (IS_ERR(split)) 2213 if (IS_ERR(split))
2253 return PTR_ERR(split); 2214 return PTR_ERR(split);
2254 2215
2255 btrfs_set_header_flags(split, btrfs_header_flags(c)); 2216 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
2256 btrfs_set_header_level(split, btrfs_header_level(c)); 2217 btrfs_set_header_level(split, btrfs_header_level(c));
2257 btrfs_set_header_bytenr(split, split->start); 2218 btrfs_set_header_bytenr(split, split->start);
2258 btrfs_set_header_generation(split, trans->transid); 2219 btrfs_set_header_generation(split, trans->transid);
2220 btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
2259 btrfs_set_header_owner(split, root->root_key.objectid); 2221 btrfs_set_header_owner(split, root->root_key.objectid);
2260 btrfs_set_header_flags(split, 0);
2261 write_extent_buffer(split, root->fs_info->fsid, 2222 write_extent_buffer(split, root->fs_info->fsid,
2262 (unsigned long)btrfs_header_fsid(split), 2223 (unsigned long)btrfs_header_fsid(split),
2263 BTRFS_FSID_SIZE); 2224 BTRFS_FSID_SIZE);
@@ -2265,7 +2226,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2265 (unsigned long)btrfs_header_chunk_tree_uuid(split), 2226 (unsigned long)btrfs_header_chunk_tree_uuid(split),
2266 BTRFS_UUID_SIZE); 2227 BTRFS_UUID_SIZE);
2267 2228
2268 mid = (c_nritems + 1) / 2;
2269 2229
2270 copy_extent_buffer(split, c, 2230 copy_extent_buffer(split, c,
2271 btrfs_node_key_ptr_offset(0), 2231 btrfs_node_key_ptr_offset(0),
@@ -2278,16 +2238,12 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2278 btrfs_mark_buffer_dirty(c); 2238 btrfs_mark_buffer_dirty(c);
2279 btrfs_mark_buffer_dirty(split); 2239 btrfs_mark_buffer_dirty(split);
2280 2240
2281 btrfs_node_key(split, &disk_key, 0);
2282 wret = insert_ptr(trans, root, path, &disk_key, split->start, 2241 wret = insert_ptr(trans, root, path, &disk_key, split->start,
2283 path->slots[level + 1] + 1, 2242 path->slots[level + 1] + 1,
2284 level + 1); 2243 level + 1);
2285 if (wret) 2244 if (wret)
2286 ret = wret; 2245 ret = wret;
2287 2246
2288 ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
2289 BUG_ON(ret);
2290
2291 if (path->slots[level] >= mid) { 2247 if (path->slots[level] >= mid) {
2292 path->slots[level] -= mid; 2248 path->slots[level] -= mid;
2293 btrfs_tree_unlock(c); 2249 btrfs_tree_unlock(c);
@@ -2360,7 +2316,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2360 u32 right_nritems; 2316 u32 right_nritems;
2361 u32 data_end; 2317 u32 data_end;
2362 u32 this_item_size; 2318 u32 this_item_size;
2363 int ret;
2364 2319
2365 if (empty) 2320 if (empty)
2366 nr = 0; 2321 nr = 0;
@@ -2473,9 +2428,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2473 btrfs_mark_buffer_dirty(left); 2428 btrfs_mark_buffer_dirty(left);
2474 btrfs_mark_buffer_dirty(right); 2429 btrfs_mark_buffer_dirty(right);
2475 2430
2476 ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
2477 BUG_ON(ret);
2478
2479 btrfs_item_key(right, &disk_key, 0); 2431 btrfs_item_key(right, &disk_key, 0);
2480 btrfs_set_node_key(upper, &disk_key, slot + 1); 2432 btrfs_set_node_key(upper, &disk_key, slot + 1);
2481 btrfs_mark_buffer_dirty(upper); 2433 btrfs_mark_buffer_dirty(upper);
@@ -2720,10 +2672,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2720 if (right_nritems) 2672 if (right_nritems)
2721 btrfs_mark_buffer_dirty(right); 2673 btrfs_mark_buffer_dirty(right);
2722 2674
2723 ret = btrfs_update_ref(trans, root, right, left,
2724 old_left_nritems, push_items);
2725 BUG_ON(ret);
2726
2727 btrfs_item_key(right, &disk_key, 0); 2675 btrfs_item_key(right, &disk_key, 0);
2728 wret = fixup_low_keys(trans, root, path, &disk_key, 1); 2676 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
2729 if (wret) 2677 if (wret)
@@ -2880,9 +2828,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2880 btrfs_mark_buffer_dirty(l); 2828 btrfs_mark_buffer_dirty(l);
2881 BUG_ON(path->slots[0] != slot); 2829 BUG_ON(path->slots[0] != slot);
2882 2830
2883 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2884 BUG_ON(ret);
2885
2886 if (mid <= slot) { 2831 if (mid <= slot) {
2887 btrfs_tree_unlock(path->nodes[0]); 2832 btrfs_tree_unlock(path->nodes[0]);
2888 free_extent_buffer(path->nodes[0]); 2833 free_extent_buffer(path->nodes[0]);
@@ -2911,6 +2856,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2911 struct btrfs_path *path, int data_size, 2856 struct btrfs_path *path, int data_size,
2912 int extend) 2857 int extend)
2913{ 2858{
2859 struct btrfs_disk_key disk_key;
2914 struct extent_buffer *l; 2860 struct extent_buffer *l;
2915 u32 nritems; 2861 u32 nritems;
2916 int mid; 2862 int mid;
@@ -2918,12 +2864,11 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2918 struct extent_buffer *right; 2864 struct extent_buffer *right;
2919 int ret = 0; 2865 int ret = 0;
2920 int wret; 2866 int wret;
2921 int double_split; 2867 int split;
2922 int num_doubles = 0; 2868 int num_doubles = 0;
2923 2869
2924 /* first try to make some room by pushing left and right */ 2870 /* first try to make some room by pushing left and right */
2925 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY && 2871 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
2926 !trans->transaction->delayed_refs.flushing) {
2927 wret = push_leaf_right(trans, root, path, data_size, 0); 2872 wret = push_leaf_right(trans, root, path, data_size, 0);
2928 if (wret < 0) 2873 if (wret < 0)
2929 return wret; 2874 return wret;
@@ -2945,16 +2890,53 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2945 return ret; 2890 return ret;
2946 } 2891 }
2947again: 2892again:
2948 double_split = 0; 2893 split = 1;
2949 l = path->nodes[0]; 2894 l = path->nodes[0];
2950 slot = path->slots[0]; 2895 slot = path->slots[0];
2951 nritems = btrfs_header_nritems(l); 2896 nritems = btrfs_header_nritems(l);
2952 mid = (nritems + 1) / 2; 2897 mid = (nritems + 1) / 2;
2953 2898
2954 right = btrfs_alloc_free_block(trans, root, root->leafsize, 2899 if (mid <= slot) {
2955 path->nodes[1]->start, 2900 if (nritems == 1 ||
2901 leaf_space_used(l, mid, nritems - mid) + data_size >
2902 BTRFS_LEAF_DATA_SIZE(root)) {
2903 if (slot >= nritems) {
2904 split = 0;
2905 } else {
2906 mid = slot;
2907 if (mid != nritems &&
2908 leaf_space_used(l, mid, nritems - mid) +
2909 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2910 split = 2;
2911 }
2912 }
2913 }
2914 } else {
2915 if (leaf_space_used(l, 0, mid) + data_size >
2916 BTRFS_LEAF_DATA_SIZE(root)) {
2917 if (!extend && data_size && slot == 0) {
2918 split = 0;
2919 } else if ((extend || !data_size) && slot == 0) {
2920 mid = 1;
2921 } else {
2922 mid = slot;
2923 if (mid != nritems &&
2924 leaf_space_used(l, mid, nritems - mid) +
2925 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2926 split = 2 ;
2927 }
2928 }
2929 }
2930 }
2931
2932 if (split == 0)
2933 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2934 else
2935 btrfs_item_key(l, &disk_key, mid);
2936
2937 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
2956 root->root_key.objectid, 2938 root->root_key.objectid,
2957 trans->transid, 0, l->start, 0); 2939 &disk_key, 0, l->start, 0);
2958 if (IS_ERR(right)) { 2940 if (IS_ERR(right)) {
2959 BUG_ON(1); 2941 BUG_ON(1);
2960 return PTR_ERR(right); 2942 return PTR_ERR(right);
@@ -2963,6 +2945,7 @@ again:
2963 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); 2945 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2964 btrfs_set_header_bytenr(right, right->start); 2946 btrfs_set_header_bytenr(right, right->start);
2965 btrfs_set_header_generation(right, trans->transid); 2947 btrfs_set_header_generation(right, trans->transid);
2948 btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
2966 btrfs_set_header_owner(right, root->root_key.objectid); 2949 btrfs_set_header_owner(right, root->root_key.objectid);
2967 btrfs_set_header_level(right, 0); 2950 btrfs_set_header_level(right, 0);
2968 write_extent_buffer(right, root->fs_info->fsid, 2951 write_extent_buffer(right, root->fs_info->fsid,
@@ -2973,79 +2956,47 @@ again:
2973 (unsigned long)btrfs_header_chunk_tree_uuid(right), 2956 (unsigned long)btrfs_header_chunk_tree_uuid(right),
2974 BTRFS_UUID_SIZE); 2957 BTRFS_UUID_SIZE);
2975 2958
2976 if (mid <= slot) { 2959 if (split == 0) {
2977 if (nritems == 1 || 2960 if (mid <= slot) {
2978 leaf_space_used(l, mid, nritems - mid) + data_size > 2961 btrfs_set_header_nritems(right, 0);
2979 BTRFS_LEAF_DATA_SIZE(root)) { 2962 wret = insert_ptr(trans, root, path,
2980 if (slot >= nritems) { 2963 &disk_key, right->start,
2981 struct btrfs_disk_key disk_key; 2964 path->slots[1] + 1, 1);
2982 2965 if (wret)
2983 btrfs_cpu_key_to_disk(&disk_key, ins_key); 2966 ret = wret;
2984 btrfs_set_header_nritems(right, 0);
2985 wret = insert_ptr(trans, root, path,
2986 &disk_key, right->start,
2987 path->slots[1] + 1, 1);
2988 if (wret)
2989 ret = wret;
2990 2967
2991 btrfs_tree_unlock(path->nodes[0]); 2968 btrfs_tree_unlock(path->nodes[0]);
2992 free_extent_buffer(path->nodes[0]); 2969 free_extent_buffer(path->nodes[0]);
2993 path->nodes[0] = right; 2970 path->nodes[0] = right;
2994 path->slots[0] = 0; 2971 path->slots[0] = 0;
2995 path->slots[1] += 1; 2972 path->slots[1] += 1;
2996 btrfs_mark_buffer_dirty(right); 2973 } else {
2997 return ret; 2974 btrfs_set_header_nritems(right, 0);
2998 } 2975 wret = insert_ptr(trans, root, path,
2999 mid = slot; 2976 &disk_key,
3000 if (mid != nritems && 2977 right->start,
3001 leaf_space_used(l, mid, nritems - mid) + 2978 path->slots[1], 1);
3002 data_size > BTRFS_LEAF_DATA_SIZE(root)) { 2979 if (wret)
3003 double_split = 1; 2980 ret = wret;
3004 } 2981 btrfs_tree_unlock(path->nodes[0]);
3005 } 2982 free_extent_buffer(path->nodes[0]);
3006 } else { 2983 path->nodes[0] = right;
3007 if (leaf_space_used(l, 0, mid) + data_size > 2984 path->slots[0] = 0;
3008 BTRFS_LEAF_DATA_SIZE(root)) { 2985 if (path->slots[1] == 0) {
3009 if (!extend && data_size && slot == 0) { 2986 wret = fixup_low_keys(trans, root,
3010 struct btrfs_disk_key disk_key; 2987 path, &disk_key, 1);
3011
3012 btrfs_cpu_key_to_disk(&disk_key, ins_key);
3013 btrfs_set_header_nritems(right, 0);
3014 wret = insert_ptr(trans, root, path,
3015 &disk_key,
3016 right->start,
3017 path->slots[1], 1);
3018 if (wret) 2988 if (wret)
3019 ret = wret; 2989 ret = wret;
3020 btrfs_tree_unlock(path->nodes[0]);
3021 free_extent_buffer(path->nodes[0]);
3022 path->nodes[0] = right;
3023 path->slots[0] = 0;
3024 if (path->slots[1] == 0) {
3025 wret = fixup_low_keys(trans, root,
3026 path, &disk_key, 1);
3027 if (wret)
3028 ret = wret;
3029 }
3030 btrfs_mark_buffer_dirty(right);
3031 return ret;
3032 } else if ((extend || !data_size) && slot == 0) {
3033 mid = 1;
3034 } else {
3035 mid = slot;
3036 if (mid != nritems &&
3037 leaf_space_used(l, mid, nritems - mid) +
3038 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3039 double_split = 1;
3040 }
3041 } 2990 }
3042 } 2991 }
2992 btrfs_mark_buffer_dirty(right);
2993 return ret;
3043 } 2994 }
3044 2995
3045 ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); 2996 ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
3046 BUG_ON(ret); 2997 BUG_ON(ret);
3047 2998
3048 if (double_split) { 2999 if (split == 2) {
3049 BUG_ON(num_doubles != 0); 3000 BUG_ON(num_doubles != 0);
3050 num_doubles++; 3001 num_doubles++;
3051 goto again; 3002 goto again;
@@ -3447,7 +3398,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3447 /* figure out how many keys we can insert in here */ 3398 /* figure out how many keys we can insert in here */
3448 total_data = data_size[0]; 3399 total_data = data_size[0];
3449 for (i = 1; i < nr; i++) { 3400 for (i = 1; i < nr; i++) {
3450 if (comp_cpu_keys(&found_key, cpu_key + i) <= 0) 3401 if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
3451 break; 3402 break;
3452 total_data += data_size[i]; 3403 total_data += data_size[i];
3453 } 3404 }
@@ -3745,9 +3696,7 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3745 3696
3746/* 3697/*
3747 * a helper function to delete the leaf pointed to by path->slots[1] and 3698 * a helper function to delete the leaf pointed to by path->slots[1] and
3748 * path->nodes[1]. bytenr is the node block pointer, but since the callers 3699 * path->nodes[1].
3749 * already know it, it is faster to have them pass it down than to
3750 * read it out of the node again.
3751 * 3700 *
3752 * This deletes the pointer in path->nodes[1] and frees the leaf 3701 * This deletes the pointer in path->nodes[1] and frees the leaf
3753 * block extent. zero is returned if it all worked out, < 0 otherwise. 3702 * block extent. zero is returned if it all worked out, < 0 otherwise.
@@ -3755,15 +3704,14 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3755 * The path must have already been setup for deleting the leaf, including 3704 * The path must have already been setup for deleting the leaf, including
3756 * all the proper balancing. path->nodes[1] must be locked. 3705 * all the proper balancing. path->nodes[1] must be locked.
3757 */ 3706 */
3758noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, 3707static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3759 struct btrfs_root *root, 3708 struct btrfs_root *root,
3760 struct btrfs_path *path, u64 bytenr) 3709 struct btrfs_path *path,
3710 struct extent_buffer *leaf)
3761{ 3711{
3762 int ret; 3712 int ret;
3763 u64 root_gen = btrfs_header_generation(path->nodes[1]);
3764 u64 parent_start = path->nodes[1]->start;
3765 u64 parent_owner = btrfs_header_owner(path->nodes[1]);
3766 3713
3714 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
3767 ret = del_ptr(trans, root, path, 1, path->slots[1]); 3715 ret = del_ptr(trans, root, path, 1, path->slots[1]);
3768 if (ret) 3716 if (ret)
3769 return ret; 3717 return ret;
@@ -3774,10 +3722,8 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3774 */ 3722 */
3775 btrfs_unlock_up_safe(path, 0); 3723 btrfs_unlock_up_safe(path, 0);
3776 3724
3777 ret = btrfs_free_extent(trans, root, bytenr, 3725 ret = btrfs_free_extent(trans, root, leaf->start, leaf->len,
3778 btrfs_level_size(root, 0), 3726 0, root->root_key.objectid, 0, 0);
3779 parent_start, parent_owner,
3780 root_gen, 0, 1);
3781 return ret; 3727 return ret;
3782} 3728}
3783/* 3729/*
@@ -3845,7 +3791,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3845 if (leaf == root->node) { 3791 if (leaf == root->node) {
3846 btrfs_set_header_level(leaf, 0); 3792 btrfs_set_header_level(leaf, 0);
3847 } else { 3793 } else {
3848 ret = btrfs_del_leaf(trans, root, path, leaf->start); 3794 ret = btrfs_del_leaf(trans, root, path, leaf);
3849 BUG_ON(ret); 3795 BUG_ON(ret);
3850 } 3796 }
3851 } else { 3797 } else {
@@ -3861,8 +3807,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3861 } 3807 }
3862 3808
3863 /* delete the leaf if it is mostly empty */ 3809 /* delete the leaf if it is mostly empty */
3864 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 && 3810 if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) {
3865 !trans->transaction->delayed_refs.flushing) {
3866 /* push_leaf_left fixes the path. 3811 /* push_leaf_left fixes the path.
3867 * make sure the path still points to our leaf 3812 * make sure the path still points to our leaf
3868 * for possible call to del_ptr below 3813 * for possible call to del_ptr below
@@ -3884,8 +3829,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3884 3829
3885 if (btrfs_header_nritems(leaf) == 0) { 3830 if (btrfs_header_nritems(leaf) == 0) {
3886 path->slots[1] = slot; 3831 path->slots[1] = slot;
3887 ret = btrfs_del_leaf(trans, root, path, 3832 ret = btrfs_del_leaf(trans, root, path, leaf);
3888 leaf->start);
3889 BUG_ON(ret); 3833 BUG_ON(ret);
3890 free_extent_buffer(leaf); 3834 free_extent_buffer(leaf);
3891 } else { 3835 } else {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4414a5d9983a..03441a99ea38 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,8 @@ struct btrfs_ordered_sum;
45 45
46#define BTRFS_MAX_LEVEL 8 46#define BTRFS_MAX_LEVEL 8
47 47
48#define BTRFS_COMPAT_EXTENT_TREE_V0
49
48/* 50/*
49 * files bigger than this get some pre-flushing when they are added 51 * files bigger than this get some pre-flushing when they are added
50 * to the ordered operations list. That way we limit the total 52 * to the ordered operations list. That way we limit the total
@@ -267,7 +269,18 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
267} 269}
268 270
269#define BTRFS_FSID_SIZE 16 271#define BTRFS_FSID_SIZE 16
270#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0) 272#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
273#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
274#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
275#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33)
276
277#define BTRFS_BACKREF_REV_MAX 256
278#define BTRFS_BACKREF_REV_SHIFT 56
279#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \
280 BTRFS_BACKREF_REV_SHIFT)
281
282#define BTRFS_OLD_BACKREF_REV 0
283#define BTRFS_MIXED_BACKREF_REV 1
271 284
272/* 285/*
273 * every tree block (leaf or node) starts with this header. 286 * every tree block (leaf or node) starts with this header.
@@ -296,7 +309,6 @@ struct btrfs_header {
296 sizeof(struct btrfs_item) - \ 309 sizeof(struct btrfs_item) - \
297 sizeof(struct btrfs_file_extent_item)) 310 sizeof(struct btrfs_file_extent_item))
298 311
299#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
300 312
301/* 313/*
302 * this is a very generous portion of the super block, giving us 314 * this is a very generous portion of the super block, giving us
@@ -355,9 +367,12 @@ struct btrfs_super_block {
355 * Compat flags that we support. If any incompat flags are set other than the 367 * Compat flags that we support. If any incompat flags are set other than the
356 * ones specified below then we will fail to mount 368 * ones specified below then we will fail to mount
357 */ 369 */
358#define BTRFS_FEATURE_COMPAT_SUPP 0x0 370#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
359#define BTRFS_FEATURE_COMPAT_RO_SUPP 0x0 371
360#define BTRFS_FEATURE_INCOMPAT_SUPP 0x0 372#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
373#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
374#define BTRFS_FEATURE_INCOMPAT_SUPP \
375 BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF
361 376
362/* 377/*
363 * A leaf is full of items. offset and size tell us where to find 378 * A leaf is full of items. offset and size tell us where to find
@@ -421,23 +436,65 @@ struct btrfs_path {
421 unsigned int keep_locks:1; 436 unsigned int keep_locks:1;
422 unsigned int skip_locking:1; 437 unsigned int skip_locking:1;
423 unsigned int leave_spinning:1; 438 unsigned int leave_spinning:1;
439 unsigned int search_commit_root:1;
424}; 440};
425 441
426/* 442/*
427 * items in the extent btree are used to record the objectid of the 443 * items in the extent btree are used to record the objectid of the
428 * owner of the block and the number of references 444 * owner of the block and the number of references
429 */ 445 */
446
430struct btrfs_extent_item { 447struct btrfs_extent_item {
448 __le64 refs;
449 __le64 generation;
450 __le64 flags;
451} __attribute__ ((__packed__));
452
453struct btrfs_extent_item_v0 {
431 __le32 refs; 454 __le32 refs;
432} __attribute__ ((__packed__)); 455} __attribute__ ((__packed__));
433 456
434struct btrfs_extent_ref { 457#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
458 sizeof(struct btrfs_item))
459
460#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0)
461#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1)
462
463/* following flags only apply to tree blocks */
464
465/* use full backrefs for extent pointers in the block */
466#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8)
467
468struct btrfs_tree_block_info {
469 struct btrfs_disk_key key;
470 u8 level;
471} __attribute__ ((__packed__));
472
473struct btrfs_extent_data_ref {
474 __le64 root;
475 __le64 objectid;
476 __le64 offset;
477 __le32 count;
478} __attribute__ ((__packed__));
479
480struct btrfs_shared_data_ref {
481 __le32 count;
482} __attribute__ ((__packed__));
483
484struct btrfs_extent_inline_ref {
485 u8 type;
486 u64 offset;
487} __attribute__ ((__packed__));
488
489/* old style backrefs item */
490struct btrfs_extent_ref_v0 {
435 __le64 root; 491 __le64 root;
436 __le64 generation; 492 __le64 generation;
437 __le64 objectid; 493 __le64 objectid;
438 __le32 num_refs; 494 __le32 count;
439} __attribute__ ((__packed__)); 495} __attribute__ ((__packed__));
440 496
497
441/* dev extents record free space on individual devices. The owner 498/* dev extents record free space on individual devices. The owner
442 * field points back to the chunk allocation mapping tree that allocated 499 * field points back to the chunk allocation mapping tree that allocated
443 * the extent. The chunk tree uuid field is a way to double check the owner 500 * the extent. The chunk tree uuid field is a way to double check the owner
@@ -695,12 +752,7 @@ struct btrfs_block_group_cache {
695 struct list_head cluster_list; 752 struct list_head cluster_list;
696}; 753};
697 754
698struct btrfs_leaf_ref_tree { 755struct reloc_control;
699 struct rb_root root;
700 struct list_head list;
701 spinlock_t lock;
702};
703
704struct btrfs_device; 756struct btrfs_device;
705struct btrfs_fs_devices; 757struct btrfs_fs_devices;
706struct btrfs_fs_info { 758struct btrfs_fs_info {
@@ -831,18 +883,11 @@ struct btrfs_fs_info {
831 struct task_struct *cleaner_kthread; 883 struct task_struct *cleaner_kthread;
832 int thread_pool_size; 884 int thread_pool_size;
833 885
834 /* tree relocation relocated fields */
835 struct list_head dead_reloc_roots;
836 struct btrfs_leaf_ref_tree reloc_ref_tree;
837 struct btrfs_leaf_ref_tree shared_ref_tree;
838
839 struct kobject super_kobj; 886 struct kobject super_kobj;
840 struct completion kobj_unregister; 887 struct completion kobj_unregister;
841 int do_barriers; 888 int do_barriers;
842 int closing; 889 int closing;
843 int log_root_recovering; 890 int log_root_recovering;
844 atomic_t throttles;
845 atomic_t throttle_gen;
846 891
847 u64 total_pinned; 892 u64 total_pinned;
848 893
@@ -861,6 +906,8 @@ struct btrfs_fs_info {
861 */ 906 */
862 struct list_head space_info; 907 struct list_head space_info;
863 908
909 struct reloc_control *reloc_ctl;
910
864 spinlock_t delalloc_lock; 911 spinlock_t delalloc_lock;
865 spinlock_t new_trans_lock; 912 spinlock_t new_trans_lock;
866 u64 delalloc_bytes; 913 u64 delalloc_bytes;
@@ -891,7 +938,6 @@ struct btrfs_fs_info {
891 * in ram representation of the tree. extent_root is used for all allocations 938 * in ram representation of the tree. extent_root is used for all allocations
892 * and for the extent tree extent_root root. 939 * and for the extent tree extent_root root.
893 */ 940 */
894struct btrfs_dirty_root;
895struct btrfs_root { 941struct btrfs_root {
896 struct extent_buffer *node; 942 struct extent_buffer *node;
897 943
@@ -899,9 +945,6 @@ struct btrfs_root {
899 spinlock_t node_lock; 945 spinlock_t node_lock;
900 946
901 struct extent_buffer *commit_root; 947 struct extent_buffer *commit_root;
902 struct btrfs_leaf_ref_tree *ref_tree;
903 struct btrfs_leaf_ref_tree ref_tree_struct;
904 struct btrfs_dirty_root *dirty_root;
905 struct btrfs_root *log_root; 948 struct btrfs_root *log_root;
906 struct btrfs_root *reloc_root; 949 struct btrfs_root *reloc_root;
907 950
@@ -952,10 +995,15 @@ struct btrfs_root {
952 /* the dirty list is only used by non-reference counted roots */ 995 /* the dirty list is only used by non-reference counted roots */
953 struct list_head dirty_list; 996 struct list_head dirty_list;
954 997
998 struct list_head root_list;
999
955 spinlock_t list_lock; 1000 spinlock_t list_lock;
956 struct list_head dead_list;
957 struct list_head orphan_list; 1001 struct list_head orphan_list;
958 1002
1003 spinlock_t inode_lock;
1004 /* red-black tree that keeps track of in-memory inodes */
1005 struct rb_root inode_tree;
1006
959 /* 1007 /*
960 * right now this just gets used so that a root has its own devid 1008 * right now this just gets used so that a root has its own devid
961 * for stat. It may be used for more later 1009 * for stat. It may be used for more later
@@ -1017,7 +1065,16 @@ struct btrfs_root {
1017 * are used, and how many references there are to each block 1065 * are used, and how many references there are to each block
1018 */ 1066 */
1019#define BTRFS_EXTENT_ITEM_KEY 168 1067#define BTRFS_EXTENT_ITEM_KEY 168
1020#define BTRFS_EXTENT_REF_KEY 180 1068
1069#define BTRFS_TREE_BLOCK_REF_KEY 176
1070
1071#define BTRFS_EXTENT_DATA_REF_KEY 178
1072
1073#define BTRFS_EXTENT_REF_V0_KEY 180
1074
1075#define BTRFS_SHARED_BLOCK_REF_KEY 182
1076
1077#define BTRFS_SHARED_DATA_REF_KEY 184
1021 1078
1022/* 1079/*
1023 * block groups give us hints into the extent allocation trees. Which 1080 * block groups give us hints into the extent allocation trees. Which
@@ -1043,6 +1100,8 @@ struct btrfs_root {
1043#define BTRFS_MOUNT_COMPRESS (1 << 5) 1100#define BTRFS_MOUNT_COMPRESS (1 << 5)
1044#define BTRFS_MOUNT_NOTREELOG (1 << 6) 1101#define BTRFS_MOUNT_NOTREELOG (1 << 6)
1045#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) 1102#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
1103#define BTRFS_MOUNT_SSD_SPREAD (1 << 8)
1104#define BTRFS_MOUNT_NOSSD (1 << 9)
1046 1105
1047#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1106#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1048#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1107#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1056,12 +1115,14 @@ struct btrfs_root {
1056#define BTRFS_INODE_READONLY (1 << 2) 1115#define BTRFS_INODE_READONLY (1 << 2)
1057#define BTRFS_INODE_NOCOMPRESS (1 << 3) 1116#define BTRFS_INODE_NOCOMPRESS (1 << 3)
1058#define BTRFS_INODE_PREALLOC (1 << 4) 1117#define BTRFS_INODE_PREALLOC (1 << 4)
1059#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \ 1118#define BTRFS_INODE_SYNC (1 << 5)
1060 ~BTRFS_INODE_##flag) 1119#define BTRFS_INODE_IMMUTABLE (1 << 6)
1061#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \ 1120#define BTRFS_INODE_APPEND (1 << 7)
1062 BTRFS_INODE_##flag) 1121#define BTRFS_INODE_NODUMP (1 << 8)
1063#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \ 1122#define BTRFS_INODE_NOATIME (1 << 9)
1064 BTRFS_INODE_##flag) 1123#define BTRFS_INODE_DIRSYNC (1 << 10)
1124
1125
1065/* some macros to generate set/get funcs for the struct fields. This 1126/* some macros to generate set/get funcs for the struct fields. This
1066 * assumes there is a lefoo_to_cpu for every type, so lets make a simple 1127 * assumes there is a lefoo_to_cpu for every type, so lets make a simple
1067 * one for u8: 1128 * one for u8:
@@ -1317,24 +1378,67 @@ static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
1317 return (u8 *)((unsigned long)dev + ptr); 1378 return (u8 *)((unsigned long)dev + ptr);
1318} 1379}
1319 1380
1320/* struct btrfs_extent_ref */ 1381BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);
1321BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64); 1382BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item,
1322BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64); 1383 generation, 64);
1323BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64); 1384BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64);
1324BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32);
1325 1385
1326BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64); 1386BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32);
1327BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref, 1387
1328 generation, 64); 1388
1329BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref, 1389BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8);
1330 objectid, 64); 1390
1331BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref, 1391static inline void btrfs_tree_block_key(struct extent_buffer *eb,
1332 num_refs, 32); 1392 struct btrfs_tree_block_info *item,
1393 struct btrfs_disk_key *key)
1394{
1395 read_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
1396}
1397
1398static inline void btrfs_set_tree_block_key(struct extent_buffer *eb,
1399 struct btrfs_tree_block_info *item,
1400 struct btrfs_disk_key *key)
1401{
1402 write_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
1403}
1404
1405BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref,
1406 root, 64);
1407BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref,
1408 objectid, 64);
1409BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref,
1410 offset, 64);
1411BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref,
1412 count, 32);
1413
1414BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref,
1415 count, 32);
1333 1416
1334/* struct btrfs_extent_item */ 1417BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
1335BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32); 1418 type, 8);
1336BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item, 1419BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
1337 refs, 32); 1420 offset, 64);
1421
1422static inline u32 btrfs_extent_inline_ref_size(int type)
1423{
1424 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1425 type == BTRFS_SHARED_BLOCK_REF_KEY)
1426 return sizeof(struct btrfs_extent_inline_ref);
1427 if (type == BTRFS_SHARED_DATA_REF_KEY)
1428 return sizeof(struct btrfs_shared_data_ref) +
1429 sizeof(struct btrfs_extent_inline_ref);
1430 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1431 return sizeof(struct btrfs_extent_data_ref) +
1432 offsetof(struct btrfs_extent_inline_ref, offset);
1433 BUG();
1434 return 0;
1435}
1436
1437BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64);
1438BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0,
1439 generation, 64);
1440BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64);
1441BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32);
1338 1442
1339/* struct btrfs_node */ 1443/* struct btrfs_node */
1340BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); 1444BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
@@ -1558,6 +1662,21 @@ static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
1558 return (flags & flag) == flag; 1662 return (flags & flag) == flag;
1559} 1663}
1560 1664
1665static inline int btrfs_header_backref_rev(struct extent_buffer *eb)
1666{
1667 u64 flags = btrfs_header_flags(eb);
1668 return flags >> BTRFS_BACKREF_REV_SHIFT;
1669}
1670
1671static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
1672 int rev)
1673{
1674 u64 flags = btrfs_header_flags(eb);
1675 flags &= ~BTRFS_BACKREF_REV_MASK;
1676 flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT;
1677 btrfs_set_header_flags(eb, flags);
1678}
1679
1561static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) 1680static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
1562{ 1681{
1563 unsigned long ptr = offsetof(struct btrfs_header, fsid); 1682 unsigned long ptr = offsetof(struct btrfs_header, fsid);
@@ -1790,39 +1909,32 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
1790int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 1909int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
1791 struct btrfs_root *root, struct extent_buffer *leaf); 1910 struct btrfs_root *root, struct extent_buffer *leaf);
1792int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 1911int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1793 struct btrfs_root *root, u64 objectid, u64 bytenr); 1912 struct btrfs_root *root,
1913 u64 objectid, u64 offset, u64 bytenr);
1794int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); 1914int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
1795struct btrfs_block_group_cache *btrfs_lookup_block_group( 1915struct btrfs_block_group_cache *btrfs_lookup_block_group(
1796 struct btrfs_fs_info *info, 1916 struct btrfs_fs_info *info,
1797 u64 bytenr); 1917 u64 bytenr);
1918void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1798u64 btrfs_find_block_group(struct btrfs_root *root, 1919u64 btrfs_find_block_group(struct btrfs_root *root,
1799 u64 search_start, u64 search_hint, int owner); 1920 u64 search_start, u64 search_hint, int owner);
1800struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 1921struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1801 struct btrfs_root *root, 1922 struct btrfs_root *root, u32 blocksize,
1802 u32 blocksize, u64 parent, 1923 u64 parent, u64 root_objectid,
1803 u64 root_objectid, 1924 struct btrfs_disk_key *key, int level,
1804 u64 ref_generation, 1925 u64 hint, u64 empty_size);
1805 int level,
1806 u64 hint,
1807 u64 empty_size);
1808struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 1926struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1809 struct btrfs_root *root, 1927 struct btrfs_root *root,
1810 u64 bytenr, u32 blocksize, 1928 u64 bytenr, u32 blocksize,
1811 int level); 1929 int level);
1812int btrfs_alloc_extent(struct btrfs_trans_handle *trans, 1930int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
1813 struct btrfs_root *root, 1931 struct btrfs_root *root,
1814 u64 num_bytes, u64 parent, u64 min_bytes, 1932 u64 root_objectid, u64 owner,
1815 u64 root_objectid, u64 ref_generation, 1933 u64 offset, struct btrfs_key *ins);
1816 u64 owner, u64 empty_size, u64 hint_byte, 1934int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
1817 u64 search_end, struct btrfs_key *ins, u64 data); 1935 struct btrfs_root *root,
1818int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, 1936 u64 root_objectid, u64 owner, u64 offset,
1819 struct btrfs_root *root, u64 parent, 1937 struct btrfs_key *ins);
1820 u64 root_objectid, u64 ref_generation,
1821 u64 owner, struct btrfs_key *ins);
1822int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
1823 struct btrfs_root *root, u64 parent,
1824 u64 root_objectid, u64 ref_generation,
1825 u64 owner, struct btrfs_key *ins);
1826int btrfs_reserve_extent(struct btrfs_trans_handle *trans, 1938int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
1827 struct btrfs_root *root, 1939 struct btrfs_root *root,
1828 u64 num_bytes, u64 min_alloc_size, 1940 u64 num_bytes, u64 min_alloc_size,
@@ -1830,18 +1942,18 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
1830 u64 search_end, struct btrfs_key *ins, 1942 u64 search_end, struct btrfs_key *ins,
1831 u64 data); 1943 u64 data);
1832int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1944int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1833 struct extent_buffer *orig_buf, struct extent_buffer *buf, 1945 struct extent_buffer *buf, int full_backref);
1834 u32 *nr_extents); 1946int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1835int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1947 struct extent_buffer *buf, int full_backref);
1836 struct extent_buffer *buf, u32 nr_extents); 1948int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
1837int btrfs_update_ref(struct btrfs_trans_handle *trans, 1949 struct btrfs_root *root,
1838 struct btrfs_root *root, struct extent_buffer *orig_buf, 1950 u64 bytenr, u64 num_bytes, u64 flags,
1839 struct extent_buffer *buf, int start_slot, int nr); 1951 int is_data);
1840int btrfs_free_extent(struct btrfs_trans_handle *trans, 1952int btrfs_free_extent(struct btrfs_trans_handle *trans,
1841 struct btrfs_root *root, 1953 struct btrfs_root *root,
1842 u64 bytenr, u64 num_bytes, u64 parent, 1954 u64 bytenr, u64 num_bytes, u64 parent,
1843 u64 root_objectid, u64 ref_generation, 1955 u64 root_objectid, u64 owner, u64 offset);
1844 u64 owner_objectid, int pin); 1956
1845int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 1957int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
1846int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 1958int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1847 struct btrfs_root *root, 1959 struct btrfs_root *root,
@@ -1849,13 +1961,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1849int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1961int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1850 struct btrfs_root *root, 1962 struct btrfs_root *root,
1851 u64 bytenr, u64 num_bytes, u64 parent, 1963 u64 bytenr, u64 num_bytes, u64 parent,
1852 u64 root_objectid, u64 ref_generation, 1964 u64 root_objectid, u64 owner, u64 offset);
1853 u64 owner_objectid); 1965
1854int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1855 struct btrfs_root *root, u64 bytenr, u64 num_bytes,
1856 u64 orig_parent, u64 parent,
1857 u64 root_objectid, u64 ref_generation,
1858 u64 owner_objectid);
1859int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 1966int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1860 struct btrfs_root *root); 1967 struct btrfs_root *root);
1861int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); 1968int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
@@ -1867,16 +1974,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
1867 u64 size); 1974 u64 size);
1868int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 1975int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
1869 struct btrfs_root *root, u64 group_start); 1976 struct btrfs_root *root, u64 group_start);
1870int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); 1977int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
1871int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, 1978 struct btrfs_block_group_cache *group);
1872 struct btrfs_root *root); 1979
1873int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
1874int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
1875 struct btrfs_root *root,
1876 struct extent_buffer *buf, u64 orig_start);
1877int btrfs_add_dead_reloc_root(struct btrfs_root *root);
1878int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
1879int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
1880u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 1980u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
1881void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); 1981void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
1882void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 1982void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
@@ -1891,13 +1991,12 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
1891void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, 1991void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
1892 u64 bytes); 1992 u64 bytes);
1893/* ctree.c */ 1993/* ctree.c */
1994int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1995 int level, int *slot);
1996int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2);
1894int btrfs_previous_item(struct btrfs_root *root, 1997int btrfs_previous_item(struct btrfs_root *root,
1895 struct btrfs_path *path, u64 min_objectid, 1998 struct btrfs_path *path, u64 min_objectid,
1896 int type); 1999 int type);
1897int btrfs_merge_path(struct btrfs_trans_handle *trans,
1898 struct btrfs_root *root,
1899 struct btrfs_key *node_keys,
1900 u64 *nodes, int lowest_level);
1901int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, 2000int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1902 struct btrfs_root *root, struct btrfs_path *path, 2001 struct btrfs_root *root, struct btrfs_path *path,
1903 struct btrfs_key *new_key); 2002 struct btrfs_key *new_key);
@@ -1918,6 +2017,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
1918 struct btrfs_root *root, 2017 struct btrfs_root *root,
1919 struct extent_buffer *buf, 2018 struct extent_buffer *buf,
1920 struct extent_buffer **cow_ret, u64 new_root_objectid); 2019 struct extent_buffer **cow_ret, u64 new_root_objectid);
2020int btrfs_block_can_be_shared(struct btrfs_root *root,
2021 struct extent_buffer *buf);
1921int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root 2022int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
1922 *root, struct btrfs_path *path, u32 data_size); 2023 *root, struct btrfs_path *path, u32 data_size);
1923int btrfs_truncate_item(struct btrfs_trans_handle *trans, 2024int btrfs_truncate_item(struct btrfs_trans_handle *trans,
@@ -1944,9 +2045,6 @@ void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
1944 2045
1945int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2046int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1946 struct btrfs_path *path, int slot, int nr); 2047 struct btrfs_path *path, int slot, int nr);
1947int btrfs_del_leaf(struct btrfs_trans_handle *trans,
1948 struct btrfs_root *root,
1949 struct btrfs_path *path, u64 bytenr);
1950static inline int btrfs_del_item(struct btrfs_trans_handle *trans, 2048static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
1951 struct btrfs_root *root, 2049 struct btrfs_root *root,
1952 struct btrfs_path *path) 2050 struct btrfs_path *path)
@@ -2005,8 +2103,9 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
2005 btrfs_root_item *item, struct btrfs_key *key); 2103 btrfs_root_item *item, struct btrfs_key *key);
2006int btrfs_search_root(struct btrfs_root *root, u64 search_start, 2104int btrfs_search_root(struct btrfs_root *root, u64 search_start,
2007 u64 *found_objectid); 2105 u64 *found_objectid);
2008int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, 2106int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
2009 struct btrfs_root *latest_root); 2107int btrfs_set_root_node(struct btrfs_root_item *item,
2108 struct extent_buffer *node);
2010/* dir-item.c */ 2109/* dir-item.c */
2011int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, 2110int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
2012 struct btrfs_root *root, const char *name, 2111 struct btrfs_root *root, const char *name,
@@ -2139,7 +2238,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2139int btrfs_readpage(struct file *file, struct page *page); 2238int btrfs_readpage(struct file *file, struct page *page);
2140void btrfs_delete_inode(struct inode *inode); 2239void btrfs_delete_inode(struct inode *inode);
2141void btrfs_put_inode(struct inode *inode); 2240void btrfs_put_inode(struct inode *inode);
2142void btrfs_read_locked_inode(struct inode *inode);
2143int btrfs_write_inode(struct inode *inode, int wait); 2241int btrfs_write_inode(struct inode *inode, int wait);
2144void btrfs_dirty_inode(struct inode *inode); 2242void btrfs_dirty_inode(struct inode *inode);
2145struct inode *btrfs_alloc_inode(struct super_block *sb); 2243struct inode *btrfs_alloc_inode(struct super_block *sb);
@@ -2147,12 +2245,8 @@ void btrfs_destroy_inode(struct inode *inode);
2147int btrfs_init_cachep(void); 2245int btrfs_init_cachep(void);
2148void btrfs_destroy_cachep(void); 2246void btrfs_destroy_cachep(void);
2149long btrfs_ioctl_trans_end(struct file *file); 2247long btrfs_ioctl_trans_end(struct file *file);
2150struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
2151 struct btrfs_root *root, int wait);
2152struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
2153 struct btrfs_root *root);
2154struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 2248struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
2155 struct btrfs_root *root, int *is_new); 2249 struct btrfs_root *root);
2156int btrfs_commit_write(struct file *file, struct page *page, 2250int btrfs_commit_write(struct file *file, struct page *page,
2157 unsigned from, unsigned to); 2251 unsigned from, unsigned to);
2158struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 2252struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2168,6 +2262,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size);
2168 2262
2169/* ioctl.c */ 2263/* ioctl.c */
2170long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 2264long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
2265void btrfs_update_iflags(struct inode *inode);
2266void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
2171 2267
2172/* file.c */ 2268/* file.c */
2173int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); 2269int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
@@ -2205,8 +2301,20 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
2205int btrfs_sync_fs(struct super_block *sb, int wait); 2301int btrfs_sync_fs(struct super_block *sb, int wait);
2206 2302
2207/* acl.c */ 2303/* acl.c */
2304#ifdef CONFIG_FS_POSIX_ACL
2208int btrfs_check_acl(struct inode *inode, int mask); 2305int btrfs_check_acl(struct inode *inode, int mask);
2306#else
2307#define btrfs_check_acl NULL
2308#endif
2209int btrfs_init_acl(struct inode *inode, struct inode *dir); 2309int btrfs_init_acl(struct inode *inode, struct inode *dir);
2210int btrfs_acl_chmod(struct inode *inode); 2310int btrfs_acl_chmod(struct inode *inode);
2211 2311
2312/* relocation.c */
2313int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
2314int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
2315 struct btrfs_root *root);
2316int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
2317 struct btrfs_root *root);
2318int btrfs_recover_relocation(struct btrfs_root *root);
2319int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
2212#endif 2320#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index d6c01c096a40..84e6781413b1 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -29,27 +29,87 @@
29 * add extents in the middle of btrfs_search_slot, and it allows 29 * add extents in the middle of btrfs_search_slot, and it allows
30 * us to buffer up frequently modified backrefs in an rb tree instead 30 * us to buffer up frequently modified backrefs in an rb tree instead
31 * of hammering updates on the extent allocation tree. 31 * of hammering updates on the extent allocation tree.
32 *
33 * Right now this code is only used for reference counted trees, but
34 * the long term goal is to get rid of the similar code for delayed
35 * extent tree modifications.
36 */ 32 */
37 33
38/* 34/*
39 * entries in the rb tree are ordered by the byte number of the extent 35 * compare two delayed tree backrefs with same bytenr and type
40 * and by the byte number of the parent block. 36 */
37static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
38 struct btrfs_delayed_tree_ref *ref1)
39{
40 if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
41 if (ref1->root < ref2->root)
42 return -1;
43 if (ref1->root > ref2->root)
44 return 1;
45 } else {
46 if (ref1->parent < ref2->parent)
47 return -1;
48 if (ref1->parent > ref2->parent)
49 return 1;
50 }
51 return 0;
52}
53
54/*
55 * compare two delayed data backrefs with same bytenr and type
41 */ 56 */
42static int comp_entry(struct btrfs_delayed_ref_node *ref, 57static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
43 u64 bytenr, u64 parent) 58 struct btrfs_delayed_data_ref *ref1)
44{ 59{
45 if (bytenr < ref->bytenr) 60 if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
61 if (ref1->root < ref2->root)
62 return -1;
63 if (ref1->root > ref2->root)
64 return 1;
65 if (ref1->objectid < ref2->objectid)
66 return -1;
67 if (ref1->objectid > ref2->objectid)
68 return 1;
69 if (ref1->offset < ref2->offset)
70 return -1;
71 if (ref1->offset > ref2->offset)
72 return 1;
73 } else {
74 if (ref1->parent < ref2->parent)
75 return -1;
76 if (ref1->parent > ref2->parent)
77 return 1;
78 }
79 return 0;
80}
81
82/*
83 * entries in the rb tree are ordered by the byte number of the extent,
84 * type of the delayed backrefs and content of delayed backrefs.
85 */
86static int comp_entry(struct btrfs_delayed_ref_node *ref2,
87 struct btrfs_delayed_ref_node *ref1)
88{
89 if (ref1->bytenr < ref2->bytenr)
46 return -1; 90 return -1;
47 if (bytenr > ref->bytenr) 91 if (ref1->bytenr > ref2->bytenr)
48 return 1; 92 return 1;
49 if (parent < ref->parent) 93 if (ref1->is_head && ref2->is_head)
94 return 0;
95 if (ref2->is_head)
50 return -1; 96 return -1;
51 if (parent > ref->parent) 97 if (ref1->is_head)
52 return 1; 98 return 1;
99 if (ref1->type < ref2->type)
100 return -1;
101 if (ref1->type > ref2->type)
102 return 1;
103 if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
104 ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
105 return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
106 btrfs_delayed_node_to_tree_ref(ref1));
107 } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
108 ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
109 return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
110 btrfs_delayed_node_to_data_ref(ref1));
111 }
112 BUG();
53 return 0; 113 return 0;
54} 114}
55 115
@@ -59,20 +119,21 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref,
59 * inserted. 119 * inserted.
60 */ 120 */
61static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, 121static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
62 u64 bytenr, u64 parent,
63 struct rb_node *node) 122 struct rb_node *node)
64{ 123{
65 struct rb_node **p = &root->rb_node; 124 struct rb_node **p = &root->rb_node;
66 struct rb_node *parent_node = NULL; 125 struct rb_node *parent_node = NULL;
67 struct btrfs_delayed_ref_node *entry; 126 struct btrfs_delayed_ref_node *entry;
127 struct btrfs_delayed_ref_node *ins;
68 int cmp; 128 int cmp;
69 129
130 ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
70 while (*p) { 131 while (*p) {
71 parent_node = *p; 132 parent_node = *p;
72 entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, 133 entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
73 rb_node); 134 rb_node);
74 135
75 cmp = comp_entry(entry, bytenr, parent); 136 cmp = comp_entry(entry, ins);
76 if (cmp < 0) 137 if (cmp < 0)
77 p = &(*p)->rb_left; 138 p = &(*p)->rb_left;
78 else if (cmp > 0) 139 else if (cmp > 0)
@@ -81,18 +142,17 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
81 return entry; 142 return entry;
82 } 143 }
83 144
84 entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
85 rb_link_node(node, parent_node, p); 145 rb_link_node(node, parent_node, p);
86 rb_insert_color(node, root); 146 rb_insert_color(node, root);
87 return NULL; 147 return NULL;
88} 148}
89 149
90/* 150/*
91 * find an entry based on (bytenr,parent). This returns the delayed 151 * find an head entry based on bytenr. This returns the delayed ref
92 * ref if it was able to find one, or NULL if nothing was in that spot 152 * head if it was able to find one, or NULL if nothing was in that spot
93 */ 153 */
94static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root, 154static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
95 u64 bytenr, u64 parent, 155 u64 bytenr,
96 struct btrfs_delayed_ref_node **last) 156 struct btrfs_delayed_ref_node **last)
97{ 157{
98 struct rb_node *n = root->rb_node; 158 struct rb_node *n = root->rb_node;
@@ -105,7 +165,15 @@ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
105 if (last) 165 if (last)
106 *last = entry; 166 *last = entry;
107 167
108 cmp = comp_entry(entry, bytenr, parent); 168 if (bytenr < entry->bytenr)
169 cmp = -1;
170 else if (bytenr > entry->bytenr)
171 cmp = 1;
172 else if (!btrfs_delayed_ref_is_head(entry))
173 cmp = 1;
174 else
175 cmp = 0;
176
109 if (cmp < 0) 177 if (cmp < 0)
110 n = n->rb_left; 178 n = n->rb_left;
111 else if (cmp > 0) 179 else if (cmp > 0)
@@ -154,7 +222,7 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
154 node = rb_first(&delayed_refs->root); 222 node = rb_first(&delayed_refs->root);
155 } else { 223 } else {
156 ref = NULL; 224 ref = NULL;
157 tree_search(&delayed_refs->root, start, (u64)-1, &ref); 225 find_ref_head(&delayed_refs->root, start, &ref);
158 if (ref) { 226 if (ref) {
159 struct btrfs_delayed_ref_node *tmp; 227 struct btrfs_delayed_ref_node *tmp;
160 228
@@ -234,7 +302,7 @@ int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
234 delayed_refs = &trans->transaction->delayed_refs; 302 delayed_refs = &trans->transaction->delayed_refs;
235 spin_lock(&delayed_refs->lock); 303 spin_lock(&delayed_refs->lock);
236 304
237 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); 305 ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
238 if (ref) { 306 if (ref) {
239 prev_node = rb_prev(&ref->rb_node); 307 prev_node = rb_prev(&ref->rb_node);
240 if (!prev_node) 308 if (!prev_node)
@@ -250,25 +318,28 @@ out:
250} 318}
251 319
252/* 320/*
253 * helper function to lookup reference count 321 * helper function to lookup reference count and flags of extent.
254 * 322 *
255 * the head node for delayed ref is used to store the sum of all the 323 * the head node for delayed ref is used to store the sum of all the
256 * reference count modifications queued up in the rbtree. This way you 324 * reference count modifications queued up in the rbtree. the head
257 * can check to see what the reference count would be if all of the 325 * node may also store the extent flags to set. This way you can check
258 * delayed refs are processed. 326 * to see what the reference count and extent flags would be if all of
327 * the delayed refs are not processed.
259 */ 328 */
260int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, 329int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
261 struct btrfs_root *root, u64 bytenr, 330 struct btrfs_root *root, u64 bytenr,
262 u64 num_bytes, u32 *refs) 331 u64 num_bytes, u64 *refs, u64 *flags)
263{ 332{
264 struct btrfs_delayed_ref_node *ref; 333 struct btrfs_delayed_ref_node *ref;
265 struct btrfs_delayed_ref_head *head; 334 struct btrfs_delayed_ref_head *head;
266 struct btrfs_delayed_ref_root *delayed_refs; 335 struct btrfs_delayed_ref_root *delayed_refs;
267 struct btrfs_path *path; 336 struct btrfs_path *path;
268 struct extent_buffer *leaf;
269 struct btrfs_extent_item *ei; 337 struct btrfs_extent_item *ei;
338 struct extent_buffer *leaf;
270 struct btrfs_key key; 339 struct btrfs_key key;
271 u32 num_refs; 340 u32 item_size;
341 u64 num_refs;
342 u64 extent_flags;
272 int ret; 343 int ret;
273 344
274 path = btrfs_alloc_path(); 345 path = btrfs_alloc_path();
@@ -287,37 +358,60 @@ again:
287 358
288 if (ret == 0) { 359 if (ret == 0) {
289 leaf = path->nodes[0]; 360 leaf = path->nodes[0];
290 ei = btrfs_item_ptr(leaf, path->slots[0], 361 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
291 struct btrfs_extent_item); 362 if (item_size >= sizeof(*ei)) {
292 num_refs = btrfs_extent_refs(leaf, ei); 363 ei = btrfs_item_ptr(leaf, path->slots[0],
364 struct btrfs_extent_item);
365 num_refs = btrfs_extent_refs(leaf, ei);
366 extent_flags = btrfs_extent_flags(leaf, ei);
367 } else {
368#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
369 struct btrfs_extent_item_v0 *ei0;
370 BUG_ON(item_size != sizeof(*ei0));
371 ei0 = btrfs_item_ptr(leaf, path->slots[0],
372 struct btrfs_extent_item_v0);
373 num_refs = btrfs_extent_refs_v0(leaf, ei0);
374 /* FIXME: this isn't correct for data */
375 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
376#else
377 BUG();
378#endif
379 }
380 BUG_ON(num_refs == 0);
293 } else { 381 } else {
294 num_refs = 0; 382 num_refs = 0;
383 extent_flags = 0;
295 ret = 0; 384 ret = 0;
296 } 385 }
297 386
298 spin_lock(&delayed_refs->lock); 387 spin_lock(&delayed_refs->lock);
299 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); 388 ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
300 if (ref) { 389 if (ref) {
301 head = btrfs_delayed_node_to_head(ref); 390 head = btrfs_delayed_node_to_head(ref);
302 if (mutex_trylock(&head->mutex)) { 391 if (!mutex_trylock(&head->mutex)) {
303 num_refs += ref->ref_mod; 392 atomic_inc(&ref->refs);
304 mutex_unlock(&head->mutex); 393 spin_unlock(&delayed_refs->lock);
305 *refs = num_refs;
306 goto out;
307 }
308 394
309 atomic_inc(&ref->refs); 395 btrfs_release_path(root->fs_info->extent_root, path);
310 spin_unlock(&delayed_refs->lock);
311 396
312 btrfs_release_path(root->fs_info->extent_root, path); 397 mutex_lock(&head->mutex);
398 mutex_unlock(&head->mutex);
399 btrfs_put_delayed_ref(ref);
400 goto again;
401 }
402 if (head->extent_op && head->extent_op->update_flags)
403 extent_flags |= head->extent_op->flags_to_set;
404 else
405 BUG_ON(num_refs == 0);
313 406
314 mutex_lock(&head->mutex); 407 num_refs += ref->ref_mod;
315 mutex_unlock(&head->mutex); 408 mutex_unlock(&head->mutex);
316 btrfs_put_delayed_ref(ref);
317 goto again;
318 } else {
319 *refs = num_refs;
320 } 409 }
410 WARN_ON(num_refs == 0);
411 if (refs)
412 *refs = num_refs;
413 if (flags)
414 *flags = extent_flags;
321out: 415out:
322 spin_unlock(&delayed_refs->lock); 416 spin_unlock(&delayed_refs->lock);
323 btrfs_free_path(path); 417 btrfs_free_path(path);
@@ -338,16 +432,7 @@ update_existing_ref(struct btrfs_trans_handle *trans,
338 struct btrfs_delayed_ref_node *existing, 432 struct btrfs_delayed_ref_node *existing,
339 struct btrfs_delayed_ref_node *update) 433 struct btrfs_delayed_ref_node *update)
340{ 434{
341 struct btrfs_delayed_ref *existing_ref; 435 if (update->action != existing->action) {
342 struct btrfs_delayed_ref *ref;
343
344 existing_ref = btrfs_delayed_node_to_ref(existing);
345 ref = btrfs_delayed_node_to_ref(update);
346
347 if (ref->pin)
348 existing_ref->pin = 1;
349
350 if (ref->action != existing_ref->action) {
351 /* 436 /*
352 * this is effectively undoing either an add or a 437 * this is effectively undoing either an add or a
353 * drop. We decrement the ref_mod, and if it goes 438 * drop. We decrement the ref_mod, and if it goes
@@ -363,20 +448,13 @@ update_existing_ref(struct btrfs_trans_handle *trans,
363 delayed_refs->num_entries--; 448 delayed_refs->num_entries--;
364 if (trans->delayed_ref_updates) 449 if (trans->delayed_ref_updates)
365 trans->delayed_ref_updates--; 450 trans->delayed_ref_updates--;
451 } else {
452 WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
453 existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
366 } 454 }
367 } else { 455 } else {
368 if (existing_ref->action == BTRFS_ADD_DELAYED_REF) { 456 WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
369 /* if we're adding refs, make sure all the 457 existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
370 * details match up. The extent could
371 * have been totally freed and reallocated
372 * by a different owner before the delayed
373 * ref entries were removed.
374 */
375 existing_ref->owner_objectid = ref->owner_objectid;
376 existing_ref->generation = ref->generation;
377 existing_ref->root = ref->root;
378 existing->num_bytes = update->num_bytes;
379 }
380 /* 458 /*
381 * the action on the existing ref matches 459 * the action on the existing ref matches
382 * the action on the ref we're trying to add. 460 * the action on the ref we're trying to add.
@@ -401,6 +479,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
401 479
402 existing_ref = btrfs_delayed_node_to_head(existing); 480 existing_ref = btrfs_delayed_node_to_head(existing);
403 ref = btrfs_delayed_node_to_head(update); 481 ref = btrfs_delayed_node_to_head(update);
482 BUG_ON(existing_ref->is_data != ref->is_data);
404 483
405 if (ref->must_insert_reserved) { 484 if (ref->must_insert_reserved) {
406 /* if the extent was freed and then 485 /* if the extent was freed and then
@@ -420,6 +499,24 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
420 499
421 } 500 }
422 501
502 if (ref->extent_op) {
503 if (!existing_ref->extent_op) {
504 existing_ref->extent_op = ref->extent_op;
505 } else {
506 if (ref->extent_op->update_key) {
507 memcpy(&existing_ref->extent_op->key,
508 &ref->extent_op->key,
509 sizeof(ref->extent_op->key));
510 existing_ref->extent_op->update_key = 1;
511 }
512 if (ref->extent_op->update_flags) {
513 existing_ref->extent_op->flags_to_set |=
514 ref->extent_op->flags_to_set;
515 existing_ref->extent_op->update_flags = 1;
516 }
517 kfree(ref->extent_op);
518 }
519 }
423 /* 520 /*
424 * update the reference mod on the head to reflect this new operation 521 * update the reference mod on the head to reflect this new operation
425 */ 522 */
@@ -427,19 +524,16 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
427} 524}
428 525
429/* 526/*
430 * helper function to actually insert a delayed ref into the rbtree. 527 * helper function to actually insert a head node into the rbtree.
431 * this does all the dirty work in terms of maintaining the correct 528 * this does all the dirty work in terms of maintaining the correct
432 * overall modification count in the head node and properly dealing 529 * overall modification count.
433 * with updating existing nodes as new modifications are queued.
434 */ 530 */
435static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, 531static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
436 struct btrfs_delayed_ref_node *ref, 532 struct btrfs_delayed_ref_node *ref,
437 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, 533 u64 bytenr, u64 num_bytes,
438 u64 ref_generation, u64 owner_objectid, int action, 534 int action, int is_data)
439 int pin)
440{ 535{
441 struct btrfs_delayed_ref_node *existing; 536 struct btrfs_delayed_ref_node *existing;
442 struct btrfs_delayed_ref *full_ref;
443 struct btrfs_delayed_ref_head *head_ref = NULL; 537 struct btrfs_delayed_ref_head *head_ref = NULL;
444 struct btrfs_delayed_ref_root *delayed_refs; 538 struct btrfs_delayed_ref_root *delayed_refs;
445 int count_mod = 1; 539 int count_mod = 1;
@@ -449,12 +543,10 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
449 * the head node stores the sum of all the mods, so dropping a ref 543 * the head node stores the sum of all the mods, so dropping a ref
450 * should drop the sum in the head node by one. 544 * should drop the sum in the head node by one.
451 */ 545 */
452 if (parent == (u64)-1) { 546 if (action == BTRFS_UPDATE_DELAYED_HEAD)
453 if (action == BTRFS_DROP_DELAYED_REF) 547 count_mod = 0;
454 count_mod = -1; 548 else if (action == BTRFS_DROP_DELAYED_REF)
455 else if (action == BTRFS_UPDATE_DELAYED_HEAD) 549 count_mod = -1;
456 count_mod = 0;
457 }
458 550
459 /* 551 /*
460 * BTRFS_ADD_DELAYED_EXTENT means that we need to update 552 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
@@ -467,57 +559,148 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
467 * Once we record must_insert_reserved, switch the action to 559 * Once we record must_insert_reserved, switch the action to
468 * BTRFS_ADD_DELAYED_REF because other special casing is not required. 560 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
469 */ 561 */
470 if (action == BTRFS_ADD_DELAYED_EXTENT) { 562 if (action == BTRFS_ADD_DELAYED_EXTENT)
471 must_insert_reserved = 1; 563 must_insert_reserved = 1;
472 action = BTRFS_ADD_DELAYED_REF; 564 else
473 } else {
474 must_insert_reserved = 0; 565 must_insert_reserved = 0;
475 }
476
477 566
478 delayed_refs = &trans->transaction->delayed_refs; 567 delayed_refs = &trans->transaction->delayed_refs;
479 568
480 /* first set the basic ref node struct up */ 569 /* first set the basic ref node struct up */
481 atomic_set(&ref->refs, 1); 570 atomic_set(&ref->refs, 1);
482 ref->bytenr = bytenr; 571 ref->bytenr = bytenr;
483 ref->parent = parent; 572 ref->num_bytes = num_bytes;
484 ref->ref_mod = count_mod; 573 ref->ref_mod = count_mod;
574 ref->type = 0;
575 ref->action = 0;
576 ref->is_head = 1;
485 ref->in_tree = 1; 577 ref->in_tree = 1;
578
579 head_ref = btrfs_delayed_node_to_head(ref);
580 head_ref->must_insert_reserved = must_insert_reserved;
581 head_ref->is_data = is_data;
582
583 INIT_LIST_HEAD(&head_ref->cluster);
584 mutex_init(&head_ref->mutex);
585
586 existing = tree_insert(&delayed_refs->root, &ref->rb_node);
587
588 if (existing) {
589 update_existing_head_ref(existing, ref);
590 /*
591 * we've updated the existing ref, free the newly
592 * allocated ref
593 */
594 kfree(ref);
595 } else {
596 delayed_refs->num_heads++;
597 delayed_refs->num_heads_ready++;
598 delayed_refs->num_entries++;
599 trans->delayed_ref_updates++;
600 }
601 return 0;
602}
603
604/*
605 * helper to insert a delayed tree ref into the rbtree.
606 */
607static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
608 struct btrfs_delayed_ref_node *ref,
609 u64 bytenr, u64 num_bytes, u64 parent,
610 u64 ref_root, int level, int action)
611{
612 struct btrfs_delayed_ref_node *existing;
613 struct btrfs_delayed_tree_ref *full_ref;
614 struct btrfs_delayed_ref_root *delayed_refs;
615
616 if (action == BTRFS_ADD_DELAYED_EXTENT)
617 action = BTRFS_ADD_DELAYED_REF;
618
619 delayed_refs = &trans->transaction->delayed_refs;
620
621 /* first set the basic ref node struct up */
622 atomic_set(&ref->refs, 1);
623 ref->bytenr = bytenr;
486 ref->num_bytes = num_bytes; 624 ref->num_bytes = num_bytes;
625 ref->ref_mod = 1;
626 ref->action = action;
627 ref->is_head = 0;
628 ref->in_tree = 1;
487 629
488 if (btrfs_delayed_ref_is_head(ref)) { 630 full_ref = btrfs_delayed_node_to_tree_ref(ref);
489 head_ref = btrfs_delayed_node_to_head(ref); 631 if (parent) {
490 head_ref->must_insert_reserved = must_insert_reserved; 632 full_ref->parent = parent;
491 INIT_LIST_HEAD(&head_ref->cluster); 633 ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
492 mutex_init(&head_ref->mutex);
493 } else { 634 } else {
494 full_ref = btrfs_delayed_node_to_ref(ref);
495 full_ref->root = ref_root; 635 full_ref->root = ref_root;
496 full_ref->generation = ref_generation; 636 ref->type = BTRFS_TREE_BLOCK_REF_KEY;
497 full_ref->owner_objectid = owner_objectid;
498 full_ref->pin = pin;
499 full_ref->action = action;
500 } 637 }
638 full_ref->level = level;
501 639
502 existing = tree_insert(&delayed_refs->root, bytenr, 640 existing = tree_insert(&delayed_refs->root, &ref->rb_node);
503 parent, &ref->rb_node);
504 641
505 if (existing) { 642 if (existing) {
506 if (btrfs_delayed_ref_is_head(ref)) 643 update_existing_ref(trans, delayed_refs, existing, ref);
507 update_existing_head_ref(existing, ref); 644 /*
508 else 645 * we've updated the existing ref, free the newly
509 update_existing_ref(trans, delayed_refs, existing, ref); 646 * allocated ref
647 */
648 kfree(ref);
649 } else {
650 delayed_refs->num_entries++;
651 trans->delayed_ref_updates++;
652 }
653 return 0;
654}
655
656/*
657 * helper to insert a delayed data ref into the rbtree.
658 */
659static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
660 struct btrfs_delayed_ref_node *ref,
661 u64 bytenr, u64 num_bytes, u64 parent,
662 u64 ref_root, u64 owner, u64 offset,
663 int action)
664{
665 struct btrfs_delayed_ref_node *existing;
666 struct btrfs_delayed_data_ref *full_ref;
667 struct btrfs_delayed_ref_root *delayed_refs;
668
669 if (action == BTRFS_ADD_DELAYED_EXTENT)
670 action = BTRFS_ADD_DELAYED_REF;
671
672 delayed_refs = &trans->transaction->delayed_refs;
673
674 /* first set the basic ref node struct up */
675 atomic_set(&ref->refs, 1);
676 ref->bytenr = bytenr;
677 ref->num_bytes = num_bytes;
678 ref->ref_mod = 1;
679 ref->action = action;
680 ref->is_head = 0;
681 ref->in_tree = 1;
682
683 full_ref = btrfs_delayed_node_to_data_ref(ref);
684 if (parent) {
685 full_ref->parent = parent;
686 ref->type = BTRFS_SHARED_DATA_REF_KEY;
687 } else {
688 full_ref->root = ref_root;
689 ref->type = BTRFS_EXTENT_DATA_REF_KEY;
690 }
691 full_ref->objectid = owner;
692 full_ref->offset = offset;
510 693
694 existing = tree_insert(&delayed_refs->root, &ref->rb_node);
695
696 if (existing) {
697 update_existing_ref(trans, delayed_refs, existing, ref);
511 /* 698 /*
512 * we've updated the existing ref, free the newly 699 * we've updated the existing ref, free the newly
513 * allocated ref 700 * allocated ref
514 */ 701 */
515 kfree(ref); 702 kfree(ref);
516 } else { 703 } else {
517 if (btrfs_delayed_ref_is_head(ref)) {
518 delayed_refs->num_heads++;
519 delayed_refs->num_heads_ready++;
520 }
521 delayed_refs->num_entries++; 704 delayed_refs->num_entries++;
522 trans->delayed_ref_updates++; 705 trans->delayed_ref_updates++;
523 } 706 }
@@ -525,37 +708,78 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
525} 708}
526 709
527/* 710/*
528 * add a delayed ref to the tree. This does all of the accounting required 711 * add a delayed tree ref. This does all of the accounting required
529 * to make sure the delayed ref is eventually processed before this 712 * to make sure the delayed ref is eventually processed before this
530 * transaction commits. 713 * transaction commits.
531 */ 714 */
532int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, 715int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
533 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, 716 u64 bytenr, u64 num_bytes, u64 parent,
534 u64 ref_generation, u64 owner_objectid, int action, 717 u64 ref_root, int level, int action,
535 int pin) 718 struct btrfs_delayed_extent_op *extent_op)
536{ 719{
537 struct btrfs_delayed_ref *ref; 720 struct btrfs_delayed_tree_ref *ref;
538 struct btrfs_delayed_ref_head *head_ref; 721 struct btrfs_delayed_ref_head *head_ref;
539 struct btrfs_delayed_ref_root *delayed_refs; 722 struct btrfs_delayed_ref_root *delayed_refs;
540 int ret; 723 int ret;
541 724
725 BUG_ON(extent_op && extent_op->is_data);
542 ref = kmalloc(sizeof(*ref), GFP_NOFS); 726 ref = kmalloc(sizeof(*ref), GFP_NOFS);
543 if (!ref) 727 if (!ref)
544 return -ENOMEM; 728 return -ENOMEM;
545 729
730 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
731 if (!head_ref) {
732 kfree(ref);
733 return -ENOMEM;
734 }
735
736 head_ref->extent_op = extent_op;
737
738 delayed_refs = &trans->transaction->delayed_refs;
739 spin_lock(&delayed_refs->lock);
740
546 /* 741 /*
547 * the parent = 0 case comes from cases where we don't actually 742 * insert both the head node and the new ref without dropping
548 * know the parent yet. It will get updated later via a add/drop 743 * the spin lock
549 * pair.
550 */ 744 */
551 if (parent == 0) 745 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
552 parent = bytenr; 746 action, 0);
747 BUG_ON(ret);
748
749 ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes,
750 parent, ref_root, level, action);
751 BUG_ON(ret);
752 spin_unlock(&delayed_refs->lock);
753 return 0;
754}
755
756/*
757 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
758 */
759int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
760 u64 bytenr, u64 num_bytes,
761 u64 parent, u64 ref_root,
762 u64 owner, u64 offset, int action,
763 struct btrfs_delayed_extent_op *extent_op)
764{
765 struct btrfs_delayed_data_ref *ref;
766 struct btrfs_delayed_ref_head *head_ref;
767 struct btrfs_delayed_ref_root *delayed_refs;
768 int ret;
769
770 BUG_ON(extent_op && !extent_op->is_data);
771 ref = kmalloc(sizeof(*ref), GFP_NOFS);
772 if (!ref)
773 return -ENOMEM;
553 774
554 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); 775 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
555 if (!head_ref) { 776 if (!head_ref) {
556 kfree(ref); 777 kfree(ref);
557 return -ENOMEM; 778 return -ENOMEM;
558 } 779 }
780
781 head_ref->extent_op = extent_op;
782
559 delayed_refs = &trans->transaction->delayed_refs; 783 delayed_refs = &trans->transaction->delayed_refs;
560 spin_lock(&delayed_refs->lock); 784 spin_lock(&delayed_refs->lock);
561 785
@@ -563,14 +787,39 @@ int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
563 * insert both the head node and the new ref without dropping 787 * insert both the head node and the new ref without dropping
564 * the spin lock 788 * the spin lock
565 */ 789 */
566 ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes, 790 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
567 (u64)-1, 0, 0, 0, action, pin); 791 action, 1);
568 BUG_ON(ret); 792 BUG_ON(ret);
569 793
570 ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes, 794 ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes,
571 parent, ref_root, ref_generation, 795 parent, ref_root, owner, offset, action);
572 owner_objectid, action, pin); 796 BUG_ON(ret);
797 spin_unlock(&delayed_refs->lock);
798 return 0;
799}
800
801int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
802 u64 bytenr, u64 num_bytes,
803 struct btrfs_delayed_extent_op *extent_op)
804{
805 struct btrfs_delayed_ref_head *head_ref;
806 struct btrfs_delayed_ref_root *delayed_refs;
807 int ret;
808
809 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
810 if (!head_ref)
811 return -ENOMEM;
812
813 head_ref->extent_op = extent_op;
814
815 delayed_refs = &trans->transaction->delayed_refs;
816 spin_lock(&delayed_refs->lock);
817
818 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr,
819 num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
820 extent_op->is_data);
573 BUG_ON(ret); 821 BUG_ON(ret);
822
574 spin_unlock(&delayed_refs->lock); 823 spin_unlock(&delayed_refs->lock);
575 return 0; 824 return 0;
576} 825}
@@ -587,7 +836,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
587 struct btrfs_delayed_ref_root *delayed_refs; 836 struct btrfs_delayed_ref_root *delayed_refs;
588 837
589 delayed_refs = &trans->transaction->delayed_refs; 838 delayed_refs = &trans->transaction->delayed_refs;
590 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); 839 ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
591 if (ref) 840 if (ref)
592 return btrfs_delayed_node_to_head(ref); 841 return btrfs_delayed_node_to_head(ref);
593 return NULL; 842 return NULL;
@@ -603,6 +852,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
603 * 852 *
604 * It is the same as doing a ref add and delete in two separate calls. 853 * It is the same as doing a ref add and delete in two separate calls.
605 */ 854 */
855#if 0
606int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, 856int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
607 u64 bytenr, u64 num_bytes, u64 orig_parent, 857 u64 bytenr, u64 num_bytes, u64 orig_parent,
608 u64 parent, u64 orig_ref_root, u64 ref_root, 858 u64 parent, u64 orig_ref_root, u64 ref_root,
@@ -666,3 +916,4 @@ int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
666 spin_unlock(&delayed_refs->lock); 916 spin_unlock(&delayed_refs->lock);
667 return 0; 917 return 0;
668} 918}
919#endif
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 3bec2ff0b15c..f6fc67ddad36 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -30,9 +30,6 @@ struct btrfs_delayed_ref_node {
30 /* the starting bytenr of the extent */ 30 /* the starting bytenr of the extent */
31 u64 bytenr; 31 u64 bytenr;
32 32
33 /* the parent our backref will point to */
34 u64 parent;
35
36 /* the size of the extent */ 33 /* the size of the extent */
37 u64 num_bytes; 34 u64 num_bytes;
38 35
@@ -50,10 +47,21 @@ struct btrfs_delayed_ref_node {
50 */ 47 */
51 int ref_mod; 48 int ref_mod;
52 49
50 unsigned int action:8;
51 unsigned int type:8;
53 /* is this node still in the rbtree? */ 52 /* is this node still in the rbtree? */
53 unsigned int is_head:1;
54 unsigned int in_tree:1; 54 unsigned int in_tree:1;
55}; 55};
56 56
57struct btrfs_delayed_extent_op {
58 struct btrfs_disk_key key;
59 u64 flags_to_set;
60 unsigned int update_key:1;
61 unsigned int update_flags:1;
62 unsigned int is_data:1;
63};
64
57/* 65/*
58 * the head refs are used to hold a lock on a given extent, which allows us 66 * the head refs are used to hold a lock on a given extent, which allows us
59 * to make sure that only one process is running the delayed refs 67 * to make sure that only one process is running the delayed refs
@@ -71,6 +79,7 @@ struct btrfs_delayed_ref_head {
71 79
72 struct list_head cluster; 80 struct list_head cluster;
73 81
82 struct btrfs_delayed_extent_op *extent_op;
74 /* 83 /*
75 * when a new extent is allocated, it is just reserved in memory 84 * when a new extent is allocated, it is just reserved in memory
76 * The actual extent isn't inserted into the extent allocation tree 85 * The actual extent isn't inserted into the extent allocation tree
@@ -84,27 +93,26 @@ struct btrfs_delayed_ref_head {
84 * the free has happened. 93 * the free has happened.
85 */ 94 */
86 unsigned int must_insert_reserved:1; 95 unsigned int must_insert_reserved:1;
96 unsigned int is_data:1;
87}; 97};
88 98
89struct btrfs_delayed_ref { 99struct btrfs_delayed_tree_ref {
90 struct btrfs_delayed_ref_node node; 100 struct btrfs_delayed_ref_node node;
101 union {
102 u64 root;
103 u64 parent;
104 };
105 int level;
106};
91 107
92 /* the root objectid our ref will point to */ 108struct btrfs_delayed_data_ref {
93 u64 root; 109 struct btrfs_delayed_ref_node node;
94 110 union {
95 /* the generation for the backref */ 111 u64 root;
96 u64 generation; 112 u64 parent;
97 113 };
98 /* owner_objectid of the backref */ 114 u64 objectid;
99 u64 owner_objectid; 115 u64 offset;
100
101 /* operation done by this entry in the rbtree */
102 u8 action;
103
104 /* if pin == 1, when the extent is freed it will be pinned until
105 * transaction commit
106 */
107 unsigned int pin:1;
108}; 116};
109 117
110struct btrfs_delayed_ref_root { 118struct btrfs_delayed_ref_root {
@@ -143,17 +151,25 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
143 } 151 }
144} 152}
145 153
146int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, 154int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
147 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, 155 u64 bytenr, u64 num_bytes, u64 parent,
148 u64 ref_generation, u64 owner_objectid, int action, 156 u64 ref_root, int level, int action,
149 int pin); 157 struct btrfs_delayed_extent_op *extent_op);
158int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
159 u64 bytenr, u64 num_bytes,
160 u64 parent, u64 ref_root,
161 u64 owner, u64 offset, int action,
162 struct btrfs_delayed_extent_op *extent_op);
163int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
164 u64 bytenr, u64 num_bytes,
165 struct btrfs_delayed_extent_op *extent_op);
150 166
151struct btrfs_delayed_ref_head * 167struct btrfs_delayed_ref_head *
152btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); 168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
153int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); 169int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
154int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, 170int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
155 struct btrfs_root *root, u64 bytenr, 171 struct btrfs_root *root, u64 bytenr,
156 u64 num_bytes, u32 *refs); 172 u64 num_bytes, u64 *refs, u64 *flags);
157int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, 173int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
158 u64 bytenr, u64 num_bytes, u64 orig_parent, 174 u64 bytenr, u64 num_bytes, u64 orig_parent,
159 u64 parent, u64 orig_ref_root, u64 ref_root, 175 u64 parent, u64 orig_ref_root, u64 ref_root,
@@ -169,18 +185,24 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
169 */ 185 */
170static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) 186static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
171{ 187{
172 return node->parent == (u64)-1; 188 return node->is_head;
173} 189}
174 190
175/* 191/*
176 * helper functions to cast a node into its container 192 * helper functions to cast a node into its container
177 */ 193 */
178static inline struct btrfs_delayed_ref * 194static inline struct btrfs_delayed_tree_ref *
179btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node) 195btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node)
180{ 196{
181 WARN_ON(btrfs_delayed_ref_is_head(node)); 197 WARN_ON(btrfs_delayed_ref_is_head(node));
182 return container_of(node, struct btrfs_delayed_ref, node); 198 return container_of(node, struct btrfs_delayed_tree_ref, node);
199}
183 200
201static inline struct btrfs_delayed_data_ref *
202btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node)
203{
204 WARN_ON(btrfs_delayed_ref_is_head(node));
205 return container_of(node, struct btrfs_delayed_data_ref, node);
184} 206}
185 207
186static inline struct btrfs_delayed_ref_head * 208static inline struct btrfs_delayed_ref_head *
@@ -188,6 +210,5 @@ btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
188{ 210{
189 WARN_ON(!btrfs_delayed_ref_is_head(node)); 211 WARN_ON(!btrfs_delayed_ref_is_head(node));
190 return container_of(node, struct btrfs_delayed_ref_head, node); 212 return container_of(node, struct btrfs_delayed_ref_head, node);
191
192} 213}
193#endif 214#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4b0ea0b80c23..0d50d49d990a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -26,8 +26,8 @@
26#include <linux/workqueue.h> 26#include <linux/workqueue.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/crc32c.h>
29#include "compat.h" 30#include "compat.h"
30#include "crc32c.h"
31#include "ctree.h" 31#include "ctree.h"
32#include "disk-io.h" 32#include "disk-io.h"
33#include "transaction.h" 33#include "transaction.h"
@@ -36,7 +36,6 @@
36#include "print-tree.h" 36#include "print-tree.h"
37#include "async-thread.h" 37#include "async-thread.h"
38#include "locking.h" 38#include "locking.h"
39#include "ref-cache.h"
40#include "tree-log.h" 39#include "tree-log.h"
41#include "free-space-cache.h" 40#include "free-space-cache.h"
42 41
@@ -172,7 +171,7 @@ out:
172 171
173u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) 172u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
174{ 173{
175 return btrfs_crc32c(seed, data, len); 174 return crc32c(seed, data, len);
176} 175}
177 176
178void btrfs_csum_final(u32 crc, char *result) 177void btrfs_csum_final(u32 crc, char *result)
@@ -884,7 +883,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
884{ 883{
885 root->node = NULL; 884 root->node = NULL;
886 root->commit_root = NULL; 885 root->commit_root = NULL;
887 root->ref_tree = NULL;
888 root->sectorsize = sectorsize; 886 root->sectorsize = sectorsize;
889 root->nodesize = nodesize; 887 root->nodesize = nodesize;
890 root->leafsize = leafsize; 888 root->leafsize = leafsize;
@@ -899,12 +897,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
899 root->last_inode_alloc = 0; 897 root->last_inode_alloc = 0;
900 root->name = NULL; 898 root->name = NULL;
901 root->in_sysfs = 0; 899 root->in_sysfs = 0;
900 root->inode_tree.rb_node = NULL;
902 901
903 INIT_LIST_HEAD(&root->dirty_list); 902 INIT_LIST_HEAD(&root->dirty_list);
904 INIT_LIST_HEAD(&root->orphan_list); 903 INIT_LIST_HEAD(&root->orphan_list);
905 INIT_LIST_HEAD(&root->dead_list); 904 INIT_LIST_HEAD(&root->root_list);
906 spin_lock_init(&root->node_lock); 905 spin_lock_init(&root->node_lock);
907 spin_lock_init(&root->list_lock); 906 spin_lock_init(&root->list_lock);
907 spin_lock_init(&root->inode_lock);
908 mutex_init(&root->objectid_mutex); 908 mutex_init(&root->objectid_mutex);
909 mutex_init(&root->log_mutex); 909 mutex_init(&root->log_mutex);
910 init_waitqueue_head(&root->log_writer_wait); 910 init_waitqueue_head(&root->log_writer_wait);
@@ -918,9 +918,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
918 extent_io_tree_init(&root->dirty_log_pages, 918 extent_io_tree_init(&root->dirty_log_pages,
919 fs_info->btree_inode->i_mapping, GFP_NOFS); 919 fs_info->btree_inode->i_mapping, GFP_NOFS);
920 920
921 btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
922 root->ref_tree = &root->ref_tree_struct;
923
924 memset(&root->root_key, 0, sizeof(root->root_key)); 921 memset(&root->root_key, 0, sizeof(root->root_key));
925 memset(&root->root_item, 0, sizeof(root->root_item)); 922 memset(&root->root_item, 0, sizeof(root->root_item));
926 memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); 923 memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -959,6 +956,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
959 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 956 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
960 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 957 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
961 blocksize, generation); 958 blocksize, generation);
959 root->commit_root = btrfs_root_node(root);
962 BUG_ON(!root->node); 960 BUG_ON(!root->node);
963 return 0; 961 return 0;
964} 962}
@@ -1025,20 +1023,19 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1025 */ 1023 */
1026 root->ref_cows = 0; 1024 root->ref_cows = 0;
1027 1025
1028 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 1026 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
1029 0, BTRFS_TREE_LOG_OBJECTID, 1027 BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
1030 trans->transid, 0, 0, 0);
1031 if (IS_ERR(leaf)) { 1028 if (IS_ERR(leaf)) {
1032 kfree(root); 1029 kfree(root);
1033 return ERR_CAST(leaf); 1030 return ERR_CAST(leaf);
1034 } 1031 }
1035 1032
1033 memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
1034 btrfs_set_header_bytenr(leaf, leaf->start);
1035 btrfs_set_header_generation(leaf, trans->transid);
1036 btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
1037 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
1036 root->node = leaf; 1038 root->node = leaf;
1037 btrfs_set_header_nritems(root->node, 0);
1038 btrfs_set_header_level(root->node, 0);
1039 btrfs_set_header_bytenr(root->node, root->node->start);
1040 btrfs_set_header_generation(root->node, trans->transid);
1041 btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);
1042 1039
1043 write_extent_buffer(root->node, root->fs_info->fsid, 1040 write_extent_buffer(root->node, root->fs_info->fsid,
1044 (unsigned long)btrfs_header_fsid(root->node), 1041 (unsigned long)btrfs_header_fsid(root->node),
@@ -1081,8 +1078,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1081 inode_item->nbytes = cpu_to_le64(root->leafsize); 1078 inode_item->nbytes = cpu_to_le64(root->leafsize);
1082 inode_item->mode = cpu_to_le32(S_IFDIR | 0755); 1079 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
1083 1080
1084 btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start); 1081 btrfs_set_root_node(&log_root->root_item, log_root->node);
1085 btrfs_set_root_generation(&log_root->root_item, trans->transid);
1086 1082
1087 WARN_ON(root->log_root); 1083 WARN_ON(root->log_root);
1088 root->log_root = log_root; 1084 root->log_root = log_root;
@@ -1144,6 +1140,7 @@ out:
1144 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1140 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1145 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1141 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1146 blocksize, generation); 1142 blocksize, generation);
1143 root->commit_root = btrfs_root_node(root);
1147 BUG_ON(!root->node); 1144 BUG_ON(!root->node);
1148insert: 1145insert:
1149 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { 1146 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -1210,7 +1207,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1210 } 1207 }
1211 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 1208 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
1212 ret = btrfs_find_dead_roots(fs_info->tree_root, 1209 ret = btrfs_find_dead_roots(fs_info->tree_root,
1213 root->root_key.objectid, root); 1210 root->root_key.objectid);
1214 BUG_ON(ret); 1211 BUG_ON(ret);
1215 btrfs_orphan_cleanup(root); 1212 btrfs_orphan_cleanup(root);
1216 } 1213 }
@@ -1569,8 +1566,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1569 atomic_set(&fs_info->async_delalloc_pages, 0); 1566 atomic_set(&fs_info->async_delalloc_pages, 0);
1570 atomic_set(&fs_info->async_submit_draining, 0); 1567 atomic_set(&fs_info->async_submit_draining, 0);
1571 atomic_set(&fs_info->nr_async_bios, 0); 1568 atomic_set(&fs_info->nr_async_bios, 0);
1572 atomic_set(&fs_info->throttles, 0);
1573 atomic_set(&fs_info->throttle_gen, 0);
1574 fs_info->sb = sb; 1569 fs_info->sb = sb;
1575 fs_info->max_extent = (u64)-1; 1570 fs_info->max_extent = (u64)-1;
1576 fs_info->max_inline = 8192 * 1024; 1571 fs_info->max_inline = 8192 * 1024;
@@ -1598,6 +1593,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1598 fs_info->btree_inode->i_mapping->a_ops = &btree_aops; 1593 fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
1599 fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; 1594 fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
1600 1595
1596 RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
1601 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, 1597 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
1602 fs_info->btree_inode->i_mapping, 1598 fs_info->btree_inode->i_mapping,
1603 GFP_NOFS); 1599 GFP_NOFS);
@@ -1613,10 +1609,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1613 fs_info->btree_inode->i_mapping, GFP_NOFS); 1609 fs_info->btree_inode->i_mapping, GFP_NOFS);
1614 fs_info->do_barriers = 1; 1610 fs_info->do_barriers = 1;
1615 1611
1616 INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
1617 btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
1618 btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
1619
1620 BTRFS_I(fs_info->btree_inode)->root = tree_root; 1612 BTRFS_I(fs_info->btree_inode)->root = tree_root;
1621 memset(&BTRFS_I(fs_info->btree_inode)->location, 0, 1613 memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
1622 sizeof(struct btrfs_key)); 1614 sizeof(struct btrfs_key));
@@ -1674,6 +1666,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1674 goto fail_iput; 1666 goto fail_iput;
1675 } 1667 }
1676 1668
1669 features = btrfs_super_incompat_flags(disk_super);
1670 if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
1671 features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
1672 btrfs_set_super_incompat_flags(disk_super, features);
1673 }
1674
1677 features = btrfs_super_compat_ro_flags(disk_super) & 1675 features = btrfs_super_compat_ro_flags(disk_super) &
1678 ~BTRFS_FEATURE_COMPAT_RO_SUPP; 1676 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
1679 if (!(sb->s_flags & MS_RDONLY) && features) { 1677 if (!(sb->s_flags & MS_RDONLY) && features) {
@@ -1771,7 +1769,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1771 if (ret) { 1769 if (ret) {
1772 printk(KERN_WARNING "btrfs: failed to read the system " 1770 printk(KERN_WARNING "btrfs: failed to read the system "
1773 "array on %s\n", sb->s_id); 1771 "array on %s\n", sb->s_id);
1774 goto fail_sys_array; 1772 goto fail_sb_buffer;
1775 } 1773 }
1776 1774
1777 blocksize = btrfs_level_size(tree_root, 1775 blocksize = btrfs_level_size(tree_root,
@@ -1785,6 +1783,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1785 btrfs_super_chunk_root(disk_super), 1783 btrfs_super_chunk_root(disk_super),
1786 blocksize, generation); 1784 blocksize, generation);
1787 BUG_ON(!chunk_root->node); 1785 BUG_ON(!chunk_root->node);
1786 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
1787 chunk_root->commit_root = btrfs_root_node(chunk_root);
1788 1788
1789 read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, 1789 read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
1790 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), 1790 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
@@ -1810,7 +1810,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1810 blocksize, generation); 1810 blocksize, generation);
1811 if (!tree_root->node) 1811 if (!tree_root->node)
1812 goto fail_chunk_root; 1812 goto fail_chunk_root;
1813 1813 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
1814 tree_root->commit_root = btrfs_root_node(tree_root);
1814 1815
1815 ret = find_and_setup_root(tree_root, fs_info, 1816 ret = find_and_setup_root(tree_root, fs_info,
1816 BTRFS_EXTENT_TREE_OBJECTID, extent_root); 1817 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
@@ -1820,14 +1821,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1820 1821
1821 ret = find_and_setup_root(tree_root, fs_info, 1822 ret = find_and_setup_root(tree_root, fs_info,
1822 BTRFS_DEV_TREE_OBJECTID, dev_root); 1823 BTRFS_DEV_TREE_OBJECTID, dev_root);
1823 dev_root->track_dirty = 1;
1824 if (ret) 1824 if (ret)
1825 goto fail_extent_root; 1825 goto fail_extent_root;
1826 dev_root->track_dirty = 1;
1826 1827
1827 ret = find_and_setup_root(tree_root, fs_info, 1828 ret = find_and_setup_root(tree_root, fs_info,
1828 BTRFS_CSUM_TREE_OBJECTID, csum_root); 1829 BTRFS_CSUM_TREE_OBJECTID, csum_root);
1829 if (ret) 1830 if (ret)
1830 goto fail_extent_root; 1831 goto fail_dev_root;
1831 1832
1832 csum_root->track_dirty = 1; 1833 csum_root->track_dirty = 1;
1833 1834
@@ -1849,6 +1850,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1849 if (IS_ERR(fs_info->transaction_kthread)) 1850 if (IS_ERR(fs_info->transaction_kthread))
1850 goto fail_cleaner; 1851 goto fail_cleaner;
1851 1852
1853 if (!btrfs_test_opt(tree_root, SSD) &&
1854 !btrfs_test_opt(tree_root, NOSSD) &&
1855 !fs_info->fs_devices->rotating) {
1856 printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD "
1857 "mode\n");
1858 btrfs_set_opt(fs_info->mount_opt, SSD);
1859 }
1860
1852 if (btrfs_super_log_root(disk_super) != 0) { 1861 if (btrfs_super_log_root(disk_super) != 0) {
1853 u64 bytenr = btrfs_super_log_root(disk_super); 1862 u64 bytenr = btrfs_super_log_root(disk_super);
1854 1863
@@ -1881,7 +1890,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1881 } 1890 }
1882 1891
1883 if (!(sb->s_flags & MS_RDONLY)) { 1892 if (!(sb->s_flags & MS_RDONLY)) {
1884 ret = btrfs_cleanup_reloc_trees(tree_root); 1893 ret = btrfs_recover_relocation(tree_root);
1885 BUG_ON(ret); 1894 BUG_ON(ret);
1886 } 1895 }
1887 1896
@@ -1892,6 +1901,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1892 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); 1901 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
1893 if (!fs_info->fs_root) 1902 if (!fs_info->fs_root)
1894 goto fail_trans_kthread; 1903 goto fail_trans_kthread;
1904
1895 return tree_root; 1905 return tree_root;
1896 1906
1897fail_trans_kthread: 1907fail_trans_kthread:
@@ -1908,14 +1918,19 @@ fail_cleaner:
1908 1918
1909fail_csum_root: 1919fail_csum_root:
1910 free_extent_buffer(csum_root->node); 1920 free_extent_buffer(csum_root->node);
1921 free_extent_buffer(csum_root->commit_root);
1922fail_dev_root:
1923 free_extent_buffer(dev_root->node);
1924 free_extent_buffer(dev_root->commit_root);
1911fail_extent_root: 1925fail_extent_root:
1912 free_extent_buffer(extent_root->node); 1926 free_extent_buffer(extent_root->node);
1927 free_extent_buffer(extent_root->commit_root);
1913fail_tree_root: 1928fail_tree_root:
1914 free_extent_buffer(tree_root->node); 1929 free_extent_buffer(tree_root->node);
1930 free_extent_buffer(tree_root->commit_root);
1915fail_chunk_root: 1931fail_chunk_root:
1916 free_extent_buffer(chunk_root->node); 1932 free_extent_buffer(chunk_root->node);
1917fail_sys_array: 1933 free_extent_buffer(chunk_root->commit_root);
1918 free_extent_buffer(dev_root->node);
1919fail_sb_buffer: 1934fail_sb_buffer:
1920 btrfs_stop_workers(&fs_info->fixup_workers); 1935 btrfs_stop_workers(&fs_info->fixup_workers);
1921 btrfs_stop_workers(&fs_info->delalloc_workers); 1936 btrfs_stop_workers(&fs_info->delalloc_workers);
@@ -2005,6 +2020,17 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
2005 return latest; 2020 return latest;
2006} 2021}
2007 2022
2023/*
2024 * this should be called twice, once with wait == 0 and
2025 * once with wait == 1. When wait == 0 is done, all the buffer heads
2026 * we write are pinned.
2027 *
2028 * They are released when wait == 1 is done.
2029 * max_mirrors must be the same for both runs, and it indicates how
2030 * many supers on this one device should be written.
2031 *
2032 * max_mirrors == 0 means to write them all.
2033 */
2008static int write_dev_supers(struct btrfs_device *device, 2034static int write_dev_supers(struct btrfs_device *device,
2009 struct btrfs_super_block *sb, 2035 struct btrfs_super_block *sb,
2010 int do_barriers, int wait, int max_mirrors) 2036 int do_barriers, int wait, int max_mirrors)
@@ -2040,12 +2066,16 @@ static int write_dev_supers(struct btrfs_device *device,
2040 bh = __find_get_block(device->bdev, bytenr / 4096, 2066 bh = __find_get_block(device->bdev, bytenr / 4096,
2041 BTRFS_SUPER_INFO_SIZE); 2067 BTRFS_SUPER_INFO_SIZE);
2042 BUG_ON(!bh); 2068 BUG_ON(!bh);
2043 brelse(bh);
2044 wait_on_buffer(bh); 2069 wait_on_buffer(bh);
2045 if (buffer_uptodate(bh)) { 2070 if (!buffer_uptodate(bh))
2046 brelse(bh); 2071 errors++;
2047 continue; 2072
2048 } 2073 /* drop our reference */
2074 brelse(bh);
2075
2076 /* drop the reference from the wait == 0 run */
2077 brelse(bh);
2078 continue;
2049 } else { 2079 } else {
2050 btrfs_set_super_bytenr(sb, bytenr); 2080 btrfs_set_super_bytenr(sb, bytenr);
2051 2081
@@ -2056,12 +2086,18 @@ static int write_dev_supers(struct btrfs_device *device,
2056 BTRFS_CSUM_SIZE); 2086 BTRFS_CSUM_SIZE);
2057 btrfs_csum_final(crc, sb->csum); 2087 btrfs_csum_final(crc, sb->csum);
2058 2088
2089 /*
2090 * one reference for us, and we leave it for the
2091 * caller
2092 */
2059 bh = __getblk(device->bdev, bytenr / 4096, 2093 bh = __getblk(device->bdev, bytenr / 4096,
2060 BTRFS_SUPER_INFO_SIZE); 2094 BTRFS_SUPER_INFO_SIZE);
2061 memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); 2095 memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
2062 2096
2063 set_buffer_uptodate(bh); 2097 /* one reference for submit_bh */
2064 get_bh(bh); 2098 get_bh(bh);
2099
2100 set_buffer_uptodate(bh);
2065 lock_buffer(bh); 2101 lock_buffer(bh);
2066 bh->b_end_io = btrfs_end_buffer_write_sync; 2102 bh->b_end_io = btrfs_end_buffer_write_sync;
2067 } 2103 }
@@ -2073,6 +2109,7 @@ static int write_dev_supers(struct btrfs_device *device,
2073 device->name); 2109 device->name);
2074 set_buffer_uptodate(bh); 2110 set_buffer_uptodate(bh);
2075 device->barriers = 0; 2111 device->barriers = 0;
2112 /* one reference for submit_bh */
2076 get_bh(bh); 2113 get_bh(bh);
2077 lock_buffer(bh); 2114 lock_buffer(bh);
2078 ret = submit_bh(WRITE_SYNC, bh); 2115 ret = submit_bh(WRITE_SYNC, bh);
@@ -2081,22 +2118,15 @@ static int write_dev_supers(struct btrfs_device *device,
2081 ret = submit_bh(WRITE_SYNC, bh); 2118 ret = submit_bh(WRITE_SYNC, bh);
2082 } 2119 }
2083 2120
2084 if (!ret && wait) { 2121 if (ret)
2085 wait_on_buffer(bh);
2086 if (!buffer_uptodate(bh))
2087 errors++;
2088 } else if (ret) {
2089 errors++; 2122 errors++;
2090 }
2091 if (wait)
2092 brelse(bh);
2093 } 2123 }
2094 return errors < i ? 0 : -1; 2124 return errors < i ? 0 : -1;
2095} 2125}
2096 2126
2097int write_all_supers(struct btrfs_root *root, int max_mirrors) 2127int write_all_supers(struct btrfs_root *root, int max_mirrors)
2098{ 2128{
2099 struct list_head *head = &root->fs_info->fs_devices->devices; 2129 struct list_head *head;
2100 struct btrfs_device *dev; 2130 struct btrfs_device *dev;
2101 struct btrfs_super_block *sb; 2131 struct btrfs_super_block *sb;
2102 struct btrfs_dev_item *dev_item; 2132 struct btrfs_dev_item *dev_item;
@@ -2111,6 +2141,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2111 2141
2112 sb = &root->fs_info->super_for_commit; 2142 sb = &root->fs_info->super_for_commit;
2113 dev_item = &sb->dev_item; 2143 dev_item = &sb->dev_item;
2144
2145 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2146 head = &root->fs_info->fs_devices->devices;
2114 list_for_each_entry(dev, head, dev_list) { 2147 list_for_each_entry(dev, head, dev_list) {
2115 if (!dev->bdev) { 2148 if (!dev->bdev) {
2116 total_errors++; 2149 total_errors++;
@@ -2154,6 +2187,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2154 if (ret) 2187 if (ret)
2155 total_errors++; 2188 total_errors++;
2156 } 2189 }
2190 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2157 if (total_errors > max_errors) { 2191 if (total_errors > max_errors) {
2158 printk(KERN_ERR "btrfs: %d errors while writing supers\n", 2192 printk(KERN_ERR "btrfs: %d errors while writing supers\n",
2159 total_errors); 2193 total_errors);
@@ -2173,6 +2207,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
2173 2207
2174int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) 2208int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2175{ 2209{
2210 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
2176 radix_tree_delete(&fs_info->fs_roots_radix, 2211 radix_tree_delete(&fs_info->fs_roots_radix,
2177 (unsigned long)root->root_key.objectid); 2212 (unsigned long)root->root_key.objectid);
2178 if (root->anon_super.s_dev) { 2213 if (root->anon_super.s_dev) {
@@ -2219,10 +2254,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
2219 ARRAY_SIZE(gang)); 2254 ARRAY_SIZE(gang));
2220 if (!ret) 2255 if (!ret)
2221 break; 2256 break;
2257
2258 root_objectid = gang[ret - 1]->root_key.objectid + 1;
2222 for (i = 0; i < ret; i++) { 2259 for (i = 0; i < ret; i++) {
2223 root_objectid = gang[i]->root_key.objectid; 2260 root_objectid = gang[i]->root_key.objectid;
2224 ret = btrfs_find_dead_roots(fs_info->tree_root, 2261 ret = btrfs_find_dead_roots(fs_info->tree_root,
2225 root_objectid, gang[i]); 2262 root_objectid);
2226 BUG_ON(ret); 2263 BUG_ON(ret);
2227 btrfs_orphan_cleanup(gang[i]); 2264 btrfs_orphan_cleanup(gang[i]);
2228 } 2265 }
@@ -2278,20 +2315,16 @@ int close_ctree(struct btrfs_root *root)
2278 (unsigned long long)fs_info->total_ref_cache_size); 2315 (unsigned long long)fs_info->total_ref_cache_size);
2279 } 2316 }
2280 2317
2281 if (fs_info->extent_root->node) 2318 free_extent_buffer(fs_info->extent_root->node);
2282 free_extent_buffer(fs_info->extent_root->node); 2319 free_extent_buffer(fs_info->extent_root->commit_root);
2283 2320 free_extent_buffer(fs_info->tree_root->node);
2284 if (fs_info->tree_root->node) 2321 free_extent_buffer(fs_info->tree_root->commit_root);
2285 free_extent_buffer(fs_info->tree_root->node); 2322 free_extent_buffer(root->fs_info->chunk_root->node);
2286 2323 free_extent_buffer(root->fs_info->chunk_root->commit_root);
2287 if (root->fs_info->chunk_root->node) 2324 free_extent_buffer(root->fs_info->dev_root->node);
2288 free_extent_buffer(root->fs_info->chunk_root->node); 2325 free_extent_buffer(root->fs_info->dev_root->commit_root);
2289 2326 free_extent_buffer(root->fs_info->csum_root->node);
2290 if (root->fs_info->dev_root->node) 2327 free_extent_buffer(root->fs_info->csum_root->commit_root);
2291 free_extent_buffer(root->fs_info->dev_root->node);
2292
2293 if (root->fs_info->csum_root->node)
2294 free_extent_buffer(root->fs_info->csum_root->node);
2295 2328
2296 btrfs_free_block_groups(root->fs_info); 2329 btrfs_free_block_groups(root->fs_info);
2297 2330
@@ -2373,17 +2406,14 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2373 * looks as though older kernels can get into trouble with 2406 * looks as though older kernels can get into trouble with
2374 * this code, they end up stuck in balance_dirty_pages forever 2407 * this code, they end up stuck in balance_dirty_pages forever
2375 */ 2408 */
2376 struct extent_io_tree *tree;
2377 u64 num_dirty; 2409 u64 num_dirty;
2378 u64 start = 0;
2379 unsigned long thresh = 32 * 1024 * 1024; 2410 unsigned long thresh = 32 * 1024 * 1024;
2380 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
2381 2411
2382 if (current->flags & PF_MEMALLOC) 2412 if (current->flags & PF_MEMALLOC)
2383 return; 2413 return;
2384 2414
2385 num_dirty = count_range_bits(tree, &start, (u64)-1, 2415 num_dirty = root->fs_info->dirty_metadata_bytes;
2386 thresh, EXTENT_DIRTY); 2416
2387 if (num_dirty > thresh) { 2417 if (num_dirty > thresh) {
2388 balance_dirty_pages_ratelimited_nr( 2418 balance_dirty_pages_ratelimited_nr(
2389 root->fs_info->btree_inode->i_mapping, 1); 2419 root->fs_info->btree_inode->i_mapping, 1);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 85315d2c90de..9596b40caa4e 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -78,7 +78,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
78 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 78 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
79 key.offset = 0; 79 key.offset = 0;
80 80
81 inode = btrfs_iget(sb, &key, root, NULL); 81 inode = btrfs_iget(sb, &key, root);
82 if (IS_ERR(inode)) 82 if (IS_ERR(inode))
83 return (void *)inode; 83 return (void *)inode;
84 84
@@ -192,7 +192,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
192 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 192 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
193 key.offset = 0; 193 key.offset = 0;
194 194
195 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); 195 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
196} 196}
197 197
198const struct export_operations btrfs_export_ops = { 198const struct export_operations btrfs_export_ops = {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 35af93355063..edc7d208c5ce 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,50 +23,39 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include "compat.h" 24#include "compat.h"
25#include "hash.h" 25#include "hash.h"
26#include "crc32c.h"
27#include "ctree.h" 26#include "ctree.h"
28#include "disk-io.h" 27#include "disk-io.h"
29#include "print-tree.h" 28#include "print-tree.h"
30#include "transaction.h" 29#include "transaction.h"
31#include "volumes.h" 30#include "volumes.h"
32#include "locking.h" 31#include "locking.h"
33#include "ref-cache.h"
34#include "free-space-cache.h" 32#include "free-space-cache.h"
35 33
36#define PENDING_EXTENT_INSERT 0
37#define PENDING_EXTENT_DELETE 1
38#define PENDING_BACKREF_UPDATE 2
39
40struct pending_extent_op {
41 int type;
42 u64 bytenr;
43 u64 num_bytes;
44 u64 parent;
45 u64 orig_parent;
46 u64 generation;
47 u64 orig_generation;
48 int level;
49 struct list_head list;
50 int del;
51};
52
53static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root, u64 parent,
55 u64 root_objectid, u64 ref_generation,
56 u64 owner, struct btrfs_key *ins,
57 int ref_mod);
58static int update_reserved_extents(struct btrfs_root *root, 34static int update_reserved_extents(struct btrfs_root *root,
59 u64 bytenr, u64 num, int reserve); 35 u64 bytenr, u64 num, int reserve);
60static int update_block_group(struct btrfs_trans_handle *trans, 36static int update_block_group(struct btrfs_trans_handle *trans,
61 struct btrfs_root *root, 37 struct btrfs_root *root,
62 u64 bytenr, u64 num_bytes, int alloc, 38 u64 bytenr, u64 num_bytes, int alloc,
63 int mark_free); 39 int mark_free);
64static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans, 40static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
65 struct btrfs_root *root, 41 struct btrfs_root *root,
66 u64 bytenr, u64 num_bytes, u64 parent, 42 u64 bytenr, u64 num_bytes, u64 parent,
67 u64 root_objectid, u64 ref_generation, 43 u64 root_objectid, u64 owner_objectid,
68 u64 owner_objectid, int pin, 44 u64 owner_offset, int refs_to_drop,
69 int ref_to_drop); 45 struct btrfs_delayed_extent_op *extra_op);
46static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
47 struct extent_buffer *leaf,
48 struct btrfs_extent_item *ei);
49static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
50 struct btrfs_root *root,
51 u64 parent, u64 root_objectid,
52 u64 flags, u64 owner, u64 offset,
53 struct btrfs_key *ins, int ref_mod);
54static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
55 struct btrfs_root *root,
56 u64 parent, u64 root_objectid,
57 u64 flags, struct btrfs_disk_key *key,
58 int level, struct btrfs_key *ins);
70 59
71static int do_chunk_alloc(struct btrfs_trans_handle *trans, 60static int do_chunk_alloc(struct btrfs_trans_handle *trans,
72 struct btrfs_root *extent_root, u64 alloc_bytes, 61 struct btrfs_root *extent_root, u64 alloc_bytes,
@@ -453,199 +442,969 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
453 * maintenance. This is actually the same as #2, but with a slightly 442 * maintenance. This is actually the same as #2, but with a slightly
454 * different use case. 443 * different use case.
455 * 444 *
445 * There are two kinds of back refs. The implicit back refs is optimized
446 * for pointers in non-shared tree blocks. For a given pointer in a block,
447 * back refs of this kind provide information about the block's owner tree
448 * and the pointer's key. These information allow us to find the block by
449 * b-tree searching. The full back refs is for pointers in tree blocks not
450 * referenced by their owner trees. The location of tree block is recorded
451 * in the back refs. Actually the full back refs is generic, and can be
452 * used in all cases the implicit back refs is used. The major shortcoming
453 * of the full back refs is its overhead. Every time a tree block gets
454 * COWed, we have to update back refs entry for all pointers in it.
455 *
456 * For a newly allocated tree block, we use implicit back refs for
457 * pointers in it. This means most tree related operations only involve
458 * implicit back refs. For a tree block created in old transaction, the
459 * only way to drop a reference to it is COW it. So we can detect the
460 * event that tree block loses its owner tree's reference and do the
461 * back refs conversion.
462 *
463 * When a tree block is COW'd through a tree, there are four cases:
464 *
465 * The reference count of the block is one and the tree is the block's
466 * owner tree. Nothing to do in this case.
467 *
468 * The reference count of the block is one and the tree is not the
469 * block's owner tree. In this case, full back refs is used for pointers
470 * in the block. Remove these full back refs, add implicit back refs for
471 * every pointers in the new block.
472 *
473 * The reference count of the block is greater than one and the tree is
474 * the block's owner tree. In this case, implicit back refs is used for
475 * pointers in the block. Add full back refs for every pointers in the
476 * block, increase lower level extents' reference counts. The original
477 * implicit back refs are entailed to the new block.
478 *
479 * The reference count of the block is greater than one and the tree is
480 * not the block's owner tree. Add implicit back refs for every pointer in
481 * the new block, increase lower level extents' reference count.
482 *
483 * Back Reference Key composing:
484 *
485 * The key objectid corresponds to the first byte in the extent,
486 * The key type is used to differentiate between types of back refs.
487 * There are different meanings of the key offset for different types
488 * of back refs.
489 *
456 * File extents can be referenced by: 490 * File extents can be referenced by:
457 * 491 *
458 * - multiple snapshots, subvolumes, or different generations in one subvol 492 * - multiple snapshots, subvolumes, or different generations in one subvol
459 * - different files inside a single subvolume 493 * - different files inside a single subvolume
460 * - different offsets inside a file (bookend extents in file.c) 494 * - different offsets inside a file (bookend extents in file.c)
461 * 495 *
462 * The extent ref structure has fields for: 496 * The extent ref structure for the implicit back refs has fields for:
463 * 497 *
464 * - Objectid of the subvolume root 498 * - Objectid of the subvolume root
465 * - Generation number of the tree holding the reference
466 * - objectid of the file holding the reference 499 * - objectid of the file holding the reference
467 * - number of references holding by parent node (alway 1 for tree blocks) 500 * - original offset in the file
468 * 501 * - how many bookend extents
469 * Btree leaf may hold multiple references to a file extent. In most cases,
470 * these references are from same file and the corresponding offsets inside
471 * the file are close together.
472 *
473 * When a file extent is allocated the fields are filled in:
474 * (root_key.objectid, trans->transid, inode objectid, 1)
475 * 502 *
476 * When a leaf is cow'd new references are added for every file extent found 503 * The key offset for the implicit back refs is hash of the first
477 * in the leaf. It looks similar to the create case, but trans->transid will 504 * three fields.
478 * be different when the block is cow'd.
479 * 505 *
480 * (root_key.objectid, trans->transid, inode objectid, 506 * The extent ref structure for the full back refs has field for:
481 * number of references in the leaf)
482 * 507 *
483 * When a file extent is removed either during snapshot deletion or 508 * - number of pointers in the tree leaf
484 * file truncation, we find the corresponding back reference and check
485 * the following fields:
486 * 509 *
487 * (btrfs_header_owner(leaf), btrfs_header_generation(leaf), 510 * The key offset for the implicit back refs is the first byte of
488 * inode objectid) 511 * the tree leaf
489 * 512 *
490 * Btree extents can be referenced by: 513 * When a file extent is allocated, The implicit back refs is used.
491 * 514 * the fields are filled in:
492 * - Different subvolumes
493 * - Different generations of the same subvolume
494 *
495 * When a tree block is created, back references are inserted:
496 * 515 *
497 * (root->root_key.objectid, trans->transid, level, 1) 516 * (root_key.objectid, inode objectid, offset in file, 1)
498 * 517 *
499 * When a tree block is cow'd, new back references are added for all the 518 * When a file extent is removed file truncation, we find the
500 * blocks it points to. If the tree block isn't in reference counted root, 519 * corresponding implicit back refs and check the following fields:
501 * the old back references are removed. These new back references are of
502 * the form (trans->transid will have increased since creation):
503 * 520 *
504 * (root->root_key.objectid, trans->transid, level, 1) 521 * (btrfs_header_owner(leaf), inode objectid, offset in file)
505 * 522 *
506 * When a backref is in deleting, the following fields are checked: 523 * Btree extents can be referenced by:
507 * 524 *
508 * if backref was for a tree root: 525 * - Different subvolumes
509 * (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
510 * else
511 * (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
512 * 526 *
513 * Back Reference Key composing: 527 * Both the implicit back refs and the full back refs for tree blocks
528 * only consist of key. The key offset for the implicit back refs is
529 * objectid of block's owner tree. The key offset for the full back refs
530 * is the first byte of parent block.
514 * 531 *
515 * The key objectid corresponds to the first byte in the extent, the key 532 * When implicit back refs is used, information about the lowest key and
516 * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first 533 * level of the tree block are required. These information are stored in
517 * byte of parent extent. If a extent is tree root, the key offset is set 534 * tree block info structure.
518 * to the key objectid.
519 */ 535 */
520 536
521static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans, 537#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
522 struct btrfs_root *root, 538static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
523 struct btrfs_path *path, 539 struct btrfs_root *root,
524 u64 bytenr, u64 parent, 540 struct btrfs_path *path,
525 u64 ref_root, u64 ref_generation, 541 u64 owner, u32 extra_size)
526 u64 owner_objectid, int del)
527{ 542{
543 struct btrfs_extent_item *item;
544 struct btrfs_extent_item_v0 *ei0;
545 struct btrfs_extent_ref_v0 *ref0;
546 struct btrfs_tree_block_info *bi;
547 struct extent_buffer *leaf;
528 struct btrfs_key key; 548 struct btrfs_key key;
529 struct btrfs_extent_ref *ref; 549 struct btrfs_key found_key;
550 u32 new_size = sizeof(*item);
551 u64 refs;
552 int ret;
553
554 leaf = path->nodes[0];
555 BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
556
557 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
558 ei0 = btrfs_item_ptr(leaf, path->slots[0],
559 struct btrfs_extent_item_v0);
560 refs = btrfs_extent_refs_v0(leaf, ei0);
561
562 if (owner == (u64)-1) {
563 while (1) {
564 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
565 ret = btrfs_next_leaf(root, path);
566 if (ret < 0)
567 return ret;
568 BUG_ON(ret > 0);
569 leaf = path->nodes[0];
570 }
571 btrfs_item_key_to_cpu(leaf, &found_key,
572 path->slots[0]);
573 BUG_ON(key.objectid != found_key.objectid);
574 if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
575 path->slots[0]++;
576 continue;
577 }
578 ref0 = btrfs_item_ptr(leaf, path->slots[0],
579 struct btrfs_extent_ref_v0);
580 owner = btrfs_ref_objectid_v0(leaf, ref0);
581 break;
582 }
583 }
584 btrfs_release_path(root, path);
585
586 if (owner < BTRFS_FIRST_FREE_OBJECTID)
587 new_size += sizeof(*bi);
588
589 new_size -= sizeof(*ei0);
590 ret = btrfs_search_slot(trans, root, &key, path,
591 new_size + extra_size, 1);
592 if (ret < 0)
593 return ret;
594 BUG_ON(ret);
595
596 ret = btrfs_extend_item(trans, root, path, new_size);
597 BUG_ON(ret);
598
599 leaf = path->nodes[0];
600 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
601 btrfs_set_extent_refs(leaf, item, refs);
602 /* FIXME: get real generation */
603 btrfs_set_extent_generation(leaf, item, 0);
604 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
605 btrfs_set_extent_flags(leaf, item,
606 BTRFS_EXTENT_FLAG_TREE_BLOCK |
607 BTRFS_BLOCK_FLAG_FULL_BACKREF);
608 bi = (struct btrfs_tree_block_info *)(item + 1);
609 /* FIXME: get first key of the block */
610 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
611 btrfs_set_tree_block_level(leaf, bi, (int)owner);
612 } else {
613 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
614 }
615 btrfs_mark_buffer_dirty(leaf);
616 return 0;
617}
618#endif
619
620static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
621{
622 u32 high_crc = ~(u32)0;
623 u32 low_crc = ~(u32)0;
624 __le64 lenum;
625
626 lenum = cpu_to_le64(root_objectid);
627 high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
628 lenum = cpu_to_le64(owner);
629 low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
630 lenum = cpu_to_le64(offset);
631 low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
632
633 return ((u64)high_crc << 31) ^ (u64)low_crc;
634}
635
636static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
637 struct btrfs_extent_data_ref *ref)
638{
639 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
640 btrfs_extent_data_ref_objectid(leaf, ref),
641 btrfs_extent_data_ref_offset(leaf, ref));
642}
643
644static int match_extent_data_ref(struct extent_buffer *leaf,
645 struct btrfs_extent_data_ref *ref,
646 u64 root_objectid, u64 owner, u64 offset)
647{
648 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
649 btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
650 btrfs_extent_data_ref_offset(leaf, ref) != offset)
651 return 0;
652 return 1;
653}
654
655static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
656 struct btrfs_root *root,
657 struct btrfs_path *path,
658 u64 bytenr, u64 parent,
659 u64 root_objectid,
660 u64 owner, u64 offset)
661{
662 struct btrfs_key key;
663 struct btrfs_extent_data_ref *ref;
530 struct extent_buffer *leaf; 664 struct extent_buffer *leaf;
531 u64 ref_objectid; 665 u32 nritems;
532 int ret; 666 int ret;
667 int recow;
668 int err = -ENOENT;
533 669
534 key.objectid = bytenr; 670 key.objectid = bytenr;
535 key.type = BTRFS_EXTENT_REF_KEY; 671 if (parent) {
536 key.offset = parent; 672 key.type = BTRFS_SHARED_DATA_REF_KEY;
673 key.offset = parent;
674 } else {
675 key.type = BTRFS_EXTENT_DATA_REF_KEY;
676 key.offset = hash_extent_data_ref(root_objectid,
677 owner, offset);
678 }
679again:
680 recow = 0;
681 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
682 if (ret < 0) {
683 err = ret;
684 goto fail;
685 }
537 686
538 ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1); 687 if (parent) {
539 if (ret < 0) 688 if (!ret)
540 goto out; 689 return 0;
541 if (ret > 0) { 690#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
542 ret = -ENOENT; 691 key.type = BTRFS_EXTENT_REF_V0_KEY;
543 goto out; 692 btrfs_release_path(root, path);
693 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
694 if (ret < 0) {
695 err = ret;
696 goto fail;
697 }
698 if (!ret)
699 return 0;
700#endif
701 goto fail;
544 } 702 }
545 703
546 leaf = path->nodes[0]; 704 leaf = path->nodes[0];
547 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); 705 nritems = btrfs_header_nritems(leaf);
548 ref_objectid = btrfs_ref_objectid(leaf, ref); 706 while (1) {
549 if (btrfs_ref_root(leaf, ref) != ref_root || 707 if (path->slots[0] >= nritems) {
550 btrfs_ref_generation(leaf, ref) != ref_generation || 708 ret = btrfs_next_leaf(root, path);
551 (ref_objectid != owner_objectid && 709 if (ret < 0)
552 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { 710 err = ret;
553 ret = -EIO; 711 if (ret)
554 WARN_ON(1); 712 goto fail;
555 goto out; 713
714 leaf = path->nodes[0];
715 nritems = btrfs_header_nritems(leaf);
716 recow = 1;
717 }
718
719 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
720 if (key.objectid != bytenr ||
721 key.type != BTRFS_EXTENT_DATA_REF_KEY)
722 goto fail;
723
724 ref = btrfs_item_ptr(leaf, path->slots[0],
725 struct btrfs_extent_data_ref);
726
727 if (match_extent_data_ref(leaf, ref, root_objectid,
728 owner, offset)) {
729 if (recow) {
730 btrfs_release_path(root, path);
731 goto again;
732 }
733 err = 0;
734 break;
735 }
736 path->slots[0]++;
556 } 737 }
557 ret = 0; 738fail:
558out: 739 return err;
559 return ret;
560} 740}
561 741
562static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, 742static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
563 struct btrfs_root *root, 743 struct btrfs_root *root,
564 struct btrfs_path *path, 744 struct btrfs_path *path,
565 u64 bytenr, u64 parent, 745 u64 bytenr, u64 parent,
566 u64 ref_root, u64 ref_generation, 746 u64 root_objectid, u64 owner,
567 u64 owner_objectid, 747 u64 offset, int refs_to_add)
568 int refs_to_add)
569{ 748{
570 struct btrfs_key key; 749 struct btrfs_key key;
571 struct extent_buffer *leaf; 750 struct extent_buffer *leaf;
572 struct btrfs_extent_ref *ref; 751 u32 size;
573 u32 num_refs; 752 u32 num_refs;
574 int ret; 753 int ret;
575 754
576 key.objectid = bytenr; 755 key.objectid = bytenr;
577 key.type = BTRFS_EXTENT_REF_KEY; 756 if (parent) {
578 key.offset = parent; 757 key.type = BTRFS_SHARED_DATA_REF_KEY;
758 key.offset = parent;
759 size = sizeof(struct btrfs_shared_data_ref);
760 } else {
761 key.type = BTRFS_EXTENT_DATA_REF_KEY;
762 key.offset = hash_extent_data_ref(root_objectid,
763 owner, offset);
764 size = sizeof(struct btrfs_extent_data_ref);
765 }
579 766
580 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref)); 767 ret = btrfs_insert_empty_item(trans, root, path, &key, size);
581 if (ret == 0) { 768 if (ret && ret != -EEXIST)
582 leaf = path->nodes[0]; 769 goto fail;
583 ref = btrfs_item_ptr(leaf, path->slots[0], 770
584 struct btrfs_extent_ref); 771 leaf = path->nodes[0];
585 btrfs_set_ref_root(leaf, ref, ref_root); 772 if (parent) {
586 btrfs_set_ref_generation(leaf, ref, ref_generation); 773 struct btrfs_shared_data_ref *ref;
587 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
588 btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
589 } else if (ret == -EEXIST) {
590 u64 existing_owner;
591
592 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
593 leaf = path->nodes[0];
594 ref = btrfs_item_ptr(leaf, path->slots[0], 774 ref = btrfs_item_ptr(leaf, path->slots[0],
595 struct btrfs_extent_ref); 775 struct btrfs_shared_data_ref);
596 if (btrfs_ref_root(leaf, ref) != ref_root || 776 if (ret == 0) {
597 btrfs_ref_generation(leaf, ref) != ref_generation) { 777 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
598 ret = -EIO; 778 } else {
599 WARN_ON(1); 779 num_refs = btrfs_shared_data_ref_count(leaf, ref);
600 goto out; 780 num_refs += refs_to_add;
781 btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
601 } 782 }
783 } else {
784 struct btrfs_extent_data_ref *ref;
785 while (ret == -EEXIST) {
786 ref = btrfs_item_ptr(leaf, path->slots[0],
787 struct btrfs_extent_data_ref);
788 if (match_extent_data_ref(leaf, ref, root_objectid,
789 owner, offset))
790 break;
791 btrfs_release_path(root, path);
792 key.offset++;
793 ret = btrfs_insert_empty_item(trans, root, path, &key,
794 size);
795 if (ret && ret != -EEXIST)
796 goto fail;
602 797
603 num_refs = btrfs_ref_num_refs(leaf, ref); 798 leaf = path->nodes[0];
604 BUG_ON(num_refs == 0); 799 }
605 btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add); 800 ref = btrfs_item_ptr(leaf, path->slots[0],
606 801 struct btrfs_extent_data_ref);
607 existing_owner = btrfs_ref_objectid(leaf, ref); 802 if (ret == 0) {
608 if (existing_owner != owner_objectid && 803 btrfs_set_extent_data_ref_root(leaf, ref,
609 existing_owner != BTRFS_MULTIPLE_OBJECTIDS) { 804 root_objectid);
610 btrfs_set_ref_objectid(leaf, ref, 805 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
611 BTRFS_MULTIPLE_OBJECTIDS); 806 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
807 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
808 } else {
809 num_refs = btrfs_extent_data_ref_count(leaf, ref);
810 num_refs += refs_to_add;
811 btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
612 } 812 }
613 ret = 0;
614 } else {
615 goto out;
616 } 813 }
617 btrfs_unlock_up_safe(path, 1); 814 btrfs_mark_buffer_dirty(leaf);
618 btrfs_mark_buffer_dirty(path->nodes[0]); 815 ret = 0;
619out: 816fail:
620 btrfs_release_path(root, path); 817 btrfs_release_path(root, path);
621 return ret; 818 return ret;
622} 819}
623 820
624static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, 821static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
625 struct btrfs_root *root, 822 struct btrfs_root *root,
626 struct btrfs_path *path, 823 struct btrfs_path *path,
627 int refs_to_drop) 824 int refs_to_drop)
628{ 825{
826 struct btrfs_key key;
827 struct btrfs_extent_data_ref *ref1 = NULL;
828 struct btrfs_shared_data_ref *ref2 = NULL;
629 struct extent_buffer *leaf; 829 struct extent_buffer *leaf;
630 struct btrfs_extent_ref *ref; 830 u32 num_refs = 0;
631 u32 num_refs;
632 int ret = 0; 831 int ret = 0;
633 832
634 leaf = path->nodes[0]; 833 leaf = path->nodes[0];
635 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); 834 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
636 num_refs = btrfs_ref_num_refs(leaf, ref); 835
836 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
837 ref1 = btrfs_item_ptr(leaf, path->slots[0],
838 struct btrfs_extent_data_ref);
839 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
840 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
841 ref2 = btrfs_item_ptr(leaf, path->slots[0],
842 struct btrfs_shared_data_ref);
843 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
844#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
845 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
846 struct btrfs_extent_ref_v0 *ref0;
847 ref0 = btrfs_item_ptr(leaf, path->slots[0],
848 struct btrfs_extent_ref_v0);
849 num_refs = btrfs_ref_count_v0(leaf, ref0);
850#endif
851 } else {
852 BUG();
853 }
854
637 BUG_ON(num_refs < refs_to_drop); 855 BUG_ON(num_refs < refs_to_drop);
638 num_refs -= refs_to_drop; 856 num_refs -= refs_to_drop;
857
639 if (num_refs == 0) { 858 if (num_refs == 0) {
640 ret = btrfs_del_item(trans, root, path); 859 ret = btrfs_del_item(trans, root, path);
641 } else { 860 } else {
642 btrfs_set_ref_num_refs(leaf, ref, num_refs); 861 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
862 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
863 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
864 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
865#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
866 else {
867 struct btrfs_extent_ref_v0 *ref0;
868 ref0 = btrfs_item_ptr(leaf, path->slots[0],
869 struct btrfs_extent_ref_v0);
870 btrfs_set_ref_count_v0(leaf, ref0, num_refs);
871 }
872#endif
643 btrfs_mark_buffer_dirty(leaf); 873 btrfs_mark_buffer_dirty(leaf);
644 } 874 }
875 return ret;
876}
877
878static noinline u32 extent_data_ref_count(struct btrfs_root *root,
879 struct btrfs_path *path,
880 struct btrfs_extent_inline_ref *iref)
881{
882 struct btrfs_key key;
883 struct extent_buffer *leaf;
884 struct btrfs_extent_data_ref *ref1;
885 struct btrfs_shared_data_ref *ref2;
886 u32 num_refs = 0;
887
888 leaf = path->nodes[0];
889 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
890 if (iref) {
891 if (btrfs_extent_inline_ref_type(leaf, iref) ==
892 BTRFS_EXTENT_DATA_REF_KEY) {
893 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
894 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
895 } else {
896 ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
897 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
898 }
899 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
900 ref1 = btrfs_item_ptr(leaf, path->slots[0],
901 struct btrfs_extent_data_ref);
902 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
903 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
904 ref2 = btrfs_item_ptr(leaf, path->slots[0],
905 struct btrfs_shared_data_ref);
906 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
907#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
908 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
909 struct btrfs_extent_ref_v0 *ref0;
910 ref0 = btrfs_item_ptr(leaf, path->slots[0],
911 struct btrfs_extent_ref_v0);
912 num_refs = btrfs_ref_count_v0(leaf, ref0);
913#endif
914 } else {
915 WARN_ON(1);
916 }
917 return num_refs;
918}
919
920static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
921 struct btrfs_root *root,
922 struct btrfs_path *path,
923 u64 bytenr, u64 parent,
924 u64 root_objectid)
925{
926 struct btrfs_key key;
927 int ret;
928
929 key.objectid = bytenr;
930 if (parent) {
931 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
932 key.offset = parent;
933 } else {
934 key.type = BTRFS_TREE_BLOCK_REF_KEY;
935 key.offset = root_objectid;
936 }
937
938 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
939 if (ret > 0)
940 ret = -ENOENT;
941#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
942 if (ret == -ENOENT && parent) {
943 btrfs_release_path(root, path);
944 key.type = BTRFS_EXTENT_REF_V0_KEY;
945 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
946 if (ret > 0)
947 ret = -ENOENT;
948 }
949#endif
950 return ret;
951}
952
953static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
954 struct btrfs_root *root,
955 struct btrfs_path *path,
956 u64 bytenr, u64 parent,
957 u64 root_objectid)
958{
959 struct btrfs_key key;
960 int ret;
961
962 key.objectid = bytenr;
963 if (parent) {
964 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
965 key.offset = parent;
966 } else {
967 key.type = BTRFS_TREE_BLOCK_REF_KEY;
968 key.offset = root_objectid;
969 }
970
971 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
645 btrfs_release_path(root, path); 972 btrfs_release_path(root, path);
646 return ret; 973 return ret;
647} 974}
648 975
976static inline int extent_ref_type(u64 parent, u64 owner)
977{
978 int type;
979 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
980 if (parent > 0)
981 type = BTRFS_SHARED_BLOCK_REF_KEY;
982 else
983 type = BTRFS_TREE_BLOCK_REF_KEY;
984 } else {
985 if (parent > 0)
986 type = BTRFS_SHARED_DATA_REF_KEY;
987 else
988 type = BTRFS_EXTENT_DATA_REF_KEY;
989 }
990 return type;
991}
992
993static int find_next_key(struct btrfs_path *path, struct btrfs_key *key)
994
995{
996 int level;
997 BUG_ON(!path->keep_locks);
998 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
999 if (!path->nodes[level])
1000 break;
1001 btrfs_assert_tree_locked(path->nodes[level]);
1002 if (path->slots[level] + 1 >=
1003 btrfs_header_nritems(path->nodes[level]))
1004 continue;
1005 if (level == 0)
1006 btrfs_item_key_to_cpu(path->nodes[level], key,
1007 path->slots[level] + 1);
1008 else
1009 btrfs_node_key_to_cpu(path->nodes[level], key,
1010 path->slots[level] + 1);
1011 return 0;
1012 }
1013 return 1;
1014}
1015
1016/*
1017 * look for inline back ref. if back ref is found, *ref_ret is set
1018 * to the address of inline back ref, and 0 is returned.
1019 *
1020 * if back ref isn't found, *ref_ret is set to the address where it
1021 * should be inserted, and -ENOENT is returned.
1022 *
1023 * if insert is true and there are too many inline back refs, the path
1024 * points to the extent item, and -EAGAIN is returned.
1025 *
1026 * NOTE: inline back refs are ordered in the same way that back ref
1027 * items in the tree are ordered.
1028 */
1029static noinline_for_stack
1030int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1031 struct btrfs_root *root,
1032 struct btrfs_path *path,
1033 struct btrfs_extent_inline_ref **ref_ret,
1034 u64 bytenr, u64 num_bytes,
1035 u64 parent, u64 root_objectid,
1036 u64 owner, u64 offset, int insert)
1037{
1038 struct btrfs_key key;
1039 struct extent_buffer *leaf;
1040 struct btrfs_extent_item *ei;
1041 struct btrfs_extent_inline_ref *iref;
1042 u64 flags;
1043 u64 item_size;
1044 unsigned long ptr;
1045 unsigned long end;
1046 int extra_size;
1047 int type;
1048 int want;
1049 int ret;
1050 int err = 0;
1051
1052 key.objectid = bytenr;
1053 key.type = BTRFS_EXTENT_ITEM_KEY;
1054 key.offset = num_bytes;
1055
1056 want = extent_ref_type(parent, owner);
1057 if (insert) {
1058 extra_size = btrfs_extent_inline_ref_size(want);
1059 path->keep_locks = 1;
1060 } else
1061 extra_size = -1;
1062 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1063 if (ret < 0) {
1064 err = ret;
1065 goto out;
1066 }
1067 BUG_ON(ret);
1068
1069 leaf = path->nodes[0];
1070 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1071#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1072 if (item_size < sizeof(*ei)) {
1073 if (!insert) {
1074 err = -ENOENT;
1075 goto out;
1076 }
1077 ret = convert_extent_item_v0(trans, root, path, owner,
1078 extra_size);
1079 if (ret < 0) {
1080 err = ret;
1081 goto out;
1082 }
1083 leaf = path->nodes[0];
1084 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1085 }
1086#endif
1087 BUG_ON(item_size < sizeof(*ei));
1088
1089 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1090 flags = btrfs_extent_flags(leaf, ei);
1091
1092 ptr = (unsigned long)(ei + 1);
1093 end = (unsigned long)ei + item_size;
1094
1095 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1096 ptr += sizeof(struct btrfs_tree_block_info);
1097 BUG_ON(ptr > end);
1098 } else {
1099 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1100 }
1101
1102 err = -ENOENT;
1103 while (1) {
1104 if (ptr >= end) {
1105 WARN_ON(ptr > end);
1106 break;
1107 }
1108 iref = (struct btrfs_extent_inline_ref *)ptr;
1109 type = btrfs_extent_inline_ref_type(leaf, iref);
1110 if (want < type)
1111 break;
1112 if (want > type) {
1113 ptr += btrfs_extent_inline_ref_size(type);
1114 continue;
1115 }
1116
1117 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1118 struct btrfs_extent_data_ref *dref;
1119 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1120 if (match_extent_data_ref(leaf, dref, root_objectid,
1121 owner, offset)) {
1122 err = 0;
1123 break;
1124 }
1125 if (hash_extent_data_ref_item(leaf, dref) <
1126 hash_extent_data_ref(root_objectid, owner, offset))
1127 break;
1128 } else {
1129 u64 ref_offset;
1130 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1131 if (parent > 0) {
1132 if (parent == ref_offset) {
1133 err = 0;
1134 break;
1135 }
1136 if (ref_offset < parent)
1137 break;
1138 } else {
1139 if (root_objectid == ref_offset) {
1140 err = 0;
1141 break;
1142 }
1143 if (ref_offset < root_objectid)
1144 break;
1145 }
1146 }
1147 ptr += btrfs_extent_inline_ref_size(type);
1148 }
1149 if (err == -ENOENT && insert) {
1150 if (item_size + extra_size >=
1151 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1152 err = -EAGAIN;
1153 goto out;
1154 }
1155 /*
1156 * To add new inline back ref, we have to make sure
1157 * there is no corresponding back ref item.
1158 * For simplicity, we just do not add new inline back
1159 * ref if there is any kind of item for this block
1160 */
1161 if (find_next_key(path, &key) == 0 && key.objectid == bytenr &&
1162 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1163 err = -EAGAIN;
1164 goto out;
1165 }
1166 }
1167 *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1168out:
1169 if (insert) {
1170 path->keep_locks = 0;
1171 btrfs_unlock_up_safe(path, 1);
1172 }
1173 return err;
1174}
1175
1176/*
1177 * helper to add new inline back ref
1178 */
1179static noinline_for_stack
1180int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1181 struct btrfs_root *root,
1182 struct btrfs_path *path,
1183 struct btrfs_extent_inline_ref *iref,
1184 u64 parent, u64 root_objectid,
1185 u64 owner, u64 offset, int refs_to_add,
1186 struct btrfs_delayed_extent_op *extent_op)
1187{
1188 struct extent_buffer *leaf;
1189 struct btrfs_extent_item *ei;
1190 unsigned long ptr;
1191 unsigned long end;
1192 unsigned long item_offset;
1193 u64 refs;
1194 int size;
1195 int type;
1196 int ret;
1197
1198 leaf = path->nodes[0];
1199 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1200 item_offset = (unsigned long)iref - (unsigned long)ei;
1201
1202 type = extent_ref_type(parent, owner);
1203 size = btrfs_extent_inline_ref_size(type);
1204
1205 ret = btrfs_extend_item(trans, root, path, size);
1206 BUG_ON(ret);
1207
1208 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1209 refs = btrfs_extent_refs(leaf, ei);
1210 refs += refs_to_add;
1211 btrfs_set_extent_refs(leaf, ei, refs);
1212 if (extent_op)
1213 __run_delayed_extent_op(extent_op, leaf, ei);
1214
1215 ptr = (unsigned long)ei + item_offset;
1216 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1217 if (ptr < end - size)
1218 memmove_extent_buffer(leaf, ptr + size, ptr,
1219 end - size - ptr);
1220
1221 iref = (struct btrfs_extent_inline_ref *)ptr;
1222 btrfs_set_extent_inline_ref_type(leaf, iref, type);
1223 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1224 struct btrfs_extent_data_ref *dref;
1225 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1226 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1227 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1228 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1229 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1230 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1231 struct btrfs_shared_data_ref *sref;
1232 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1233 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1234 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1235 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1236 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1237 } else {
1238 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1239 }
1240 btrfs_mark_buffer_dirty(leaf);
1241 return 0;
1242}
1243
1244static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1245 struct btrfs_root *root,
1246 struct btrfs_path *path,
1247 struct btrfs_extent_inline_ref **ref_ret,
1248 u64 bytenr, u64 num_bytes, u64 parent,
1249 u64 root_objectid, u64 owner, u64 offset)
1250{
1251 int ret;
1252
1253 ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1254 bytenr, num_bytes, parent,
1255 root_objectid, owner, offset, 0);
1256 if (ret != -ENOENT)
1257 return ret;
1258
1259 btrfs_release_path(root, path);
1260 *ref_ret = NULL;
1261
1262 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1263 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1264 root_objectid);
1265 } else {
1266 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1267 root_objectid, owner, offset);
1268 }
1269 return ret;
1270}
1271
1272/*
1273 * helper to update/remove inline back ref
1274 */
1275static noinline_for_stack
1276int update_inline_extent_backref(struct btrfs_trans_handle *trans,
1277 struct btrfs_root *root,
1278 struct btrfs_path *path,
1279 struct btrfs_extent_inline_ref *iref,
1280 int refs_to_mod,
1281 struct btrfs_delayed_extent_op *extent_op)
1282{
1283 struct extent_buffer *leaf;
1284 struct btrfs_extent_item *ei;
1285 struct btrfs_extent_data_ref *dref = NULL;
1286 struct btrfs_shared_data_ref *sref = NULL;
1287 unsigned long ptr;
1288 unsigned long end;
1289 u32 item_size;
1290 int size;
1291 int type;
1292 int ret;
1293 u64 refs;
1294
1295 leaf = path->nodes[0];
1296 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1297 refs = btrfs_extent_refs(leaf, ei);
1298 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1299 refs += refs_to_mod;
1300 btrfs_set_extent_refs(leaf, ei, refs);
1301 if (extent_op)
1302 __run_delayed_extent_op(extent_op, leaf, ei);
1303
1304 type = btrfs_extent_inline_ref_type(leaf, iref);
1305
1306 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1307 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1308 refs = btrfs_extent_data_ref_count(leaf, dref);
1309 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1310 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1311 refs = btrfs_shared_data_ref_count(leaf, sref);
1312 } else {
1313 refs = 1;
1314 BUG_ON(refs_to_mod != -1);
1315 }
1316
1317 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1318 refs += refs_to_mod;
1319
1320 if (refs > 0) {
1321 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1322 btrfs_set_extent_data_ref_count(leaf, dref, refs);
1323 else
1324 btrfs_set_shared_data_ref_count(leaf, sref, refs);
1325 } else {
1326 size = btrfs_extent_inline_ref_size(type);
1327 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1328 ptr = (unsigned long)iref;
1329 end = (unsigned long)ei + item_size;
1330 if (ptr + size < end)
1331 memmove_extent_buffer(leaf, ptr, ptr + size,
1332 end - ptr - size);
1333 item_size -= size;
1334 ret = btrfs_truncate_item(trans, root, path, item_size, 1);
1335 BUG_ON(ret);
1336 }
1337 btrfs_mark_buffer_dirty(leaf);
1338 return 0;
1339}
1340
1341static noinline_for_stack
1342int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1343 struct btrfs_root *root,
1344 struct btrfs_path *path,
1345 u64 bytenr, u64 num_bytes, u64 parent,
1346 u64 root_objectid, u64 owner,
1347 u64 offset, int refs_to_add,
1348 struct btrfs_delayed_extent_op *extent_op)
1349{
1350 struct btrfs_extent_inline_ref *iref;
1351 int ret;
1352
1353 ret = lookup_inline_extent_backref(trans, root, path, &iref,
1354 bytenr, num_bytes, parent,
1355 root_objectid, owner, offset, 1);
1356 if (ret == 0) {
1357 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1358 ret = update_inline_extent_backref(trans, root, path, iref,
1359 refs_to_add, extent_op);
1360 } else if (ret == -ENOENT) {
1361 ret = setup_inline_extent_backref(trans, root, path, iref,
1362 parent, root_objectid,
1363 owner, offset, refs_to_add,
1364 extent_op);
1365 }
1366 return ret;
1367}
1368
1369static int insert_extent_backref(struct btrfs_trans_handle *trans,
1370 struct btrfs_root *root,
1371 struct btrfs_path *path,
1372 u64 bytenr, u64 parent, u64 root_objectid,
1373 u64 owner, u64 offset, int refs_to_add)
1374{
1375 int ret;
1376 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1377 BUG_ON(refs_to_add != 1);
1378 ret = insert_tree_block_ref(trans, root, path, bytenr,
1379 parent, root_objectid);
1380 } else {
1381 ret = insert_extent_data_ref(trans, root, path, bytenr,
1382 parent, root_objectid,
1383 owner, offset, refs_to_add);
1384 }
1385 return ret;
1386}
1387
1388static int remove_extent_backref(struct btrfs_trans_handle *trans,
1389 struct btrfs_root *root,
1390 struct btrfs_path *path,
1391 struct btrfs_extent_inline_ref *iref,
1392 int refs_to_drop, int is_data)
1393{
1394 int ret;
1395
1396 BUG_ON(!is_data && refs_to_drop != 1);
1397 if (iref) {
1398 ret = update_inline_extent_backref(trans, root, path, iref,
1399 -refs_to_drop, NULL);
1400 } else if (is_data) {
1401 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1402 } else {
1403 ret = btrfs_del_item(trans, root, path);
1404 }
1405 return ret;
1406}
1407
649#ifdef BIO_RW_DISCARD 1408#ifdef BIO_RW_DISCARD
650static void btrfs_issue_discard(struct block_device *bdev, 1409static void btrfs_issue_discard(struct block_device *bdev,
651 u64 start, u64 len) 1410 u64 start, u64 len)
@@ -686,71 +1445,40 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
686#endif 1445#endif
687} 1446}
688 1447
689static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 1448int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
690 struct btrfs_root *root, u64 bytenr, 1449 struct btrfs_root *root,
691 u64 num_bytes, 1450 u64 bytenr, u64 num_bytes, u64 parent,
692 u64 orig_parent, u64 parent, 1451 u64 root_objectid, u64 owner, u64 offset)
693 u64 orig_root, u64 ref_root,
694 u64 orig_generation, u64 ref_generation,
695 u64 owner_objectid)
696{ 1452{
697 int ret; 1453 int ret;
698 int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID; 1454 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1455 root_objectid == BTRFS_TREE_LOG_OBJECTID);
699 1456
700 ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes, 1457 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
701 orig_parent, parent, orig_root, 1458 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
702 ref_root, orig_generation, 1459 parent, root_objectid, (int)owner,
703 ref_generation, owner_objectid, pin); 1460 BTRFS_ADD_DELAYED_REF, NULL);
704 BUG_ON(ret); 1461 } else {
1462 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
1463 parent, root_objectid, owner, offset,
1464 BTRFS_ADD_DELAYED_REF, NULL);
1465 }
705 return ret; 1466 return ret;
706} 1467}
707 1468
708int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
709 struct btrfs_root *root, u64 bytenr,
710 u64 num_bytes, u64 orig_parent, u64 parent,
711 u64 ref_root, u64 ref_generation,
712 u64 owner_objectid)
713{
714 int ret;
715 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
716 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
717 return 0;
718
719 ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
720 orig_parent, parent, ref_root,
721 ref_root, ref_generation,
722 ref_generation, owner_objectid);
723 return ret;
724}
725static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1469static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
726 struct btrfs_root *root, u64 bytenr, 1470 struct btrfs_root *root,
727 u64 num_bytes, 1471 u64 bytenr, u64 num_bytes,
728 u64 orig_parent, u64 parent, 1472 u64 parent, u64 root_objectid,
729 u64 orig_root, u64 ref_root, 1473 u64 owner, u64 offset, int refs_to_add,
730 u64 orig_generation, u64 ref_generation, 1474 struct btrfs_delayed_extent_op *extent_op)
731 u64 owner_objectid)
732{
733 int ret;
734
735 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
736 ref_generation, owner_objectid,
737 BTRFS_ADD_DELAYED_REF, 0);
738 BUG_ON(ret);
739 return ret;
740}
741
742static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
743 struct btrfs_root *root, u64 bytenr,
744 u64 num_bytes, u64 parent, u64 ref_root,
745 u64 ref_generation, u64 owner_objectid,
746 int refs_to_add)
747{ 1475{
748 struct btrfs_path *path; 1476 struct btrfs_path *path;
749 int ret; 1477 struct extent_buffer *leaf;
750 struct btrfs_key key;
751 struct extent_buffer *l;
752 struct btrfs_extent_item *item; 1478 struct btrfs_extent_item *item;
753 u32 refs; 1479 u64 refs;
1480 int ret;
1481 int err = 0;
754 1482
755 path = btrfs_alloc_path(); 1483 path = btrfs_alloc_path();
756 if (!path) 1484 if (!path)
@@ -758,43 +1486,27 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
758 1486
759 path->reada = 1; 1487 path->reada = 1;
760 path->leave_spinning = 1; 1488 path->leave_spinning = 1;
761 key.objectid = bytenr; 1489 /* this will setup the path even if it fails to insert the back ref */
762 key.type = BTRFS_EXTENT_ITEM_KEY; 1490 ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
763 key.offset = num_bytes; 1491 path, bytenr, num_bytes, parent,
764 1492 root_objectid, owner, offset,
765 /* first find the extent item and update its reference count */ 1493 refs_to_add, extent_op);
766 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, 1494 if (ret == 0)
767 path, 0, 1); 1495 goto out;
768 if (ret < 0) {
769 btrfs_set_path_blocking(path);
770 return ret;
771 }
772
773 if (ret > 0) {
774 WARN_ON(1);
775 btrfs_free_path(path);
776 return -EIO;
777 }
778 l = path->nodes[0];
779 1496
780 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 1497 if (ret != -EAGAIN) {
781 if (key.objectid != bytenr) { 1498 err = ret;
782 btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]); 1499 goto out;
783 printk(KERN_ERR "btrfs wanted %llu found %llu\n",
784 (unsigned long long)bytenr,
785 (unsigned long long)key.objectid);
786 BUG();
787 } 1500 }
788 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
789
790 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
791
792 refs = btrfs_extent_refs(l, item);
793 btrfs_set_extent_refs(l, item, refs + refs_to_add);
794 btrfs_unlock_up_safe(path, 1);
795 1501
796 btrfs_mark_buffer_dirty(path->nodes[0]); 1502 leaf = path->nodes[0];
1503 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1504 refs = btrfs_extent_refs(leaf, item);
1505 btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1506 if (extent_op)
1507 __run_delayed_extent_op(extent_op, leaf, item);
797 1508
1509 btrfs_mark_buffer_dirty(leaf);
798 btrfs_release_path(root->fs_info->extent_root, path); 1510 btrfs_release_path(root->fs_info->extent_root, path);
799 1511
800 path->reada = 1; 1512 path->reada = 1;
@@ -802,56 +1514,197 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
802 1514
803 /* now insert the actual backref */ 1515 /* now insert the actual backref */
804 ret = insert_extent_backref(trans, root->fs_info->extent_root, 1516 ret = insert_extent_backref(trans, root->fs_info->extent_root,
805 path, bytenr, parent, 1517 path, bytenr, parent, root_objectid,
806 ref_root, ref_generation, 1518 owner, offset, refs_to_add);
807 owner_objectid, refs_to_add);
808 BUG_ON(ret); 1519 BUG_ON(ret);
1520out:
809 btrfs_free_path(path); 1521 btrfs_free_path(path);
810 return 0; 1522 return err;
811} 1523}
812 1524
813int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1525static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
814 struct btrfs_root *root, 1526 struct btrfs_root *root,
815 u64 bytenr, u64 num_bytes, u64 parent, 1527 struct btrfs_delayed_ref_node *node,
816 u64 ref_root, u64 ref_generation, 1528 struct btrfs_delayed_extent_op *extent_op,
817 u64 owner_objectid) 1529 int insert_reserved)
818{ 1530{
819 int ret; 1531 int ret = 0;
820 if (ref_root == BTRFS_TREE_LOG_OBJECTID && 1532 struct btrfs_delayed_data_ref *ref;
821 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) 1533 struct btrfs_key ins;
822 return 0; 1534 u64 parent = 0;
1535 u64 ref_root = 0;
1536 u64 flags = 0;
823 1537
824 ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent, 1538 ins.objectid = node->bytenr;
825 0, ref_root, 0, ref_generation, 1539 ins.offset = node->num_bytes;
826 owner_objectid); 1540 ins.type = BTRFS_EXTENT_ITEM_KEY;
1541
1542 ref = btrfs_delayed_node_to_data_ref(node);
1543 if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1544 parent = ref->parent;
1545 else
1546 ref_root = ref->root;
1547
1548 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1549 if (extent_op) {
1550 BUG_ON(extent_op->update_key);
1551 flags |= extent_op->flags_to_set;
1552 }
1553 ret = alloc_reserved_file_extent(trans, root,
1554 parent, ref_root, flags,
1555 ref->objectid, ref->offset,
1556 &ins, node->ref_mod);
1557 update_reserved_extents(root, ins.objectid, ins.offset, 0);
1558 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1559 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1560 node->num_bytes, parent,
1561 ref_root, ref->objectid,
1562 ref->offset, node->ref_mod,
1563 extent_op);
1564 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1565 ret = __btrfs_free_extent(trans, root, node->bytenr,
1566 node->num_bytes, parent,
1567 ref_root, ref->objectid,
1568 ref->offset, node->ref_mod,
1569 extent_op);
1570 } else {
1571 BUG();
1572 }
827 return ret; 1573 return ret;
828} 1574}
829 1575
830static int drop_delayed_ref(struct btrfs_trans_handle *trans, 1576static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
831 struct btrfs_root *root, 1577 struct extent_buffer *leaf,
832 struct btrfs_delayed_ref_node *node) 1578 struct btrfs_extent_item *ei)
1579{
1580 u64 flags = btrfs_extent_flags(leaf, ei);
1581 if (extent_op->update_flags) {
1582 flags |= extent_op->flags_to_set;
1583 btrfs_set_extent_flags(leaf, ei, flags);
1584 }
1585
1586 if (extent_op->update_key) {
1587 struct btrfs_tree_block_info *bi;
1588 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1589 bi = (struct btrfs_tree_block_info *)(ei + 1);
1590 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1591 }
1592}
1593
1594static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1595 struct btrfs_root *root,
1596 struct btrfs_delayed_ref_node *node,
1597 struct btrfs_delayed_extent_op *extent_op)
1598{
1599 struct btrfs_key key;
1600 struct btrfs_path *path;
1601 struct btrfs_extent_item *ei;
1602 struct extent_buffer *leaf;
1603 u32 item_size;
1604 int ret;
1605 int err = 0;
1606
1607 path = btrfs_alloc_path();
1608 if (!path)
1609 return -ENOMEM;
1610
1611 key.objectid = node->bytenr;
1612 key.type = BTRFS_EXTENT_ITEM_KEY;
1613 key.offset = node->num_bytes;
1614
1615 path->reada = 1;
1616 path->leave_spinning = 1;
1617 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
1618 path, 0, 1);
1619 if (ret < 0) {
1620 err = ret;
1621 goto out;
1622 }
1623 if (ret > 0) {
1624 err = -EIO;
1625 goto out;
1626 }
1627
1628 leaf = path->nodes[0];
1629 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1630#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1631 if (item_size < sizeof(*ei)) {
1632 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
1633 path, (u64)-1, 0);
1634 if (ret < 0) {
1635 err = ret;
1636 goto out;
1637 }
1638 leaf = path->nodes[0];
1639 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1640 }
1641#endif
1642 BUG_ON(item_size < sizeof(*ei));
1643 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1644 __run_delayed_extent_op(extent_op, leaf, ei);
1645
1646 btrfs_mark_buffer_dirty(leaf);
1647out:
1648 btrfs_free_path(path);
1649 return err;
1650}
1651
1652static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1653 struct btrfs_root *root,
1654 struct btrfs_delayed_ref_node *node,
1655 struct btrfs_delayed_extent_op *extent_op,
1656 int insert_reserved)
833{ 1657{
834 int ret = 0; 1658 int ret = 0;
835 struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node); 1659 struct btrfs_delayed_tree_ref *ref;
1660 struct btrfs_key ins;
1661 u64 parent = 0;
1662 u64 ref_root = 0;
836 1663
837 BUG_ON(node->ref_mod == 0); 1664 ins.objectid = node->bytenr;
838 ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, 1665 ins.offset = node->num_bytes;
839 node->parent, ref->root, ref->generation, 1666 ins.type = BTRFS_EXTENT_ITEM_KEY;
840 ref->owner_objectid, ref->pin, node->ref_mod);
841 1667
1668 ref = btrfs_delayed_node_to_tree_ref(node);
1669 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1670 parent = ref->parent;
1671 else
1672 ref_root = ref->root;
1673
1674 BUG_ON(node->ref_mod != 1);
1675 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1676 BUG_ON(!extent_op || !extent_op->update_flags ||
1677 !extent_op->update_key);
1678 ret = alloc_reserved_tree_block(trans, root,
1679 parent, ref_root,
1680 extent_op->flags_to_set,
1681 &extent_op->key,
1682 ref->level, &ins);
1683 update_reserved_extents(root, ins.objectid, ins.offset, 0);
1684 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1685 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1686 node->num_bytes, parent, ref_root,
1687 ref->level, 0, 1, extent_op);
1688 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1689 ret = __btrfs_free_extent(trans, root, node->bytenr,
1690 node->num_bytes, parent, ref_root,
1691 ref->level, 0, 1, extent_op);
1692 } else {
1693 BUG();
1694 }
842 return ret; 1695 return ret;
843} 1696}
844 1697
1698
845/* helper function to actually process a single delayed ref entry */ 1699/* helper function to actually process a single delayed ref entry */
846static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans, 1700static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
847 struct btrfs_root *root, 1701 struct btrfs_root *root,
848 struct btrfs_delayed_ref_node *node, 1702 struct btrfs_delayed_ref_node *node,
849 int insert_reserved) 1703 struct btrfs_delayed_extent_op *extent_op,
1704 int insert_reserved)
850{ 1705{
851 int ret; 1706 int ret;
852 struct btrfs_delayed_ref *ref; 1707 if (btrfs_delayed_ref_is_head(node)) {
853
854 if (node->parent == (u64)-1) {
855 struct btrfs_delayed_ref_head *head; 1708 struct btrfs_delayed_ref_head *head;
856 /* 1709 /*
857 * we've hit the end of the chain and we were supposed 1710 * we've hit the end of the chain and we were supposed
@@ -859,44 +1712,35 @@ static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
859 * deleted before we ever needed to insert it, so all 1712 * deleted before we ever needed to insert it, so all
860 * we have to do is clean up the accounting 1713 * we have to do is clean up the accounting
861 */ 1714 */
1715 BUG_ON(extent_op);
1716 head = btrfs_delayed_node_to_head(node);
862 if (insert_reserved) { 1717 if (insert_reserved) {
1718 if (head->is_data) {
1719 ret = btrfs_del_csums(trans, root,
1720 node->bytenr,
1721 node->num_bytes);
1722 BUG_ON(ret);
1723 }
1724 btrfs_update_pinned_extents(root, node->bytenr,
1725 node->num_bytes, 1);
863 update_reserved_extents(root, node->bytenr, 1726 update_reserved_extents(root, node->bytenr,
864 node->num_bytes, 0); 1727 node->num_bytes, 0);
865 } 1728 }
866 head = btrfs_delayed_node_to_head(node);
867 mutex_unlock(&head->mutex); 1729 mutex_unlock(&head->mutex);
868 return 0; 1730 return 0;
869 } 1731 }
870 1732
871 ref = btrfs_delayed_node_to_ref(node); 1733 if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
872 if (ref->action == BTRFS_ADD_DELAYED_REF) { 1734 node->type == BTRFS_SHARED_BLOCK_REF_KEY)
873 if (insert_reserved) { 1735 ret = run_delayed_tree_ref(trans, root, node, extent_op,
874 struct btrfs_key ins; 1736 insert_reserved);
875 1737 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
876 ins.objectid = node->bytenr; 1738 node->type == BTRFS_SHARED_DATA_REF_KEY)
877 ins.offset = node->num_bytes; 1739 ret = run_delayed_data_ref(trans, root, node, extent_op,
878 ins.type = BTRFS_EXTENT_ITEM_KEY; 1740 insert_reserved);
879 1741 else
880 /* record the full extent allocation */ 1742 BUG();
881 ret = __btrfs_alloc_reserved_extent(trans, root, 1743 return ret;
882 node->parent, ref->root,
883 ref->generation, ref->owner_objectid,
884 &ins, node->ref_mod);
885 update_reserved_extents(root, node->bytenr,
886 node->num_bytes, 0);
887 } else {
888 /* just add one backref */
889 ret = add_extent_ref(trans, root, node->bytenr,
890 node->num_bytes,
891 node->parent, ref->root, ref->generation,
892 ref->owner_objectid, node->ref_mod);
893 }
894 BUG_ON(ret);
895 } else if (ref->action == BTRFS_DROP_DELAYED_REF) {
896 WARN_ON(insert_reserved);
897 ret = drop_delayed_ref(trans, root, node);
898 }
899 return 0;
900} 1744}
901 1745
902static noinline struct btrfs_delayed_ref_node * 1746static noinline struct btrfs_delayed_ref_node *
@@ -919,7 +1763,7 @@ again:
919 rb_node); 1763 rb_node);
920 if (ref->bytenr != head->node.bytenr) 1764 if (ref->bytenr != head->node.bytenr)
921 break; 1765 break;
922 if (btrfs_delayed_node_to_ref(ref)->action == action) 1766 if (ref->action == action)
923 return ref; 1767 return ref;
924 node = rb_prev(node); 1768 node = rb_prev(node);
925 } 1769 }
@@ -937,6 +1781,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
937 struct btrfs_delayed_ref_root *delayed_refs; 1781 struct btrfs_delayed_ref_root *delayed_refs;
938 struct btrfs_delayed_ref_node *ref; 1782 struct btrfs_delayed_ref_node *ref;
939 struct btrfs_delayed_ref_head *locked_ref = NULL; 1783 struct btrfs_delayed_ref_head *locked_ref = NULL;
1784 struct btrfs_delayed_extent_op *extent_op;
940 int ret; 1785 int ret;
941 int count = 0; 1786 int count = 0;
942 int must_insert_reserved = 0; 1787 int must_insert_reserved = 0;
@@ -975,6 +1820,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
975 must_insert_reserved = locked_ref->must_insert_reserved; 1820 must_insert_reserved = locked_ref->must_insert_reserved;
976 locked_ref->must_insert_reserved = 0; 1821 locked_ref->must_insert_reserved = 0;
977 1822
1823 extent_op = locked_ref->extent_op;
1824 locked_ref->extent_op = NULL;
1825
978 /* 1826 /*
979 * locked_ref is the head node, so we have to go one 1827 * locked_ref is the head node, so we have to go one
980 * node back for any delayed ref updates 1828 * node back for any delayed ref updates
@@ -986,6 +1834,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
986 * so that any accounting fixes can happen 1834 * so that any accounting fixes can happen
987 */ 1835 */
988 ref = &locked_ref->node; 1836 ref = &locked_ref->node;
1837
1838 if (extent_op && must_insert_reserved) {
1839 kfree(extent_op);
1840 extent_op = NULL;
1841 }
1842
1843 if (extent_op) {
1844 spin_unlock(&delayed_refs->lock);
1845
1846 ret = run_delayed_extent_op(trans, root,
1847 ref, extent_op);
1848 BUG_ON(ret);
1849 kfree(extent_op);
1850
1851 cond_resched();
1852 spin_lock(&delayed_refs->lock);
1853 continue;
1854 }
1855
989 list_del_init(&locked_ref->cluster); 1856 list_del_init(&locked_ref->cluster);
990 locked_ref = NULL; 1857 locked_ref = NULL;
991 } 1858 }
@@ -993,14 +1860,17 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
993 ref->in_tree = 0; 1860 ref->in_tree = 0;
994 rb_erase(&ref->rb_node, &delayed_refs->root); 1861 rb_erase(&ref->rb_node, &delayed_refs->root);
995 delayed_refs->num_entries--; 1862 delayed_refs->num_entries--;
1863
996 spin_unlock(&delayed_refs->lock); 1864 spin_unlock(&delayed_refs->lock);
997 1865
998 ret = run_one_delayed_ref(trans, root, ref, 1866 ret = run_one_delayed_ref(trans, root, ref, extent_op,
999 must_insert_reserved); 1867 must_insert_reserved);
1000 BUG_ON(ret); 1868 BUG_ON(ret);
1001 btrfs_put_delayed_ref(ref);
1002 1869
1870 btrfs_put_delayed_ref(ref);
1871 kfree(extent_op);
1003 count++; 1872 count++;
1873
1004 cond_resched(); 1874 cond_resched();
1005 spin_lock(&delayed_refs->lock); 1875 spin_lock(&delayed_refs->lock);
1006 } 1876 }
@@ -1095,25 +1965,112 @@ out:
1095 return 0; 1965 return 0;
1096} 1966}
1097 1967
1098int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 1968int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
1099 struct btrfs_root *root, u64 objectid, u64 bytenr) 1969 struct btrfs_root *root,
1970 u64 bytenr, u64 num_bytes, u64 flags,
1971 int is_data)
1972{
1973 struct btrfs_delayed_extent_op *extent_op;
1974 int ret;
1975
1976 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
1977 if (!extent_op)
1978 return -ENOMEM;
1979
1980 extent_op->flags_to_set = flags;
1981 extent_op->update_flags = 1;
1982 extent_op->update_key = 0;
1983 extent_op->is_data = is_data ? 1 : 0;
1984
1985 ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
1986 if (ret)
1987 kfree(extent_op);
1988 return ret;
1989}
1990
1991static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
1992 struct btrfs_root *root,
1993 struct btrfs_path *path,
1994 u64 objectid, u64 offset, u64 bytenr)
1995{
1996 struct btrfs_delayed_ref_head *head;
1997 struct btrfs_delayed_ref_node *ref;
1998 struct btrfs_delayed_data_ref *data_ref;
1999 struct btrfs_delayed_ref_root *delayed_refs;
2000 struct rb_node *node;
2001 int ret = 0;
2002
2003 ret = -ENOENT;
2004 delayed_refs = &trans->transaction->delayed_refs;
2005 spin_lock(&delayed_refs->lock);
2006 head = btrfs_find_delayed_ref_head(trans, bytenr);
2007 if (!head)
2008 goto out;
2009
2010 if (!mutex_trylock(&head->mutex)) {
2011 atomic_inc(&head->node.refs);
2012 spin_unlock(&delayed_refs->lock);
2013
2014 btrfs_release_path(root->fs_info->extent_root, path);
2015
2016 mutex_lock(&head->mutex);
2017 mutex_unlock(&head->mutex);
2018 btrfs_put_delayed_ref(&head->node);
2019 return -EAGAIN;
2020 }
2021
2022 node = rb_prev(&head->node.rb_node);
2023 if (!node)
2024 goto out_unlock;
2025
2026 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2027
2028 if (ref->bytenr != bytenr)
2029 goto out_unlock;
2030
2031 ret = 1;
2032 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2033 goto out_unlock;
2034
2035 data_ref = btrfs_delayed_node_to_data_ref(ref);
2036
2037 node = rb_prev(node);
2038 if (node) {
2039 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2040 if (ref->bytenr == bytenr)
2041 goto out_unlock;
2042 }
2043
2044 if (data_ref->root != root->root_key.objectid ||
2045 data_ref->objectid != objectid || data_ref->offset != offset)
2046 goto out_unlock;
2047
2048 ret = 0;
2049out_unlock:
2050 mutex_unlock(&head->mutex);
2051out:
2052 spin_unlock(&delayed_refs->lock);
2053 return ret;
2054}
2055
2056static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2057 struct btrfs_root *root,
2058 struct btrfs_path *path,
2059 u64 objectid, u64 offset, u64 bytenr)
1100{ 2060{
1101 struct btrfs_root *extent_root = root->fs_info->extent_root; 2061 struct btrfs_root *extent_root = root->fs_info->extent_root;
1102 struct btrfs_path *path;
1103 struct extent_buffer *leaf; 2062 struct extent_buffer *leaf;
1104 struct btrfs_extent_ref *ref_item; 2063 struct btrfs_extent_data_ref *ref;
2064 struct btrfs_extent_inline_ref *iref;
2065 struct btrfs_extent_item *ei;
1105 struct btrfs_key key; 2066 struct btrfs_key key;
1106 struct btrfs_key found_key; 2067 u32 item_size;
1107 u64 ref_root;
1108 u64 last_snapshot;
1109 u32 nritems;
1110 int ret; 2068 int ret;
1111 2069
1112 key.objectid = bytenr; 2070 key.objectid = bytenr;
1113 key.offset = (u64)-1; 2071 key.offset = (u64)-1;
1114 key.type = BTRFS_EXTENT_ITEM_KEY; 2072 key.type = BTRFS_EXTENT_ITEM_KEY;
1115 2073
1116 path = btrfs_alloc_path();
1117 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2074 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1118 if (ret < 0) 2075 if (ret < 0)
1119 goto out; 2076 goto out;
@@ -1125,55 +2082,83 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1125 2082
1126 path->slots[0]--; 2083 path->slots[0]--;
1127 leaf = path->nodes[0]; 2084 leaf = path->nodes[0];
1128 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2085 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1129 2086
1130 if (found_key.objectid != bytenr || 2087 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
1131 found_key.type != BTRFS_EXTENT_ITEM_KEY)
1132 goto out; 2088 goto out;
1133 2089
1134 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 2090 ret = 1;
1135 while (1) { 2091 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1136 leaf = path->nodes[0]; 2092#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1137 nritems = btrfs_header_nritems(leaf); 2093 if (item_size < sizeof(*ei)) {
1138 if (path->slots[0] >= nritems) { 2094 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
1139 ret = btrfs_next_leaf(extent_root, path); 2095 goto out;
1140 if (ret < 0) 2096 }
1141 goto out; 2097#endif
1142 if (ret == 0) 2098 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1143 continue;
1144 break;
1145 }
1146 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1147 if (found_key.objectid != bytenr)
1148 break;
1149 2099
1150 if (found_key.type != BTRFS_EXTENT_REF_KEY) { 2100 if (item_size != sizeof(*ei) +
1151 path->slots[0]++; 2101 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
1152 continue; 2102 goto out;
1153 }
1154 2103
1155 ref_item = btrfs_item_ptr(leaf, path->slots[0], 2104 if (btrfs_extent_generation(leaf, ei) <=
1156 struct btrfs_extent_ref); 2105 btrfs_root_last_snapshot(&root->root_item))
1157 ref_root = btrfs_ref_root(leaf, ref_item); 2106 goto out;
1158 if ((ref_root != root->root_key.objectid && 2107
1159 ref_root != BTRFS_TREE_LOG_OBJECTID) || 2108 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
1160 objectid != btrfs_ref_objectid(leaf, ref_item)) { 2109 if (btrfs_extent_inline_ref_type(leaf, iref) !=
1161 ret = 1; 2110 BTRFS_EXTENT_DATA_REF_KEY)
1162 goto out; 2111 goto out;
1163 } 2112
1164 if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) { 2113 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
1165 ret = 1; 2114 if (btrfs_extent_refs(leaf, ei) !=
2115 btrfs_extent_data_ref_count(leaf, ref) ||
2116 btrfs_extent_data_ref_root(leaf, ref) !=
2117 root->root_key.objectid ||
2118 btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2119 btrfs_extent_data_ref_offset(leaf, ref) != offset)
2120 goto out;
2121
2122 ret = 0;
2123out:
2124 return ret;
2125}
2126
2127int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2128 struct btrfs_root *root,
2129 u64 objectid, u64 offset, u64 bytenr)
2130{
2131 struct btrfs_path *path;
2132 int ret;
2133 int ret2;
2134
2135 path = btrfs_alloc_path();
2136 if (!path)
2137 return -ENOENT;
2138
2139 do {
2140 ret = check_committed_ref(trans, root, path, objectid,
2141 offset, bytenr);
2142 if (ret && ret != -ENOENT)
1166 goto out; 2143 goto out;
1167 }
1168 2144
1169 path->slots[0]++; 2145 ret2 = check_delayed_ref(trans, root, path, objectid,
2146 offset, bytenr);
2147 } while (ret2 == -EAGAIN);
2148
2149 if (ret2 && ret2 != -ENOENT) {
2150 ret = ret2;
2151 goto out;
1170 } 2152 }
1171 ret = 0; 2153
2154 if (ret != -ENOENT || ret2 != -ENOENT)
2155 ret = 0;
1172out: 2156out:
1173 btrfs_free_path(path); 2157 btrfs_free_path(path);
1174 return ret; 2158 return ret;
1175} 2159}
1176 2160
2161#if 0
1177int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2162int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1178 struct extent_buffer *buf, u32 nr_extents) 2163 struct extent_buffer *buf, u32 nr_extents)
1179{ 2164{
@@ -1291,62 +2276,44 @@ static int refsort_cmp(const void *a_void, const void *b_void)
1291 return 1; 2276 return 1;
1292 return 0; 2277 return 0;
1293} 2278}
2279#endif
1294 2280
1295 2281static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
1296noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1297 struct btrfs_root *root, 2282 struct btrfs_root *root,
1298 struct extent_buffer *orig_buf, 2283 struct extent_buffer *buf,
1299 struct extent_buffer *buf, u32 *nr_extents) 2284 int full_backref, int inc)
1300{ 2285{
1301 u64 bytenr; 2286 u64 bytenr;
2287 u64 num_bytes;
2288 u64 parent;
1302 u64 ref_root; 2289 u64 ref_root;
1303 u64 orig_root;
1304 u64 ref_generation;
1305 u64 orig_generation;
1306 struct refsort *sorted;
1307 u32 nritems; 2290 u32 nritems;
1308 u32 nr_file_extents = 0;
1309 struct btrfs_key key; 2291 struct btrfs_key key;
1310 struct btrfs_file_extent_item *fi; 2292 struct btrfs_file_extent_item *fi;
1311 int i; 2293 int i;
1312 int level; 2294 int level;
1313 int ret = 0; 2295 int ret = 0;
1314 int faili = 0;
1315 int refi = 0;
1316 int slot;
1317 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 2296 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1318 u64, u64, u64, u64, u64, u64, u64, u64, u64); 2297 u64, u64, u64, u64, u64, u64);
1319 2298
1320 ref_root = btrfs_header_owner(buf); 2299 ref_root = btrfs_header_owner(buf);
1321 ref_generation = btrfs_header_generation(buf);
1322 orig_root = btrfs_header_owner(orig_buf);
1323 orig_generation = btrfs_header_generation(orig_buf);
1324
1325 nritems = btrfs_header_nritems(buf); 2300 nritems = btrfs_header_nritems(buf);
1326 level = btrfs_header_level(buf); 2301 level = btrfs_header_level(buf);
1327 2302
1328 sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS); 2303 if (!root->ref_cows && level == 0)
1329 BUG_ON(!sorted); 2304 return 0;
1330 2305
1331 if (root->ref_cows) { 2306 if (inc)
1332 process_func = __btrfs_inc_extent_ref; 2307 process_func = btrfs_inc_extent_ref;
1333 } else { 2308 else
1334 if (level == 0 && 2309 process_func = btrfs_free_extent;
1335 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 2310
1336 goto out; 2311 if (full_backref)
1337 if (level != 0 && 2312 parent = buf->start;
1338 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) 2313 else
1339 goto out; 2314 parent = 0;
1340 process_func = __btrfs_update_extent_ref;
1341 }
1342 2315
1343 /*
1344 * we make two passes through the items. In the first pass we
1345 * only record the byte number and slot. Then we sort based on
1346 * byte number and do the actual work based on the sorted results
1347 */
1348 for (i = 0; i < nritems; i++) { 2316 for (i = 0; i < nritems; i++) {
1349 cond_resched();
1350 if (level == 0) { 2317 if (level == 0) {
1351 btrfs_item_key_to_cpu(buf, &key, i); 2318 btrfs_item_key_to_cpu(buf, &key, i);
1352 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 2319 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
@@ -1360,151 +2327,38 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1360 if (bytenr == 0) 2327 if (bytenr == 0)
1361 continue; 2328 continue;
1362 2329
1363 nr_file_extents++; 2330 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
1364 sorted[refi].bytenr = bytenr; 2331 key.offset -= btrfs_file_extent_offset(buf, fi);
1365 sorted[refi].slot = i; 2332 ret = process_func(trans, root, bytenr, num_bytes,
1366 refi++; 2333 parent, ref_root, key.objectid,
1367 } else { 2334 key.offset);
1368 bytenr = btrfs_node_blockptr(buf, i); 2335 if (ret)
1369 sorted[refi].bytenr = bytenr;
1370 sorted[refi].slot = i;
1371 refi++;
1372 }
1373 }
1374 /*
1375 * if refi == 0, we didn't actually put anything into the sorted
1376 * array and we're done
1377 */
1378 if (refi == 0)
1379 goto out;
1380
1381 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
1382
1383 for (i = 0; i < refi; i++) {
1384 cond_resched();
1385 slot = sorted[i].slot;
1386 bytenr = sorted[i].bytenr;
1387
1388 if (level == 0) {
1389 btrfs_item_key_to_cpu(buf, &key, slot);
1390 fi = btrfs_item_ptr(buf, slot,
1391 struct btrfs_file_extent_item);
1392
1393 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1394 if (bytenr == 0)
1395 continue;
1396
1397 ret = process_func(trans, root, bytenr,
1398 btrfs_file_extent_disk_num_bytes(buf, fi),
1399 orig_buf->start, buf->start,
1400 orig_root, ref_root,
1401 orig_generation, ref_generation,
1402 key.objectid);
1403
1404 if (ret) {
1405 faili = slot;
1406 WARN_ON(1);
1407 goto fail; 2336 goto fail;
1408 }
1409 } else { 2337 } else {
1410 ret = process_func(trans, root, bytenr, buf->len, 2338 bytenr = btrfs_node_blockptr(buf, i);
1411 orig_buf->start, buf->start, 2339 num_bytes = btrfs_level_size(root, level - 1);
1412 orig_root, ref_root, 2340 ret = process_func(trans, root, bytenr, num_bytes,
1413 orig_generation, ref_generation, 2341 parent, ref_root, level - 1, 0);
1414 level - 1); 2342 if (ret)
1415 if (ret) {
1416 faili = slot;
1417 WARN_ON(1);
1418 goto fail; 2343 goto fail;
1419 }
1420 } 2344 }
1421 } 2345 }
1422out:
1423 kfree(sorted);
1424 if (nr_extents) {
1425 if (level == 0)
1426 *nr_extents = nr_file_extents;
1427 else
1428 *nr_extents = nritems;
1429 }
1430 return 0; 2346 return 0;
1431fail: 2347fail:
1432 kfree(sorted); 2348 BUG();
1433 WARN_ON(1);
1434 return ret; 2349 return ret;
1435} 2350}
1436 2351
1437int btrfs_update_ref(struct btrfs_trans_handle *trans, 2352int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1438 struct btrfs_root *root, struct extent_buffer *orig_buf, 2353 struct extent_buffer *buf, int full_backref)
1439 struct extent_buffer *buf, int start_slot, int nr)
1440
1441{ 2354{
1442 u64 bytenr; 2355 return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
1443 u64 ref_root; 2356}
1444 u64 orig_root;
1445 u64 ref_generation;
1446 u64 orig_generation;
1447 struct btrfs_key key;
1448 struct btrfs_file_extent_item *fi;
1449 int i;
1450 int ret;
1451 int slot;
1452 int level;
1453
1454 BUG_ON(start_slot < 0);
1455 BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
1456
1457 ref_root = btrfs_header_owner(buf);
1458 ref_generation = btrfs_header_generation(buf);
1459 orig_root = btrfs_header_owner(orig_buf);
1460 orig_generation = btrfs_header_generation(orig_buf);
1461 level = btrfs_header_level(buf);
1462
1463 if (!root->ref_cows) {
1464 if (level == 0 &&
1465 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1466 return 0;
1467 if (level != 0 &&
1468 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1469 return 0;
1470 }
1471 2357
1472 for (i = 0, slot = start_slot; i < nr; i++, slot++) { 2358int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1473 cond_resched(); 2359 struct extent_buffer *buf, int full_backref)
1474 if (level == 0) { 2360{
1475 btrfs_item_key_to_cpu(buf, &key, slot); 2361 return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
1476 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1477 continue;
1478 fi = btrfs_item_ptr(buf, slot,
1479 struct btrfs_file_extent_item);
1480 if (btrfs_file_extent_type(buf, fi) ==
1481 BTRFS_FILE_EXTENT_INLINE)
1482 continue;
1483 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1484 if (bytenr == 0)
1485 continue;
1486 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1487 btrfs_file_extent_disk_num_bytes(buf, fi),
1488 orig_buf->start, buf->start,
1489 orig_root, ref_root, orig_generation,
1490 ref_generation, key.objectid);
1491 if (ret)
1492 goto fail;
1493 } else {
1494 bytenr = btrfs_node_blockptr(buf, slot);
1495 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1496 buf->len, orig_buf->start,
1497 buf->start, orig_root, ref_root,
1498 orig_generation, ref_generation,
1499 level - 1);
1500 if (ret)
1501 goto fail;
1502 }
1503 }
1504 return 0;
1505fail:
1506 WARN_ON(1);
1507 return -1;
1508} 2362}
1509 2363
1510static int write_one_cache_group(struct btrfs_trans_handle *trans, 2364static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -2007,6 +2861,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
2007 u64 old_val; 2861 u64 old_val;
2008 u64 byte_in_group; 2862 u64 byte_in_group;
2009 2863
2864 /* block accounting for super block */
2865 spin_lock(&info->delalloc_lock);
2866 old_val = btrfs_super_bytes_used(&info->super_copy);
2867 if (alloc)
2868 old_val += num_bytes;
2869 else
2870 old_val -= num_bytes;
2871 btrfs_set_super_bytes_used(&info->super_copy, old_val);
2872
2873 /* block accounting for root item */
2874 old_val = btrfs_root_used(&root->root_item);
2875 if (alloc)
2876 old_val += num_bytes;
2877 else
2878 old_val -= num_bytes;
2879 btrfs_set_root_used(&root->root_item, old_val);
2880 spin_unlock(&info->delalloc_lock);
2881
2010 while (total) { 2882 while (total) {
2011 cache = btrfs_lookup_block_group(info, bytenr); 2883 cache = btrfs_lookup_block_group(info, bytenr);
2012 if (!cache) 2884 if (!cache)
@@ -2216,8 +3088,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
2216 u64 header_owner = btrfs_header_owner(buf); 3088 u64 header_owner = btrfs_header_owner(buf);
2217 u64 header_transid = btrfs_header_generation(buf); 3089 u64 header_transid = btrfs_header_generation(buf);
2218 if (header_owner != BTRFS_TREE_LOG_OBJECTID && 3090 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2219 header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2220 header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
2221 header_transid == trans->transid && 3091 header_transid == trans->transid &&
2222 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 3092 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2223 *must_clean = buf; 3093 *must_clean = buf;
@@ -2235,63 +3105,77 @@ pinit:
2235 return 0; 3105 return 0;
2236} 3106}
2237 3107
2238/* 3108
2239 * remove an extent from the root, returns 0 on success 3109static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2240 */ 3110 struct btrfs_root *root,
2241static int __free_extent(struct btrfs_trans_handle *trans, 3111 u64 bytenr, u64 num_bytes, u64 parent,
2242 struct btrfs_root *root, 3112 u64 root_objectid, u64 owner_objectid,
2243 u64 bytenr, u64 num_bytes, u64 parent, 3113 u64 owner_offset, int refs_to_drop,
2244 u64 root_objectid, u64 ref_generation, 3114 struct btrfs_delayed_extent_op *extent_op)
2245 u64 owner_objectid, int pin, int mark_free,
2246 int refs_to_drop)
2247{ 3115{
2248 struct btrfs_path *path;
2249 struct btrfs_key key; 3116 struct btrfs_key key;
3117 struct btrfs_path *path;
2250 struct btrfs_fs_info *info = root->fs_info; 3118 struct btrfs_fs_info *info = root->fs_info;
2251 struct btrfs_root *extent_root = info->extent_root; 3119 struct btrfs_root *extent_root = info->extent_root;
2252 struct extent_buffer *leaf; 3120 struct extent_buffer *leaf;
3121 struct btrfs_extent_item *ei;
3122 struct btrfs_extent_inline_ref *iref;
2253 int ret; 3123 int ret;
3124 int is_data;
2254 int extent_slot = 0; 3125 int extent_slot = 0;
2255 int found_extent = 0; 3126 int found_extent = 0;
2256 int num_to_del = 1; 3127 int num_to_del = 1;
2257 struct btrfs_extent_item *ei; 3128 u32 item_size;
2258 u32 refs; 3129 u64 refs;
2259 3130
2260 key.objectid = bytenr;
2261 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
2262 key.offset = num_bytes;
2263 path = btrfs_alloc_path(); 3131 path = btrfs_alloc_path();
2264 if (!path) 3132 if (!path)
2265 return -ENOMEM; 3133 return -ENOMEM;
2266 3134
2267 path->reada = 1; 3135 path->reada = 1;
2268 path->leave_spinning = 1; 3136 path->leave_spinning = 1;
2269 ret = lookup_extent_backref(trans, extent_root, path, 3137
2270 bytenr, parent, root_objectid, 3138 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
2271 ref_generation, owner_objectid, 1); 3139 BUG_ON(!is_data && refs_to_drop != 1);
3140
3141 ret = lookup_extent_backref(trans, extent_root, path, &iref,
3142 bytenr, num_bytes, parent,
3143 root_objectid, owner_objectid,
3144 owner_offset);
2272 if (ret == 0) { 3145 if (ret == 0) {
2273 struct btrfs_key found_key;
2274 extent_slot = path->slots[0]; 3146 extent_slot = path->slots[0];
2275 while (extent_slot > 0) { 3147 while (extent_slot >= 0) {
2276 extent_slot--; 3148 btrfs_item_key_to_cpu(path->nodes[0], &key,
2277 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2278 extent_slot); 3149 extent_slot);
2279 if (found_key.objectid != bytenr) 3150 if (key.objectid != bytenr)
2280 break; 3151 break;
2281 if (found_key.type == BTRFS_EXTENT_ITEM_KEY && 3152 if (key.type == BTRFS_EXTENT_ITEM_KEY &&
2282 found_key.offset == num_bytes) { 3153 key.offset == num_bytes) {
2283 found_extent = 1; 3154 found_extent = 1;
2284 break; 3155 break;
2285 } 3156 }
2286 if (path->slots[0] - extent_slot > 5) 3157 if (path->slots[0] - extent_slot > 5)
2287 break; 3158 break;
3159 extent_slot--;
2288 } 3160 }
3161#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3162 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
3163 if (found_extent && item_size < sizeof(*ei))
3164 found_extent = 0;
3165#endif
2289 if (!found_extent) { 3166 if (!found_extent) {
3167 BUG_ON(iref);
2290 ret = remove_extent_backref(trans, extent_root, path, 3168 ret = remove_extent_backref(trans, extent_root, path,
2291 refs_to_drop); 3169 NULL, refs_to_drop,
3170 is_data);
2292 BUG_ON(ret); 3171 BUG_ON(ret);
2293 btrfs_release_path(extent_root, path); 3172 btrfs_release_path(extent_root, path);
2294 path->leave_spinning = 1; 3173 path->leave_spinning = 1;
3174
3175 key.objectid = bytenr;
3176 key.type = BTRFS_EXTENT_ITEM_KEY;
3177 key.offset = num_bytes;
3178
2295 ret = btrfs_search_slot(trans, extent_root, 3179 ret = btrfs_search_slot(trans, extent_root,
2296 &key, path, -1, 1); 3180 &key, path, -1, 1);
2297 if (ret) { 3181 if (ret) {
@@ -2307,82 +3191,98 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2307 btrfs_print_leaf(extent_root, path->nodes[0]); 3191 btrfs_print_leaf(extent_root, path->nodes[0]);
2308 WARN_ON(1); 3192 WARN_ON(1);
2309 printk(KERN_ERR "btrfs unable to find ref byte nr %llu " 3193 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
2310 "parent %llu root %llu gen %llu owner %llu\n", 3194 "parent %llu root %llu owner %llu offset %llu\n",
2311 (unsigned long long)bytenr, 3195 (unsigned long long)bytenr,
2312 (unsigned long long)parent, 3196 (unsigned long long)parent,
2313 (unsigned long long)root_objectid, 3197 (unsigned long long)root_objectid,
2314 (unsigned long long)ref_generation, 3198 (unsigned long long)owner_objectid,
2315 (unsigned long long)owner_objectid); 3199 (unsigned long long)owner_offset);
2316 } 3200 }
2317 3201
2318 leaf = path->nodes[0]; 3202 leaf = path->nodes[0];
3203 item_size = btrfs_item_size_nr(leaf, extent_slot);
3204#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3205 if (item_size < sizeof(*ei)) {
3206 BUG_ON(found_extent || extent_slot != path->slots[0]);
3207 ret = convert_extent_item_v0(trans, extent_root, path,
3208 owner_objectid, 0);
3209 BUG_ON(ret < 0);
3210
3211 btrfs_release_path(extent_root, path);
3212 path->leave_spinning = 1;
3213
3214 key.objectid = bytenr;
3215 key.type = BTRFS_EXTENT_ITEM_KEY;
3216 key.offset = num_bytes;
3217
3218 ret = btrfs_search_slot(trans, extent_root, &key, path,
3219 -1, 1);
3220 if (ret) {
3221 printk(KERN_ERR "umm, got %d back from search"
3222 ", was looking for %llu\n", ret,
3223 (unsigned long long)bytenr);
3224 btrfs_print_leaf(extent_root, path->nodes[0]);
3225 }
3226 BUG_ON(ret);
3227 extent_slot = path->slots[0];
3228 leaf = path->nodes[0];
3229 item_size = btrfs_item_size_nr(leaf, extent_slot);
3230 }
3231#endif
3232 BUG_ON(item_size < sizeof(*ei));
2319 ei = btrfs_item_ptr(leaf, extent_slot, 3233 ei = btrfs_item_ptr(leaf, extent_slot,
2320 struct btrfs_extent_item); 3234 struct btrfs_extent_item);
2321 refs = btrfs_extent_refs(leaf, ei); 3235 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2322 3236 struct btrfs_tree_block_info *bi;
2323 /* 3237 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
2324 * we're not allowed to delete the extent item if there 3238 bi = (struct btrfs_tree_block_info *)(ei + 1);
2325 * are other delayed ref updates pending 3239 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
2326 */ 3240 }
2327 3241
3242 refs = btrfs_extent_refs(leaf, ei);
2328 BUG_ON(refs < refs_to_drop); 3243 BUG_ON(refs < refs_to_drop);
2329 refs -= refs_to_drop; 3244 refs -= refs_to_drop;
2330 btrfs_set_extent_refs(leaf, ei, refs);
2331 btrfs_mark_buffer_dirty(leaf);
2332 3245
2333 if (refs == 0 && found_extent && 3246 if (refs > 0) {
2334 path->slots[0] == extent_slot + 1) { 3247 if (extent_op)
2335 struct btrfs_extent_ref *ref; 3248 __run_delayed_extent_op(extent_op, leaf, ei);
2336 ref = btrfs_item_ptr(leaf, path->slots[0], 3249 /*
2337 struct btrfs_extent_ref); 3250 * In the case of inline back ref, reference count will
2338 BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop); 3251 * be updated by remove_extent_backref
2339 /* if the back ref and the extent are next to each other
2340 * they get deleted below in one shot
2341 */ 3252 */
2342 path->slots[0] = extent_slot; 3253 if (iref) {
2343 num_to_del = 2; 3254 BUG_ON(!found_extent);
2344 } else if (found_extent) { 3255 } else {
2345 /* otherwise delete the extent back ref */ 3256 btrfs_set_extent_refs(leaf, ei, refs);
2346 ret = remove_extent_backref(trans, extent_root, path, 3257 btrfs_mark_buffer_dirty(leaf);
2347 refs_to_drop); 3258 }
2348 BUG_ON(ret); 3259 if (found_extent) {
2349 /* if refs are 0, we need to setup the path for deletion */ 3260 ret = remove_extent_backref(trans, extent_root, path,
2350 if (refs == 0) { 3261 iref, refs_to_drop,
2351 btrfs_release_path(extent_root, path); 3262 is_data);
2352 path->leave_spinning = 1;
2353 ret = btrfs_search_slot(trans, extent_root, &key, path,
2354 -1, 1);
2355 BUG_ON(ret); 3263 BUG_ON(ret);
2356 } 3264 }
2357 } 3265 } else {
2358 3266 int mark_free = 0;
2359 if (refs == 0) {
2360 u64 super_used;
2361 u64 root_used;
2362 struct extent_buffer *must_clean = NULL; 3267 struct extent_buffer *must_clean = NULL;
2363 3268
2364 if (pin) { 3269 if (found_extent) {
2365 ret = pin_down_bytes(trans, root, path, 3270 BUG_ON(is_data && refs_to_drop !=
2366 bytenr, num_bytes, 3271 extent_data_ref_count(root, path, iref));
2367 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID, 3272 if (iref) {
2368 &must_clean); 3273 BUG_ON(path->slots[0] != extent_slot);
2369 if (ret > 0) 3274 } else {
2370 mark_free = 1; 3275 BUG_ON(path->slots[0] != extent_slot + 1);
2371 BUG_ON(ret < 0); 3276 path->slots[0] = extent_slot;
3277 num_to_del = 2;
3278 }
2372 } 3279 }
2373 3280
2374 /* block accounting for super block */ 3281 ret = pin_down_bytes(trans, root, path, bytenr,
2375 spin_lock(&info->delalloc_lock); 3282 num_bytes, is_data, &must_clean);
2376 super_used = btrfs_super_bytes_used(&info->super_copy); 3283 if (ret > 0)
2377 btrfs_set_super_bytes_used(&info->super_copy, 3284 mark_free = 1;
2378 super_used - num_bytes); 3285 BUG_ON(ret < 0);
2379
2380 /* block accounting for root item */
2381 root_used = btrfs_root_used(&root->root_item);
2382 btrfs_set_root_used(&root->root_item,
2383 root_used - num_bytes);
2384 spin_unlock(&info->delalloc_lock);
2385
2386 /* 3286 /*
2387 * it is going to be very rare for someone to be waiting 3287 * it is going to be very rare for someone to be waiting
2388 * on the block we're freeing. del_items might need to 3288 * on the block we're freeing. del_items might need to
@@ -2403,7 +3303,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2403 free_extent_buffer(must_clean); 3303 free_extent_buffer(must_clean);
2404 } 3304 }
2405 3305
2406 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 3306 if (is_data) {
2407 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 3307 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2408 BUG_ON(ret); 3308 BUG_ON(ret);
2409 } else { 3309 } else {
@@ -2421,34 +3321,6 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2421} 3321}
2422 3322
2423/* 3323/*
2424 * remove an extent from the root, returns 0 on success
2425 */
2426static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2427 struct btrfs_root *root,
2428 u64 bytenr, u64 num_bytes, u64 parent,
2429 u64 root_objectid, u64 ref_generation,
2430 u64 owner_objectid, int pin,
2431 int refs_to_drop)
2432{
2433 WARN_ON(num_bytes < root->sectorsize);
2434
2435 /*
2436 * if metadata always pin
2437 * if data pin when any transaction has committed this
2438 */
2439 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
2440 ref_generation != trans->transid)
2441 pin = 1;
2442
2443 if (ref_generation != trans->transid)
2444 pin = 1;
2445
2446 return __free_extent(trans, root, bytenr, num_bytes, parent,
2447 root_objectid, ref_generation,
2448 owner_objectid, pin, pin == 0, refs_to_drop);
2449}
2450
2451/*
2452 * when we free an extent, it is possible (and likely) that we free the last 3324 * when we free an extent, it is possible (and likely) that we free the last
2453 * delayed ref for that extent as well. This searches the delayed ref tree for 3325 * delayed ref for that extent as well. This searches the delayed ref tree for
2454 * a given extent, and if there are no other delayed refs to be processed, it 3326 * a given extent, and if there are no other delayed refs to be processed, it
@@ -2479,6 +3351,13 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
2479 if (ref->bytenr == bytenr) 3351 if (ref->bytenr == bytenr)
2480 goto out; 3352 goto out;
2481 3353
3354 if (head->extent_op) {
3355 if (!head->must_insert_reserved)
3356 goto out;
3357 kfree(head->extent_op);
3358 head->extent_op = NULL;
3359 }
3360
2482 /* 3361 /*
2483 * waiting for the lock here would deadlock. If someone else has it 3362 * waiting for the lock here would deadlock. If someone else has it
2484 * locked they are already in the process of dropping it anyway 3363 * locked they are already in the process of dropping it anyway
@@ -2507,7 +3386,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
2507 spin_unlock(&delayed_refs->lock); 3386 spin_unlock(&delayed_refs->lock);
2508 3387
2509 ret = run_one_delayed_ref(trans, root->fs_info->tree_root, 3388 ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
2510 &head->node, head->must_insert_reserved); 3389 &head->node, head->extent_op,
3390 head->must_insert_reserved);
2511 BUG_ON(ret); 3391 BUG_ON(ret);
2512 btrfs_put_delayed_ref(&head->node); 3392 btrfs_put_delayed_ref(&head->node);
2513 return 0; 3393 return 0;
@@ -2519,32 +3399,32 @@ out:
2519int btrfs_free_extent(struct btrfs_trans_handle *trans, 3399int btrfs_free_extent(struct btrfs_trans_handle *trans,
2520 struct btrfs_root *root, 3400 struct btrfs_root *root,
2521 u64 bytenr, u64 num_bytes, u64 parent, 3401 u64 bytenr, u64 num_bytes, u64 parent,
2522 u64 root_objectid, u64 ref_generation, 3402 u64 root_objectid, u64 owner, u64 offset)
2523 u64 owner_objectid, int pin)
2524{ 3403{
2525 int ret; 3404 int ret;
2526 3405
2527 /* 3406 /*
2528 * tree log blocks never actually go into the extent allocation 3407 * tree log blocks never actually go into the extent allocation
2529 * tree, just update pinning info and exit early. 3408 * tree, just update pinning info and exit early.
2530 *
2531 * data extents referenced by the tree log do need to have
2532 * their reference counts bumped.
2533 */ 3409 */
2534 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && 3410 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
2535 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { 3411 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
2536 /* unlocks the pinned mutex */ 3412 /* unlocks the pinned mutex */
2537 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 3413 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2538 update_reserved_extents(root, bytenr, num_bytes, 0); 3414 update_reserved_extents(root, bytenr, num_bytes, 0);
2539 ret = 0; 3415 ret = 0;
2540 } else { 3416 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2541 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, 3417 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
2542 root_objectid, ref_generation, 3418 parent, root_objectid, (int)owner,
2543 owner_objectid, 3419 BTRFS_DROP_DELAYED_REF, NULL);
2544 BTRFS_DROP_DELAYED_REF, 1);
2545 BUG_ON(ret); 3420 BUG_ON(ret);
2546 ret = check_ref_cleanup(trans, root, bytenr); 3421 ret = check_ref_cleanup(trans, root, bytenr);
2547 BUG_ON(ret); 3422 BUG_ON(ret);
3423 } else {
3424 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
3425 parent, root_objectid, owner,
3426 offset, BTRFS_DROP_DELAYED_REF, NULL);
3427 BUG_ON(ret);
2548 } 3428 }
2549 return ret; 3429 return ret;
2550} 3430}
@@ -2719,7 +3599,7 @@ refill_cluster:
2719 last_ptr_loop = 0; 3599 last_ptr_loop = 0;
2720 3600
2721 /* allocate a cluster in this block group */ 3601 /* allocate a cluster in this block group */
2722 ret = btrfs_find_space_cluster(trans, 3602 ret = btrfs_find_space_cluster(trans, root,
2723 block_group, last_ptr, 3603 block_group, last_ptr,
2724 offset, num_bytes, 3604 offset, num_bytes,
2725 empty_cluster + empty_size); 3605 empty_cluster + empty_size);
@@ -2969,99 +3849,147 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2969 return ret; 3849 return ret;
2970} 3850}
2971 3851
2972static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, 3852static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
2973 struct btrfs_root *root, u64 parent, 3853 struct btrfs_root *root,
2974 u64 root_objectid, u64 ref_generation, 3854 u64 parent, u64 root_objectid,
2975 u64 owner, struct btrfs_key *ins, 3855 u64 flags, u64 owner, u64 offset,
2976 int ref_mod) 3856 struct btrfs_key *ins, int ref_mod)
2977{ 3857{
2978 int ret; 3858 int ret;
2979 u64 super_used; 3859 struct btrfs_fs_info *fs_info = root->fs_info;
2980 u64 root_used;
2981 u64 num_bytes = ins->offset;
2982 u32 sizes[2];
2983 struct btrfs_fs_info *info = root->fs_info;
2984 struct btrfs_root *extent_root = info->extent_root;
2985 struct btrfs_extent_item *extent_item; 3860 struct btrfs_extent_item *extent_item;
2986 struct btrfs_extent_ref *ref; 3861 struct btrfs_extent_inline_ref *iref;
2987 struct btrfs_path *path; 3862 struct btrfs_path *path;
2988 struct btrfs_key keys[2]; 3863 struct extent_buffer *leaf;
2989 3864 int type;
2990 if (parent == 0) 3865 u32 size;
2991 parent = ins->objectid;
2992
2993 /* block accounting for super block */
2994 spin_lock(&info->delalloc_lock);
2995 super_used = btrfs_super_bytes_used(&info->super_copy);
2996 btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
2997 3866
2998 /* block accounting for root item */ 3867 if (parent > 0)
2999 root_used = btrfs_root_used(&root->root_item); 3868 type = BTRFS_SHARED_DATA_REF_KEY;
3000 btrfs_set_root_used(&root->root_item, root_used + num_bytes); 3869 else
3001 spin_unlock(&info->delalloc_lock); 3870 type = BTRFS_EXTENT_DATA_REF_KEY;
3002 3871
3003 memcpy(&keys[0], ins, sizeof(*ins)); 3872 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
3004 keys[1].objectid = ins->objectid;
3005 keys[1].type = BTRFS_EXTENT_REF_KEY;
3006 keys[1].offset = parent;
3007 sizes[0] = sizeof(*extent_item);
3008 sizes[1] = sizeof(*ref);
3009 3873
3010 path = btrfs_alloc_path(); 3874 path = btrfs_alloc_path();
3011 BUG_ON(!path); 3875 BUG_ON(!path);
3012 3876
3013 path->leave_spinning = 1; 3877 path->leave_spinning = 1;
3014 ret = btrfs_insert_empty_items(trans, extent_root, path, keys, 3878 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
3015 sizes, 2); 3879 ins, size);
3016 BUG_ON(ret); 3880 BUG_ON(ret);
3017 3881
3018 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3882 leaf = path->nodes[0];
3883 extent_item = btrfs_item_ptr(leaf, path->slots[0],
3019 struct btrfs_extent_item); 3884 struct btrfs_extent_item);
3020 btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod); 3885 btrfs_set_extent_refs(leaf, extent_item, ref_mod);
3021 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 3886 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
3022 struct btrfs_extent_ref); 3887 btrfs_set_extent_flags(leaf, extent_item,
3023 3888 flags | BTRFS_EXTENT_FLAG_DATA);
3024 btrfs_set_ref_root(path->nodes[0], ref, root_objectid); 3889
3025 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); 3890 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
3026 btrfs_set_ref_objectid(path->nodes[0], ref, owner); 3891 btrfs_set_extent_inline_ref_type(leaf, iref, type);
3027 btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod); 3892 if (parent > 0) {
3893 struct btrfs_shared_data_ref *ref;
3894 ref = (struct btrfs_shared_data_ref *)(iref + 1);
3895 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
3896 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
3897 } else {
3898 struct btrfs_extent_data_ref *ref;
3899 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3900 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
3901 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
3902 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
3903 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
3904 }
3028 3905
3029 btrfs_mark_buffer_dirty(path->nodes[0]); 3906 btrfs_mark_buffer_dirty(path->nodes[0]);
3030
3031 trans->alloc_exclude_start = 0;
3032 trans->alloc_exclude_nr = 0;
3033 btrfs_free_path(path); 3907 btrfs_free_path(path);
3034 3908
3035 if (ret) 3909 ret = update_block_group(trans, root, ins->objectid, ins->offset,
3036 goto out; 3910 1, 0);
3037
3038 ret = update_block_group(trans, root, ins->objectid,
3039 ins->offset, 1, 0);
3040 if (ret) { 3911 if (ret) {
3041 printk(KERN_ERR "btrfs update block group failed for %llu " 3912 printk(KERN_ERR "btrfs update block group failed for %llu "
3042 "%llu\n", (unsigned long long)ins->objectid, 3913 "%llu\n", (unsigned long long)ins->objectid,
3043 (unsigned long long)ins->offset); 3914 (unsigned long long)ins->offset);
3044 BUG(); 3915 BUG();
3045 } 3916 }
3046out:
3047 return ret; 3917 return ret;
3048} 3918}
3049 3919
3050int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, 3920static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
3051 struct btrfs_root *root, u64 parent, 3921 struct btrfs_root *root,
3052 u64 root_objectid, u64 ref_generation, 3922 u64 parent, u64 root_objectid,
3053 u64 owner, struct btrfs_key *ins) 3923 u64 flags, struct btrfs_disk_key *key,
3924 int level, struct btrfs_key *ins)
3054{ 3925{
3055 int ret; 3926 int ret;
3927 struct btrfs_fs_info *fs_info = root->fs_info;
3928 struct btrfs_extent_item *extent_item;
3929 struct btrfs_tree_block_info *block_info;
3930 struct btrfs_extent_inline_ref *iref;
3931 struct btrfs_path *path;
3932 struct extent_buffer *leaf;
3933 u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
3056 3934
3057 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) 3935 path = btrfs_alloc_path();
3058 return 0; 3936 BUG_ON(!path);
3059 3937
3060 ret = btrfs_add_delayed_ref(trans, ins->objectid, 3938 path->leave_spinning = 1;
3061 ins->offset, parent, root_objectid, 3939 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
3062 ref_generation, owner, 3940 ins, size);
3063 BTRFS_ADD_DELAYED_EXTENT, 0);
3064 BUG_ON(ret); 3941 BUG_ON(ret);
3942
3943 leaf = path->nodes[0];
3944 extent_item = btrfs_item_ptr(leaf, path->slots[0],
3945 struct btrfs_extent_item);
3946 btrfs_set_extent_refs(leaf, extent_item, 1);
3947 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
3948 btrfs_set_extent_flags(leaf, extent_item,
3949 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
3950 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
3951
3952 btrfs_set_tree_block_key(leaf, block_info, key);
3953 btrfs_set_tree_block_level(leaf, block_info, level);
3954
3955 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
3956 if (parent > 0) {
3957 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
3958 btrfs_set_extent_inline_ref_type(leaf, iref,
3959 BTRFS_SHARED_BLOCK_REF_KEY);
3960 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
3961 } else {
3962 btrfs_set_extent_inline_ref_type(leaf, iref,
3963 BTRFS_TREE_BLOCK_REF_KEY);
3964 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
3965 }
3966
3967 btrfs_mark_buffer_dirty(leaf);
3968 btrfs_free_path(path);
3969
3970 ret = update_block_group(trans, root, ins->objectid, ins->offset,
3971 1, 0);
3972 if (ret) {
3973 printk(KERN_ERR "btrfs update block group failed for %llu "
3974 "%llu\n", (unsigned long long)ins->objectid,
3975 (unsigned long long)ins->offset);
3976 BUG();
3977 }
3978 return ret;
3979}
3980
3981int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
3982 struct btrfs_root *root,
3983 u64 root_objectid, u64 owner,
3984 u64 offset, struct btrfs_key *ins)
3985{
3986 int ret;
3987
3988 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
3989
3990 ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
3991 0, root_objectid, owner, offset,
3992 BTRFS_ADD_DELAYED_EXTENT, NULL);
3065 return ret; 3993 return ret;
3066} 3994}
3067 3995
@@ -3070,10 +3998,10 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3070 * an extent has been allocated and makes sure to clear the free 3998 * an extent has been allocated and makes sure to clear the free
3071 * space cache bits as well 3999 * space cache bits as well
3072 */ 4000 */
3073int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, 4001int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
3074 struct btrfs_root *root, u64 parent, 4002 struct btrfs_root *root,
3075 u64 root_objectid, u64 ref_generation, 4003 u64 root_objectid, u64 owner, u64 offset,
3076 u64 owner, struct btrfs_key *ins) 4004 struct btrfs_key *ins)
3077{ 4005{
3078 int ret; 4006 int ret;
3079 struct btrfs_block_group_cache *block_group; 4007 struct btrfs_block_group_cache *block_group;
@@ -3087,8 +4015,8 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3087 ins->offset); 4015 ins->offset);
3088 BUG_ON(ret); 4016 BUG_ON(ret);
3089 btrfs_put_block_group(block_group); 4017 btrfs_put_block_group(block_group);
3090 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, 4018 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
3091 ref_generation, owner, ins, 1); 4019 0, owner, offset, ins, 1);
3092 return ret; 4020 return ret;
3093} 4021}
3094 4022
@@ -3099,26 +4027,48 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3099 * 4027 *
3100 * returns 0 if everything worked, non-zero otherwise. 4028 * returns 0 if everything worked, non-zero otherwise.
3101 */ 4029 */
3102int btrfs_alloc_extent(struct btrfs_trans_handle *trans, 4030static int alloc_tree_block(struct btrfs_trans_handle *trans,
3103 struct btrfs_root *root, 4031 struct btrfs_root *root,
3104 u64 num_bytes, u64 parent, u64 min_alloc_size, 4032 u64 num_bytes, u64 parent, u64 root_objectid,
3105 u64 root_objectid, u64 ref_generation, 4033 struct btrfs_disk_key *key, int level,
3106 u64 owner_objectid, u64 empty_size, u64 hint_byte, 4034 u64 empty_size, u64 hint_byte, u64 search_end,
3107 u64 search_end, struct btrfs_key *ins, u64 data) 4035 struct btrfs_key *ins)
3108{ 4036{
3109 int ret; 4037 int ret;
3110 ret = __btrfs_reserve_extent(trans, root, num_bytes, 4038 u64 flags = 0;
3111 min_alloc_size, empty_size, hint_byte, 4039
3112 search_end, ins, data); 4040 ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4041 empty_size, hint_byte, search_end,
4042 ins, 0);
3113 BUG_ON(ret); 4043 BUG_ON(ret);
4044
4045 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4046 if (parent == 0)
4047 parent = ins->objectid;
4048 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4049 } else
4050 BUG_ON(parent > 0);
4051
4052 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3114 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 4053 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3115 ret = btrfs_add_delayed_ref(trans, ins->objectid, 4054 struct btrfs_delayed_extent_op *extent_op;
3116 ins->offset, parent, root_objectid, 4055 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3117 ref_generation, owner_objectid, 4056 BUG_ON(!extent_op);
3118 BTRFS_ADD_DELAYED_EXTENT, 0); 4057 if (key)
4058 memcpy(&extent_op->key, key, sizeof(extent_op->key));
4059 else
4060 memset(&extent_op->key, 0, sizeof(extent_op->key));
4061 extent_op->flags_to_set = flags;
4062 extent_op->update_key = 1;
4063 extent_op->update_flags = 1;
4064 extent_op->is_data = 0;
4065
4066 ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
4067 ins->offset, parent, root_objectid,
4068 level, BTRFS_ADD_DELAYED_EXTENT,
4069 extent_op);
3119 BUG_ON(ret); 4070 BUG_ON(ret);
3120 } 4071 }
3121 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3122 return ret; 4072 return ret;
3123} 4073}
3124 4074
@@ -3157,21 +4107,17 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3157 * returns the tree buffer or NULL. 4107 * returns the tree buffer or NULL.
3158 */ 4108 */
3159struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 4109struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3160 struct btrfs_root *root, 4110 struct btrfs_root *root, u32 blocksize,
3161 u32 blocksize, u64 parent, 4111 u64 parent, u64 root_objectid,
3162 u64 root_objectid, 4112 struct btrfs_disk_key *key, int level,
3163 u64 ref_generation, 4113 u64 hint, u64 empty_size)
3164 int level,
3165 u64 hint,
3166 u64 empty_size)
3167{ 4114{
3168 struct btrfs_key ins; 4115 struct btrfs_key ins;
3169 int ret; 4116 int ret;
3170 struct extent_buffer *buf; 4117 struct extent_buffer *buf;
3171 4118
3172 ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize, 4119 ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid,
3173 root_objectid, ref_generation, level, 4120 key, level, empty_size, hint, (u64)-1, &ins);
3174 empty_size, hint, (u64)-1, &ins, 0);
3175 if (ret) { 4121 if (ret) {
3176 BUG_ON(ret > 0); 4122 BUG_ON(ret > 0);
3177 return ERR_PTR(ret); 4123 return ERR_PTR(ret);
@@ -3185,32 +4131,19 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3185int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 4131int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3186 struct btrfs_root *root, struct extent_buffer *leaf) 4132 struct btrfs_root *root, struct extent_buffer *leaf)
3187{ 4133{
3188 u64 leaf_owner; 4134 u64 disk_bytenr;
3189 u64 leaf_generation; 4135 u64 num_bytes;
3190 struct refsort *sorted;
3191 struct btrfs_key key; 4136 struct btrfs_key key;
3192 struct btrfs_file_extent_item *fi; 4137 struct btrfs_file_extent_item *fi;
4138 u32 nritems;
3193 int i; 4139 int i;
3194 int nritems;
3195 int ret; 4140 int ret;
3196 int refi = 0;
3197 int slot;
3198 4141
3199 BUG_ON(!btrfs_is_leaf(leaf)); 4142 BUG_ON(!btrfs_is_leaf(leaf));
3200 nritems = btrfs_header_nritems(leaf); 4143 nritems = btrfs_header_nritems(leaf);
3201 leaf_owner = btrfs_header_owner(leaf);
3202 leaf_generation = btrfs_header_generation(leaf);
3203 4144
3204 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3205 /* we do this loop twice. The first time we build a list
3206 * of the extents we have a reference on, then we sort the list
3207 * by bytenr. The second time around we actually do the
3208 * extent freeing.
3209 */
3210 for (i = 0; i < nritems; i++) { 4145 for (i = 0; i < nritems; i++) {
3211 u64 disk_bytenr;
3212 cond_resched(); 4146 cond_resched();
3213
3214 btrfs_item_key_to_cpu(leaf, &key, i); 4147 btrfs_item_key_to_cpu(leaf, &key, i);
3215 4148
3216 /* only extents have references, skip everything else */ 4149 /* only extents have references, skip everything else */
@@ -3230,45 +4163,16 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3230 if (disk_bytenr == 0) 4163 if (disk_bytenr == 0)
3231 continue; 4164 continue;
3232 4165
3233 sorted[refi].bytenr = disk_bytenr; 4166 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
3234 sorted[refi].slot = i; 4167 ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes,
3235 refi++; 4168 leaf->start, 0, key.objectid, 0);
3236 }
3237
3238 if (refi == 0)
3239 goto out;
3240
3241 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3242
3243 for (i = 0; i < refi; i++) {
3244 u64 disk_bytenr;
3245
3246 disk_bytenr = sorted[i].bytenr;
3247 slot = sorted[i].slot;
3248
3249 cond_resched();
3250
3251 btrfs_item_key_to_cpu(leaf, &key, slot);
3252 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3253 continue;
3254
3255 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3256
3257 ret = btrfs_free_extent(trans, root, disk_bytenr,
3258 btrfs_file_extent_disk_num_bytes(leaf, fi),
3259 leaf->start, leaf_owner, leaf_generation,
3260 key.objectid, 0);
3261 BUG_ON(ret); 4169 BUG_ON(ret);
3262
3263 atomic_inc(&root->fs_info->throttle_gen);
3264 wake_up(&root->fs_info->transaction_throttle);
3265 cond_resched();
3266 } 4170 }
3267out:
3268 kfree(sorted);
3269 return 0; 4171 return 0;
3270} 4172}
3271 4173
4174#if 0
4175
3272static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, 4176static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3273 struct btrfs_root *root, 4177 struct btrfs_root *root,
3274 struct btrfs_leaf_ref *ref) 4178 struct btrfs_leaf_ref *ref)
@@ -3311,13 +4215,14 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3311 return 0; 4215 return 0;
3312} 4216}
3313 4217
4218
3314static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, 4219static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
3315 struct btrfs_root *root, u64 start, 4220 struct btrfs_root *root, u64 start,
3316 u64 len, u32 *refs) 4221 u64 len, u32 *refs)
3317{ 4222{
3318 int ret; 4223 int ret;
3319 4224
3320 ret = btrfs_lookup_extent_ref(trans, root, start, len, refs); 4225 ret = btrfs_lookup_extent_refs(trans, root, start, len, refs);
3321 BUG_ON(ret); 4226 BUG_ON(ret);
3322 4227
3323#if 0 /* some debugging code in case we see problems here */ 4228#if 0 /* some debugging code in case we see problems here */
@@ -3352,6 +4257,7 @@ static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
3352 return ret; 4257 return ret;
3353} 4258}
3354 4259
4260
3355/* 4261/*
3356 * this is used while deleting old snapshots, and it drops the refs 4262 * this is used while deleting old snapshots, and it drops the refs
3357 * on a whole subtree starting from a level 1 node. 4263 * on a whole subtree starting from a level 1 node.
@@ -3645,32 +4551,36 @@ out:
3645 cond_resched(); 4551 cond_resched();
3646 return 0; 4552 return 0;
3647} 4553}
4554#endif
3648 4555
3649/* 4556/*
3650 * helper function for drop_subtree, this function is similar to 4557 * helper function for drop_subtree, this function is similar to
3651 * walk_down_tree. The main difference is that it checks reference 4558 * walk_down_tree. The main difference is that it checks reference
3652 * counts while tree blocks are locked. 4559 * counts while tree blocks are locked.
3653 */ 4560 */
3654static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, 4561static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3655 struct btrfs_root *root, 4562 struct btrfs_root *root,
3656 struct btrfs_path *path, int *level) 4563 struct btrfs_path *path, int *level)
3657{ 4564{
3658 struct extent_buffer *next; 4565 struct extent_buffer *next;
3659 struct extent_buffer *cur; 4566 struct extent_buffer *cur;
3660 struct extent_buffer *parent; 4567 struct extent_buffer *parent;
3661 u64 bytenr; 4568 u64 bytenr;
3662 u64 ptr_gen; 4569 u64 ptr_gen;
4570 u64 refs;
4571 u64 flags;
3663 u32 blocksize; 4572 u32 blocksize;
3664 u32 refs;
3665 int ret; 4573 int ret;
3666 4574
3667 cur = path->nodes[*level]; 4575 cur = path->nodes[*level];
3668 ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len, 4576 ret = btrfs_lookup_extent_info(trans, root, cur->start, cur->len,
3669 &refs); 4577 &refs, &flags);
3670 BUG_ON(ret); 4578 BUG_ON(ret);
3671 if (refs > 1) 4579 if (refs > 1)
3672 goto out; 4580 goto out;
3673 4581
4582 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
4583
3674 while (*level >= 0) { 4584 while (*level >= 0) {
3675 cur = path->nodes[*level]; 4585 cur = path->nodes[*level];
3676 if (*level == 0) { 4586 if (*level == 0) {
@@ -3692,16 +4602,15 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
3692 btrfs_tree_lock(next); 4602 btrfs_tree_lock(next);
3693 btrfs_set_lock_blocking(next); 4603 btrfs_set_lock_blocking(next);
3694 4604
3695 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, 4605 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
3696 &refs); 4606 &refs, &flags);
3697 BUG_ON(ret); 4607 BUG_ON(ret);
3698 if (refs > 1) { 4608 if (refs > 1) {
3699 parent = path->nodes[*level]; 4609 parent = path->nodes[*level];
3700 ret = btrfs_free_extent(trans, root, bytenr, 4610 ret = btrfs_free_extent(trans, root, bytenr,
3701 blocksize, parent->start, 4611 blocksize, parent->start,
3702 btrfs_header_owner(parent), 4612 btrfs_header_owner(parent),
3703 btrfs_header_generation(parent), 4613 *level - 1, 0);
3704 *level - 1, 1);
3705 BUG_ON(ret); 4614 BUG_ON(ret);
3706 path->slots[*level]++; 4615 path->slots[*level]++;
3707 btrfs_tree_unlock(next); 4616 btrfs_tree_unlock(next);
@@ -3709,6 +4618,8 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
3709 continue; 4618 continue;
3710 } 4619 }
3711 4620
4621 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
4622
3712 *level = btrfs_header_level(next); 4623 *level = btrfs_header_level(next);
3713 path->nodes[*level] = next; 4624 path->nodes[*level] = next;
3714 path->slots[*level] = 0; 4625 path->slots[*level] = 0;
@@ -3716,13 +4627,15 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
3716 cond_resched(); 4627 cond_resched();
3717 } 4628 }
3718out: 4629out:
3719 parent = path->nodes[*level + 1]; 4630 if (path->nodes[*level] == root->node)
4631 parent = path->nodes[*level];
4632 else
4633 parent = path->nodes[*level + 1];
3720 bytenr = path->nodes[*level]->start; 4634 bytenr = path->nodes[*level]->start;
3721 blocksize = path->nodes[*level]->len; 4635 blocksize = path->nodes[*level]->len;
3722 4636
3723 ret = btrfs_free_extent(trans, root, bytenr, blocksize, 4637 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start,
3724 parent->start, btrfs_header_owner(parent), 4638 btrfs_header_owner(parent), *level, 0);
3725 btrfs_header_generation(parent), *level, 1);
3726 BUG_ON(ret); 4639 BUG_ON(ret);
3727 4640
3728 if (path->locks[*level]) { 4641 if (path->locks[*level]) {
@@ -3746,8 +4659,6 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3746 struct btrfs_path *path, 4659 struct btrfs_path *path,
3747 int *level, int max_level) 4660 int *level, int max_level)
3748{ 4661{
3749 u64 root_owner;
3750 u64 root_gen;
3751 struct btrfs_root_item *root_item = &root->root_item; 4662 struct btrfs_root_item *root_item = &root->root_item;
3752 int i; 4663 int i;
3753 int slot; 4664 int slot;
@@ -3755,24 +4666,22 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3755 4666
3756 for (i = *level; i < max_level && path->nodes[i]; i++) { 4667 for (i = *level; i < max_level && path->nodes[i]; i++) {
3757 slot = path->slots[i]; 4668 slot = path->slots[i];
3758 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 4669 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
3759 struct extent_buffer *node;
3760 struct btrfs_disk_key disk_key;
3761
3762 /* 4670 /*
3763 * there is more work to do in this level. 4671 * there is more work to do in this level.
3764 * Update the drop_progress marker to reflect 4672 * Update the drop_progress marker to reflect
3765 * the work we've done so far, and then bump 4673 * the work we've done so far, and then bump
3766 * the slot number 4674 * the slot number
3767 */ 4675 */
3768 node = path->nodes[i];
3769 path->slots[i]++; 4676 path->slots[i]++;
3770 *level = i;
3771 WARN_ON(*level == 0); 4677 WARN_ON(*level == 0);
3772 btrfs_node_key(node, &disk_key, path->slots[i]); 4678 if (max_level == BTRFS_MAX_LEVEL) {
3773 memcpy(&root_item->drop_progress, 4679 btrfs_node_key(path->nodes[i],
3774 &disk_key, sizeof(disk_key)); 4680 &root_item->drop_progress,
3775 root_item->drop_level = i; 4681 path->slots[i]);
4682 root_item->drop_level = i;
4683 }
4684 *level = i;
3776 return 0; 4685 return 0;
3777 } else { 4686 } else {
3778 struct extent_buffer *parent; 4687 struct extent_buffer *parent;
@@ -3786,22 +4695,20 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3786 else 4695 else
3787 parent = path->nodes[*level + 1]; 4696 parent = path->nodes[*level + 1];
3788 4697
3789 root_owner = btrfs_header_owner(parent); 4698 clean_tree_block(trans, root, path->nodes[i]);
3790 root_gen = btrfs_header_generation(parent);
3791
3792 clean_tree_block(trans, root, path->nodes[*level]);
3793 ret = btrfs_free_extent(trans, root, 4699 ret = btrfs_free_extent(trans, root,
3794 path->nodes[*level]->start, 4700 path->nodes[i]->start,
3795 path->nodes[*level]->len, 4701 path->nodes[i]->len,
3796 parent->start, root_owner, 4702 parent->start,
3797 root_gen, *level, 1); 4703 btrfs_header_owner(parent),
4704 *level, 0);
3798 BUG_ON(ret); 4705 BUG_ON(ret);
3799 if (path->locks[*level]) { 4706 if (path->locks[*level]) {
3800 btrfs_tree_unlock(path->nodes[*level]); 4707 btrfs_tree_unlock(path->nodes[i]);
3801 path->locks[*level] = 0; 4708 path->locks[i] = 0;
3802 } 4709 }
3803 free_extent_buffer(path->nodes[*level]); 4710 free_extent_buffer(path->nodes[i]);
3804 path->nodes[*level] = NULL; 4711 path->nodes[i] = NULL;
3805 *level = i + 1; 4712 *level = i + 1;
3806 } 4713 }
3807 } 4714 }
@@ -3820,21 +4727,18 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
3820 int wret; 4727 int wret;
3821 int level; 4728 int level;
3822 struct btrfs_path *path; 4729 struct btrfs_path *path;
3823 int i;
3824 int orig_level;
3825 int update_count; 4730 int update_count;
3826 struct btrfs_root_item *root_item = &root->root_item; 4731 struct btrfs_root_item *root_item = &root->root_item;
3827 4732
3828 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
3829 path = btrfs_alloc_path(); 4733 path = btrfs_alloc_path();
3830 BUG_ON(!path); 4734 BUG_ON(!path);
3831 4735
3832 level = btrfs_header_level(root->node); 4736 level = btrfs_header_level(root->node);
3833 orig_level = level;
3834 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 4737 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
3835 path->nodes[level] = root->node; 4738 path->nodes[level] = btrfs_lock_root_node(root);
3836 extent_buffer_get(root->node); 4739 btrfs_set_lock_blocking(path->nodes[level]);
3837 path->slots[level] = 0; 4740 path->slots[level] = 0;
4741 path->locks[level] = 1;
3838 } else { 4742 } else {
3839 struct btrfs_key key; 4743 struct btrfs_key key;
3840 struct btrfs_disk_key found_key; 4744 struct btrfs_disk_key found_key;
@@ -3856,12 +4760,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
3856 * unlock our path, this is safe because only this 4760 * unlock our path, this is safe because only this
3857 * function is allowed to delete this snapshot 4761 * function is allowed to delete this snapshot
3858 */ 4762 */
3859 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 4763 btrfs_unlock_up_safe(path, 0);
3860 if (path->nodes[i] && path->locks[i]) {
3861 path->locks[i] = 0;
3862 btrfs_tree_unlock(path->nodes[i]);
3863 }
3864 }
3865 } 4764 }
3866 while (1) { 4765 while (1) {
3867 unsigned long update; 4766 unsigned long update;
@@ -3882,8 +4781,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
3882 ret = -EAGAIN; 4781 ret = -EAGAIN;
3883 break; 4782 break;
3884 } 4783 }
3885 atomic_inc(&root->fs_info->throttle_gen);
3886 wake_up(&root->fs_info->transaction_throttle);
3887 for (update_count = 0; update_count < 16; update_count++) { 4784 for (update_count = 0; update_count < 16; update_count++) {
3888 update = trans->delayed_ref_updates; 4785 update = trans->delayed_ref_updates;
3889 trans->delayed_ref_updates = 0; 4786 trans->delayed_ref_updates = 0;
@@ -3893,12 +4790,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
3893 break; 4790 break;
3894 } 4791 }
3895 } 4792 }
3896 for (i = 0; i <= orig_level; i++) {
3897 if (path->nodes[i]) {
3898 free_extent_buffer(path->nodes[i]);
3899 path->nodes[i] = NULL;
3900 }
3901 }
3902out: 4793out:
3903 btrfs_free_path(path); 4794 btrfs_free_path(path);
3904 return ret; 4795 return ret;
@@ -3931,7 +4822,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3931 path->slots[level] = 0; 4822 path->slots[level] = 0;
3932 4823
3933 while (1) { 4824 while (1) {
3934 wret = walk_down_subtree(trans, root, path, &level); 4825 wret = walk_down_tree(trans, root, path, &level);
3935 if (wret < 0) 4826 if (wret < 0)
3936 ret = wret; 4827 ret = wret;
3937 if (wret != 0) 4828 if (wret != 0)
@@ -3948,6 +4839,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3948 return ret; 4839 return ret;
3949} 4840}
3950 4841
4842#if 0
3951static unsigned long calc_ra(unsigned long start, unsigned long last, 4843static unsigned long calc_ra(unsigned long start, unsigned long last,
3952 unsigned long nr) 4844 unsigned long nr)
3953{ 4845{
@@ -5429,6 +6321,7 @@ out:
5429 kfree(ref_path); 6321 kfree(ref_path);
5430 return ret; 6322 return ret;
5431} 6323}
6324#endif
5432 6325
5433static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) 6326static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
5434{ 6327{
@@ -5477,7 +6370,8 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root,
5477 u64 calc; 6370 u64 calc;
5478 6371
5479 spin_lock(&shrink_block_group->lock); 6372 spin_lock(&shrink_block_group->lock);
5480 if (btrfs_block_group_used(&shrink_block_group->item) > 0) { 6373 if (btrfs_block_group_used(&shrink_block_group->item) +
6374 shrink_block_group->reserved > 0) {
5481 spin_unlock(&shrink_block_group->lock); 6375 spin_unlock(&shrink_block_group->lock);
5482 6376
5483 trans = btrfs_start_transaction(root, 1); 6377 trans = btrfs_start_transaction(root, 1);
@@ -5502,6 +6396,17 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root,
5502 return 0; 6396 return 0;
5503} 6397}
5504 6398
6399
6400int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
6401 struct btrfs_block_group_cache *group)
6402
6403{
6404 __alloc_chunk_for_shrink(root, group, 1);
6405 set_block_group_readonly(group);
6406 return 0;
6407}
6408
6409#if 0
5505static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 6410static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
5506 struct btrfs_root *root, 6411 struct btrfs_root *root,
5507 u64 objectid, u64 size) 6412 u64 objectid, u64 size)
@@ -5781,6 +6686,7 @@ out:
5781 btrfs_free_path(path); 6686 btrfs_free_path(path);
5782 return ret; 6687 return ret;
5783} 6688}
6689#endif
5784 6690
5785static int find_first_block_group(struct btrfs_root *root, 6691static int find_first_block_group(struct btrfs_root *root,
5786 struct btrfs_path *path, struct btrfs_key *key) 6692 struct btrfs_path *path, struct btrfs_key *key)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fe9eb990e443..68260180f587 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -476,6 +476,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
476 struct extent_state *state; 476 struct extent_state *state;
477 struct extent_state *prealloc = NULL; 477 struct extent_state *prealloc = NULL;
478 struct rb_node *node; 478 struct rb_node *node;
479 u64 last_end;
479 int err; 480 int err;
480 int set = 0; 481 int set = 0;
481 482
@@ -498,6 +499,7 @@ again:
498 if (state->start > end) 499 if (state->start > end)
499 goto out; 500 goto out;
500 WARN_ON(state->end < start); 501 WARN_ON(state->end < start);
502 last_end = state->end;
501 503
502 /* 504 /*
503 * | ---- desired range ---- | 505 * | ---- desired range ---- |
@@ -524,9 +526,11 @@ again:
524 if (err) 526 if (err)
525 goto out; 527 goto out;
526 if (state->end <= end) { 528 if (state->end <= end) {
527 start = state->end + 1;
528 set |= clear_state_bit(tree, state, bits, 529 set |= clear_state_bit(tree, state, bits,
529 wake, delete); 530 wake, delete);
531 if (last_end == (u64)-1)
532 goto out;
533 start = last_end + 1;
530 } else { 534 } else {
531 start = state->start; 535 start = state->start;
532 } 536 }
@@ -552,8 +556,10 @@ again:
552 goto out; 556 goto out;
553 } 557 }
554 558
555 start = state->end + 1;
556 set |= clear_state_bit(tree, state, bits, wake, delete); 559 set |= clear_state_bit(tree, state, bits, wake, delete);
560 if (last_end == (u64)-1)
561 goto out;
562 start = last_end + 1;
557 goto search_again; 563 goto search_again;
558 564
559out: 565out:
@@ -707,8 +713,10 @@ again:
707 goto out; 713 goto out;
708 } 714 }
709 set_state_bits(tree, state, bits); 715 set_state_bits(tree, state, bits);
710 start = state->end + 1;
711 merge_state(tree, state); 716 merge_state(tree, state);
717 if (last_end == (u64)-1)
718 goto out;
719 start = last_end + 1;
712 goto search_again; 720 goto search_again;
713 } 721 }
714 722
@@ -742,8 +750,10 @@ again:
742 goto out; 750 goto out;
743 if (state->end <= end) { 751 if (state->end <= end) {
744 set_state_bits(tree, state, bits); 752 set_state_bits(tree, state, bits);
745 start = state->end + 1;
746 merge_state(tree, state); 753 merge_state(tree, state);
754 if (last_end == (u64)-1)
755 goto out;
756 start = last_end + 1;
747 } else { 757 } else {
748 start = state->start; 758 start = state->start;
749 } 759 }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1d51dc38bb49..126477eaecf5 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -291,16 +291,12 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
291{ 291{
292 u64 extent_end = 0; 292 u64 extent_end = 0;
293 u64 search_start = start; 293 u64 search_start = start;
294 u64 leaf_start;
295 u64 ram_bytes = 0; 294 u64 ram_bytes = 0;
296 u64 orig_parent = 0;
297 u64 disk_bytenr = 0; 295 u64 disk_bytenr = 0;
298 u64 orig_locked_end = locked_end; 296 u64 orig_locked_end = locked_end;
299 u8 compression; 297 u8 compression;
300 u8 encryption; 298 u8 encryption;
301 u16 other_encoding = 0; 299 u16 other_encoding = 0;
302 u64 root_gen;
303 u64 root_owner;
304 struct extent_buffer *leaf; 300 struct extent_buffer *leaf;
305 struct btrfs_file_extent_item *extent; 301 struct btrfs_file_extent_item *extent;
306 struct btrfs_path *path; 302 struct btrfs_path *path;
@@ -340,9 +336,6 @@ next_slot:
340 bookend = 0; 336 bookend = 0;
341 found_extent = 0; 337 found_extent = 0;
342 found_inline = 0; 338 found_inline = 0;
343 leaf_start = 0;
344 root_gen = 0;
345 root_owner = 0;
346 compression = 0; 339 compression = 0;
347 encryption = 0; 340 encryption = 0;
348 extent = NULL; 341 extent = NULL;
@@ -417,9 +410,6 @@ next_slot:
417 if (found_extent) { 410 if (found_extent) {
418 read_extent_buffer(leaf, &old, (unsigned long)extent, 411 read_extent_buffer(leaf, &old, (unsigned long)extent,
419 sizeof(old)); 412 sizeof(old));
420 root_gen = btrfs_header_generation(leaf);
421 root_owner = btrfs_header_owner(leaf);
422 leaf_start = leaf->start;
423 } 413 }
424 414
425 if (end < extent_end && end >= key.offset) { 415 if (end < extent_end && end >= key.offset) {
@@ -443,14 +433,14 @@ next_slot:
443 } 433 }
444 locked_end = extent_end; 434 locked_end = extent_end;
445 } 435 }
446 orig_parent = path->nodes[0]->start;
447 disk_bytenr = le64_to_cpu(old.disk_bytenr); 436 disk_bytenr = le64_to_cpu(old.disk_bytenr);
448 if (disk_bytenr != 0) { 437 if (disk_bytenr != 0) {
449 ret = btrfs_inc_extent_ref(trans, root, 438 ret = btrfs_inc_extent_ref(trans, root,
450 disk_bytenr, 439 disk_bytenr,
451 le64_to_cpu(old.disk_num_bytes), 440 le64_to_cpu(old.disk_num_bytes), 0,
452 orig_parent, root->root_key.objectid, 441 root->root_key.objectid,
453 trans->transid, inode->i_ino); 442 key.objectid, key.offset -
443 le64_to_cpu(old.offset));
454 BUG_ON(ret); 444 BUG_ON(ret);
455 } 445 }
456 } 446 }
@@ -568,17 +558,6 @@ next_slot:
568 btrfs_mark_buffer_dirty(path->nodes[0]); 558 btrfs_mark_buffer_dirty(path->nodes[0]);
569 btrfs_set_lock_blocking(path->nodes[0]); 559 btrfs_set_lock_blocking(path->nodes[0]);
570 560
571 if (disk_bytenr != 0) {
572 ret = btrfs_update_extent_ref(trans, root,
573 disk_bytenr,
574 le64_to_cpu(old.disk_num_bytes),
575 orig_parent,
576 leaf->start,
577 root->root_key.objectid,
578 trans->transid, ins.objectid);
579
580 BUG_ON(ret);
581 }
582 path->leave_spinning = 0; 561 path->leave_spinning = 0;
583 btrfs_release_path(root, path); 562 btrfs_release_path(root, path);
584 if (disk_bytenr != 0) 563 if (disk_bytenr != 0)
@@ -594,8 +573,9 @@ next_slot:
594 ret = btrfs_free_extent(trans, root, 573 ret = btrfs_free_extent(trans, root,
595 old_disk_bytenr, 574 old_disk_bytenr,
596 le64_to_cpu(old.disk_num_bytes), 575 le64_to_cpu(old.disk_num_bytes),
597 leaf_start, root_owner, 576 0, root->root_key.objectid,
598 root_gen, key.objectid, 0); 577 key.objectid, key.offset -
578 le64_to_cpu(old.offset));
599 BUG_ON(ret); 579 BUG_ON(ret);
600 *hint_byte = old_disk_bytenr; 580 *hint_byte = old_disk_bytenr;
601 } 581 }
@@ -664,12 +644,11 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
664 u64 bytenr; 644 u64 bytenr;
665 u64 num_bytes; 645 u64 num_bytes;
666 u64 extent_end; 646 u64 extent_end;
667 u64 extent_offset; 647 u64 orig_offset;
668 u64 other_start; 648 u64 other_start;
669 u64 other_end; 649 u64 other_end;
670 u64 split = start; 650 u64 split = start;
671 u64 locked_end = end; 651 u64 locked_end = end;
672 u64 orig_parent;
673 int extent_type; 652 int extent_type;
674 int split_end = 1; 653 int split_end = 1;
675 int ret; 654 int ret;
@@ -703,7 +682,7 @@ again:
703 682
704 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 683 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
705 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 684 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
706 extent_offset = btrfs_file_extent_offset(leaf, fi); 685 orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
707 686
708 if (key.offset == start) 687 if (key.offset == start)
709 split = end; 688 split = end;
@@ -711,8 +690,6 @@ again:
711 if (key.offset == start && extent_end == end) { 690 if (key.offset == start && extent_end == end) {
712 int del_nr = 0; 691 int del_nr = 0;
713 int del_slot = 0; 692 int del_slot = 0;
714 u64 leaf_owner = btrfs_header_owner(leaf);
715 u64 leaf_gen = btrfs_header_generation(leaf);
716 other_start = end; 693 other_start = end;
717 other_end = 0; 694 other_end = 0;
718 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, 695 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
@@ -721,8 +698,8 @@ again:
721 del_slot = path->slots[0] + 1; 698 del_slot = path->slots[0] + 1;
722 del_nr++; 699 del_nr++;
723 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 700 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
724 leaf->start, leaf_owner, 701 0, root->root_key.objectid,
725 leaf_gen, inode->i_ino, 0); 702 inode->i_ino, orig_offset);
726 BUG_ON(ret); 703 BUG_ON(ret);
727 } 704 }
728 other_start = 0; 705 other_start = 0;
@@ -733,8 +710,8 @@ again:
733 del_slot = path->slots[0]; 710 del_slot = path->slots[0];
734 del_nr++; 711 del_nr++;
735 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 712 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
736 leaf->start, leaf_owner, 713 0, root->root_key.objectid,
737 leaf_gen, inode->i_ino, 0); 714 inode->i_ino, orig_offset);
738 BUG_ON(ret); 715 BUG_ON(ret);
739 } 716 }
740 split_end = 0; 717 split_end = 0;
@@ -768,13 +745,12 @@ again:
768 locked_end = extent_end; 745 locked_end = extent_end;
769 } 746 }
770 btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); 747 btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
771 extent_offset += split - key.offset;
772 } else { 748 } else {
773 BUG_ON(key.offset != start); 749 BUG_ON(key.offset != start);
774 btrfs_set_file_extent_offset(leaf, fi, extent_offset +
775 split - key.offset);
776 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
777 key.offset = split; 750 key.offset = split;
751 btrfs_set_file_extent_offset(leaf, fi, key.offset -
752 orig_offset);
753 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
778 btrfs_set_item_key_safe(trans, root, path, &key); 754 btrfs_set_item_key_safe(trans, root, path, &key);
779 extent_end = split; 755 extent_end = split;
780 } 756 }
@@ -793,7 +769,8 @@ again:
793 struct btrfs_file_extent_item); 769 struct btrfs_file_extent_item);
794 key.offset = split; 770 key.offset = split;
795 btrfs_set_item_key_safe(trans, root, path, &key); 771 btrfs_set_item_key_safe(trans, root, path, &key);
796 btrfs_set_file_extent_offset(leaf, fi, extent_offset); 772 btrfs_set_file_extent_offset(leaf, fi, key.offset -
773 orig_offset);
797 btrfs_set_file_extent_num_bytes(leaf, fi, 774 btrfs_set_file_extent_num_bytes(leaf, fi,
798 other_end - split); 775 other_end - split);
799 goto done; 776 goto done;
@@ -815,10 +792,9 @@ again:
815 792
816 btrfs_mark_buffer_dirty(leaf); 793 btrfs_mark_buffer_dirty(leaf);
817 794
818 orig_parent = leaf->start; 795 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
819 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 796 root->root_key.objectid,
820 orig_parent, root->root_key.objectid, 797 inode->i_ino, orig_offset);
821 trans->transid, inode->i_ino);
822 BUG_ON(ret); 798 BUG_ON(ret);
823 btrfs_release_path(root, path); 799 btrfs_release_path(root, path);
824 800
@@ -833,20 +809,12 @@ again:
833 btrfs_set_file_extent_type(leaf, fi, extent_type); 809 btrfs_set_file_extent_type(leaf, fi, extent_type);
834 btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); 810 btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
835 btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); 811 btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
836 btrfs_set_file_extent_offset(leaf, fi, extent_offset); 812 btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset);
837 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); 813 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
838 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 814 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
839 btrfs_set_file_extent_compression(leaf, fi, 0); 815 btrfs_set_file_extent_compression(leaf, fi, 0);
840 btrfs_set_file_extent_encryption(leaf, fi, 0); 816 btrfs_set_file_extent_encryption(leaf, fi, 0);
841 btrfs_set_file_extent_other_encoding(leaf, fi, 0); 817 btrfs_set_file_extent_other_encoding(leaf, fi, 0);
842
843 if (orig_parent != leaf->start) {
844 ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
845 orig_parent, leaf->start,
846 root->root_key.objectid,
847 trans->transid, inode->i_ino);
848 BUG_ON(ret);
849 }
850done: 818done:
851 btrfs_mark_buffer_dirty(leaf); 819 btrfs_mark_buffer_dirty(leaf);
852 820
@@ -1189,6 +1157,8 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1189 btrfs_wait_ordered_range(inode, 0, (u64)-1); 1157 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1190 root->log_batch++; 1158 root->log_batch++;
1191 1159
1160 if (datasync && !(inode->i_state & I_DIRTY_PAGES))
1161 goto out;
1192 /* 1162 /*
1193 * ok we haven't committed the transaction yet, lets do a commit 1163 * ok we haven't committed the transaction yet, lets do a commit
1194 */ 1164 */
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 0bc93657b460..4538e48581a5 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -579,6 +579,7 @@ out:
579 * it returns -enospc 579 * it returns -enospc
580 */ 580 */
581int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, 581int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
582 struct btrfs_root *root,
582 struct btrfs_block_group_cache *block_group, 583 struct btrfs_block_group_cache *block_group,
583 struct btrfs_free_cluster *cluster, 584 struct btrfs_free_cluster *cluster,
584 u64 offset, u64 bytes, u64 empty_size) 585 u64 offset, u64 bytes, u64 empty_size)
@@ -595,7 +596,9 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
595 int ret; 596 int ret;
596 597
597 /* for metadata, allow allocates with more holes */ 598 /* for metadata, allow allocates with more holes */
598 if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 599 if (btrfs_test_opt(root, SSD_SPREAD)) {
600 min_bytes = bytes + empty_size;
601 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
599 /* 602 /*
600 * we want to do larger allocations when we are 603 * we want to do larger allocations when we are
601 * flushing out the delayed refs, it helps prevent 604 * flushing out the delayed refs, it helps prevent
@@ -645,14 +648,15 @@ again:
645 * we haven't filled the empty size and the window is 648 * we haven't filled the empty size and the window is
646 * very large. reset and try again 649 * very large. reset and try again
647 */ 650 */
648 if (next->offset - window_start > (bytes + empty_size) * 2) { 651 if (next->offset - (last->offset + last->bytes) > 128 * 1024 ||
652 next->offset - window_start > (bytes + empty_size) * 2) {
649 entry = next; 653 entry = next;
650 window_start = entry->offset; 654 window_start = entry->offset;
651 window_free = entry->bytes; 655 window_free = entry->bytes;
652 last = entry; 656 last = entry;
653 max_extent = 0; 657 max_extent = 0;
654 total_retries++; 658 total_retries++;
655 if (total_retries % 256 == 0) { 659 if (total_retries % 64 == 0) {
656 if (min_bytes >= (bytes + empty_size)) { 660 if (min_bytes >= (bytes + empty_size)) {
657 ret = -ENOSPC; 661 ret = -ENOSPC;
658 goto out; 662 goto out;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index ab0bdc0a63ce..266fb8764054 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -31,6 +31,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
31 u64 bytes); 31 u64 bytes);
32u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); 32u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
33int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, 33int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root,
34 struct btrfs_block_group_cache *block_group, 35 struct btrfs_block_group_cache *block_group,
35 struct btrfs_free_cluster *cluster, 36 struct btrfs_free_cluster *cluster,
36 u64 offset, u64 bytes, u64 empty_size); 37 u64 offset, u64 bytes, u64 empty_size);
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index 2a020b276768..db2ff9773b99 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -19,9 +19,9 @@
19#ifndef __HASH__ 19#ifndef __HASH__
20#define __HASH__ 20#define __HASH__
21 21
22#include "crc32c.h" 22#include <linux/crc32c.h>
23static inline u64 btrfs_name_hash(const char *name, int len) 23static inline u64 btrfs_name_hash(const char *name, int len)
24{ 24{
25 return btrfs_crc32c((u32)~1, name, len); 25 return crc32c((u32)~1, name, len);
26} 26}
27#endif 27#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1c8b0190d031..8612b3a09811 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -48,7 +48,6 @@
48#include "ordered-data.h" 48#include "ordered-data.h"
49#include "xattr.h" 49#include "xattr.h"
50#include "tree-log.h" 50#include "tree-log.h"
51#include "ref-cache.h"
52#include "compression.h" 51#include "compression.h"
53#include "locking.h" 52#include "locking.h"
54 53
@@ -369,7 +368,7 @@ again:
369 * inode has not been flagged as nocompress. This flag can 368 * inode has not been flagged as nocompress. This flag can
370 * change at any time if we discover bad compression ratios. 369 * change at any time if we discover bad compression ratios.
371 */ 370 */
372 if (!btrfs_test_flag(inode, NOCOMPRESS) && 371 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
373 btrfs_test_opt(root, COMPRESS)) { 372 btrfs_test_opt(root, COMPRESS)) {
374 WARN_ON(pages); 373 WARN_ON(pages);
375 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 374 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
@@ -470,7 +469,7 @@ again:
470 nr_pages_ret = 0; 469 nr_pages_ret = 0;
471 470
472 /* flag the file so we don't compress in the future */ 471 /* flag the file so we don't compress in the future */
473 btrfs_set_flag(inode, NOCOMPRESS); 472 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
474 } 473 }
475 if (will_compress) { 474 if (will_compress) {
476 *num_added += 1; 475 *num_added += 1;
@@ -863,7 +862,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
863 async_cow->locked_page = locked_page; 862 async_cow->locked_page = locked_page;
864 async_cow->start = start; 863 async_cow->start = start;
865 864
866 if (btrfs_test_flag(inode, NOCOMPRESS)) 865 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
867 cur_end = end; 866 cur_end = end;
868 else 867 else
869 cur_end = min(end, start + 512 * 1024 - 1); 868 cur_end = min(end, start + 512 * 1024 - 1);
@@ -944,6 +943,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
944 u64 cow_start; 943 u64 cow_start;
945 u64 cur_offset; 944 u64 cur_offset;
946 u64 extent_end; 945 u64 extent_end;
946 u64 extent_offset;
947 u64 disk_bytenr; 947 u64 disk_bytenr;
948 u64 num_bytes; 948 u64 num_bytes;
949 int extent_type; 949 int extent_type;
@@ -1005,6 +1005,7 @@ next_slot:
1005 if (extent_type == BTRFS_FILE_EXTENT_REG || 1005 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1006 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 1006 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1007 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1007 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1008 extent_offset = btrfs_file_extent_offset(leaf, fi);
1008 extent_end = found_key.offset + 1009 extent_end = found_key.offset +
1009 btrfs_file_extent_num_bytes(leaf, fi); 1010 btrfs_file_extent_num_bytes(leaf, fi);
1010 if (extent_end <= start) { 1011 if (extent_end <= start) {
@@ -1022,9 +1023,10 @@ next_slot:
1022 if (btrfs_extent_readonly(root, disk_bytenr)) 1023 if (btrfs_extent_readonly(root, disk_bytenr))
1023 goto out_check; 1024 goto out_check;
1024 if (btrfs_cross_ref_exist(trans, root, inode->i_ino, 1025 if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
1025 disk_bytenr)) 1026 found_key.offset -
1027 extent_offset, disk_bytenr))
1026 goto out_check; 1028 goto out_check;
1027 disk_bytenr += btrfs_file_extent_offset(leaf, fi); 1029 disk_bytenr += extent_offset;
1028 disk_bytenr += cur_offset - found_key.offset; 1030 disk_bytenr += cur_offset - found_key.offset;
1029 num_bytes = min(end + 1, extent_end) - cur_offset; 1031 num_bytes = min(end + 1, extent_end) - cur_offset;
1030 /* 1032 /*
@@ -1131,10 +1133,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1131 int ret; 1133 int ret;
1132 struct btrfs_root *root = BTRFS_I(inode)->root; 1134 struct btrfs_root *root = BTRFS_I(inode)->root;
1133 1135
1134 if (btrfs_test_flag(inode, NODATACOW)) 1136 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)
1135 ret = run_delalloc_nocow(inode, locked_page, start, end, 1137 ret = run_delalloc_nocow(inode, locked_page, start, end,
1136 page_started, 1, nr_written); 1138 page_started, 1, nr_written);
1137 else if (btrfs_test_flag(inode, PREALLOC)) 1139 else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
1138 ret = run_delalloc_nocow(inode, locked_page, start, end, 1140 ret = run_delalloc_nocow(inode, locked_page, start, end,
1139 page_started, 0, nr_written); 1141 page_started, 0, nr_written);
1140 else if (!btrfs_test_opt(root, COMPRESS)) 1142 else if (!btrfs_test_opt(root, COMPRESS))
@@ -1288,7 +1290,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1288 int ret = 0; 1290 int ret = 0;
1289 int skip_sum; 1291 int skip_sum;
1290 1292
1291 skip_sum = btrfs_test_flag(inode, NODATASUM); 1293 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1292 1294
1293 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 1295 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1294 BUG_ON(ret); 1296 BUG_ON(ret);
@@ -1489,9 +1491,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1489 ins.objectid = disk_bytenr; 1491 ins.objectid = disk_bytenr;
1490 ins.offset = disk_num_bytes; 1492 ins.offset = disk_num_bytes;
1491 ins.type = BTRFS_EXTENT_ITEM_KEY; 1493 ins.type = BTRFS_EXTENT_ITEM_KEY;
1492 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start, 1494 ret = btrfs_alloc_reserved_file_extent(trans, root,
1493 root->root_key.objectid, 1495 root->root_key.objectid,
1494 trans->transid, inode->i_ino, &ins); 1496 inode->i_ino, file_pos, &ins);
1495 BUG_ON(ret); 1497 BUG_ON(ret);
1496 btrfs_free_path(path); 1498 btrfs_free_path(path);
1497 1499
@@ -1788,7 +1790,8 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1788 ClearPageChecked(page); 1790 ClearPageChecked(page);
1789 goto good; 1791 goto good;
1790 } 1792 }
1791 if (btrfs_test_flag(inode, NODATASUM)) 1793
1794 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
1792 return 0; 1795 return 0;
1793 1796
1794 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 1797 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
@@ -1956,23 +1959,13 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
1956 * crossing root thing. we store the inode number in the 1959 * crossing root thing. we store the inode number in the
1957 * offset of the orphan item. 1960 * offset of the orphan item.
1958 */ 1961 */
1959 inode = btrfs_iget_locked(root->fs_info->sb, 1962 found_key.objectid = found_key.offset;
1960 found_key.offset, root); 1963 found_key.type = BTRFS_INODE_ITEM_KEY;
1961 if (!inode) 1964 found_key.offset = 0;
1965 inode = btrfs_iget(root->fs_info->sb, &found_key, root);
1966 if (IS_ERR(inode))
1962 break; 1967 break;
1963 1968
1964 if (inode->i_state & I_NEW) {
1965 BTRFS_I(inode)->root = root;
1966
1967 /* have to set the location manually */
1968 BTRFS_I(inode)->location.objectid = inode->i_ino;
1969 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
1970 BTRFS_I(inode)->location.offset = 0;
1971
1972 btrfs_read_locked_inode(inode);
1973 unlock_new_inode(inode);
1974 }
1975
1976 /* 1969 /*
1977 * add this inode to the orphan list so btrfs_orphan_del does 1970 * add this inode to the orphan list so btrfs_orphan_del does
1978 * the proper thing when we hit it 1971 * the proper thing when we hit it
@@ -2069,7 +2062,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
2069/* 2062/*
2070 * read an inode from the btree into the in-memory inode 2063 * read an inode from the btree into the in-memory inode
2071 */ 2064 */
2072void btrfs_read_locked_inode(struct inode *inode) 2065static void btrfs_read_locked_inode(struct inode *inode)
2073{ 2066{
2074 struct btrfs_path *path; 2067 struct btrfs_path *path;
2075 struct extent_buffer *leaf; 2068 struct extent_buffer *leaf;
@@ -2164,6 +2157,8 @@ void btrfs_read_locked_inode(struct inode *inode)
2164 init_special_inode(inode, inode->i_mode, rdev); 2157 init_special_inode(inode, inode->i_mode, rdev);
2165 break; 2158 break;
2166 } 2159 }
2160
2161 btrfs_update_iflags(inode);
2167 return; 2162 return;
2168 2163
2169make_bad: 2164make_bad:
@@ -2327,7 +2322,6 @@ err:
2327 btrfs_update_inode(trans, root, dir); 2322 btrfs_update_inode(trans, root, dir);
2328 btrfs_drop_nlink(inode); 2323 btrfs_drop_nlink(inode);
2329 ret = btrfs_update_inode(trans, root, inode); 2324 ret = btrfs_update_inode(trans, root, inode);
2330 dir->i_sb->s_dirt = 1;
2331out: 2325out:
2332 return ret; 2326 return ret;
2333} 2327}
@@ -2599,9 +2593,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2599 struct btrfs_file_extent_item *fi; 2593 struct btrfs_file_extent_item *fi;
2600 u64 extent_start = 0; 2594 u64 extent_start = 0;
2601 u64 extent_num_bytes = 0; 2595 u64 extent_num_bytes = 0;
2596 u64 extent_offset = 0;
2602 u64 item_end = 0; 2597 u64 item_end = 0;
2603 u64 root_gen = 0;
2604 u64 root_owner = 0;
2605 int found_extent; 2598 int found_extent;
2606 int del_item; 2599 int del_item;
2607 int pending_del_nr = 0; 2600 int pending_del_nr = 0;
@@ -2716,6 +2709,9 @@ search_again:
2716 extent_num_bytes = 2709 extent_num_bytes =
2717 btrfs_file_extent_disk_num_bytes(leaf, 2710 btrfs_file_extent_disk_num_bytes(leaf,
2718 fi); 2711 fi);
2712 extent_offset = found_key.offset -
2713 btrfs_file_extent_offset(leaf, fi);
2714
2719 /* FIXME blocksize != 4096 */ 2715 /* FIXME blocksize != 4096 */
2720 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 2716 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
2721 if (extent_start != 0) { 2717 if (extent_start != 0) {
@@ -2723,8 +2719,6 @@ search_again:
2723 if (root->ref_cows) 2719 if (root->ref_cows)
2724 inode_sub_bytes(inode, num_dec); 2720 inode_sub_bytes(inode, num_dec);
2725 } 2721 }
2726 root_gen = btrfs_header_generation(leaf);
2727 root_owner = btrfs_header_owner(leaf);
2728 } 2722 }
2729 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 2723 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2730 /* 2724 /*
@@ -2768,12 +2762,12 @@ delete:
2768 } else { 2762 } else {
2769 break; 2763 break;
2770 } 2764 }
2771 if (found_extent) { 2765 if (found_extent && root->ref_cows) {
2772 btrfs_set_path_blocking(path); 2766 btrfs_set_path_blocking(path);
2773 ret = btrfs_free_extent(trans, root, extent_start, 2767 ret = btrfs_free_extent(trans, root, extent_start,
2774 extent_num_bytes, 2768 extent_num_bytes, 0,
2775 leaf->start, root_owner, 2769 btrfs_header_owner(leaf),
2776 root_gen, inode->i_ino, 0); 2770 inode->i_ino, extent_offset);
2777 BUG_ON(ret); 2771 BUG_ON(ret);
2778 } 2772 }
2779next: 2773next:
@@ -2811,7 +2805,6 @@ error:
2811 pending_del_nr); 2805 pending_del_nr);
2812 } 2806 }
2813 btrfs_free_path(path); 2807 btrfs_free_path(path);
2814 inode->i_sb->s_dirt = 1;
2815 return ret; 2808 return ret;
2816} 2809}
2817 2810
@@ -3105,6 +3098,45 @@ static int fixup_tree_root_location(struct btrfs_root *root,
3105 return 0; 3098 return 0;
3106} 3099}
3107 3100
3101static void inode_tree_add(struct inode *inode)
3102{
3103 struct btrfs_root *root = BTRFS_I(inode)->root;
3104 struct btrfs_inode *entry;
3105 struct rb_node **p = &root->inode_tree.rb_node;
3106 struct rb_node *parent = NULL;
3107
3108 spin_lock(&root->inode_lock);
3109 while (*p) {
3110 parent = *p;
3111 entry = rb_entry(parent, struct btrfs_inode, rb_node);
3112
3113 if (inode->i_ino < entry->vfs_inode.i_ino)
3114 p = &(*p)->rb_left;
3115 else if (inode->i_ino > entry->vfs_inode.i_ino)
3116 p = &(*p)->rb_right;
3117 else {
3118 WARN_ON(!(entry->vfs_inode.i_state &
3119 (I_WILL_FREE | I_FREEING | I_CLEAR)));
3120 break;
3121 }
3122 }
3123 rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
3124 rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree);
3125 spin_unlock(&root->inode_lock);
3126}
3127
3128static void inode_tree_del(struct inode *inode)
3129{
3130 struct btrfs_root *root = BTRFS_I(inode)->root;
3131
3132 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
3133 spin_lock(&root->inode_lock);
3134 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
3135 spin_unlock(&root->inode_lock);
3136 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3137 }
3138}
3139
3108static noinline void init_btrfs_i(struct inode *inode) 3140static noinline void init_btrfs_i(struct inode *inode)
3109{ 3141{
3110 struct btrfs_inode *bi = BTRFS_I(inode); 3142 struct btrfs_inode *bi = BTRFS_I(inode);
@@ -3130,6 +3162,7 @@ static noinline void init_btrfs_i(struct inode *inode)
3130 inode->i_mapping, GFP_NOFS); 3162 inode->i_mapping, GFP_NOFS);
3131 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); 3163 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3132 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); 3164 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3165 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3133 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); 3166 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3134 mutex_init(&BTRFS_I(inode)->extent_mutex); 3167 mutex_init(&BTRFS_I(inode)->extent_mutex);
3135 mutex_init(&BTRFS_I(inode)->log_mutex); 3168 mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -3152,26 +3185,9 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
3152 args->root == BTRFS_I(inode)->root; 3185 args->root == BTRFS_I(inode)->root;
3153} 3186}
3154 3187
3155struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, 3188static struct inode *btrfs_iget_locked(struct super_block *s,
3156 struct btrfs_root *root, int wait) 3189 u64 objectid,
3157{ 3190 struct btrfs_root *root)
3158 struct inode *inode;
3159 struct btrfs_iget_args args;
3160 args.ino = objectid;
3161 args.root = root;
3162
3163 if (wait) {
3164 inode = ilookup5(s, objectid, btrfs_find_actor,
3165 (void *)&args);
3166 } else {
3167 inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
3168 (void *)&args);
3169 }
3170 return inode;
3171}
3172
3173struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
3174 struct btrfs_root *root)
3175{ 3191{
3176 struct inode *inode; 3192 struct inode *inode;
3177 struct btrfs_iget_args args; 3193 struct btrfs_iget_args args;
@@ -3188,24 +3204,21 @@ struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
3188 * Returns in *is_new if the inode was read from disk 3204 * Returns in *is_new if the inode was read from disk
3189 */ 3205 */
3190struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 3206struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3191 struct btrfs_root *root, int *is_new) 3207 struct btrfs_root *root)
3192{ 3208{
3193 struct inode *inode; 3209 struct inode *inode;
3194 3210
3195 inode = btrfs_iget_locked(s, location->objectid, root); 3211 inode = btrfs_iget_locked(s, location->objectid, root);
3196 if (!inode) 3212 if (!inode)
3197 return ERR_PTR(-EACCES); 3213 return ERR_PTR(-ENOMEM);
3198 3214
3199 if (inode->i_state & I_NEW) { 3215 if (inode->i_state & I_NEW) {
3200 BTRFS_I(inode)->root = root; 3216 BTRFS_I(inode)->root = root;
3201 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 3217 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
3202 btrfs_read_locked_inode(inode); 3218 btrfs_read_locked_inode(inode);
3219
3220 inode_tree_add(inode);
3203 unlock_new_inode(inode); 3221 unlock_new_inode(inode);
3204 if (is_new)
3205 *is_new = 1;
3206 } else {
3207 if (is_new)
3208 *is_new = 0;
3209 } 3222 }
3210 3223
3211 return inode; 3224 return inode;
@@ -3218,7 +3231,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3218 struct btrfs_root *root = bi->root; 3231 struct btrfs_root *root = bi->root;
3219 struct btrfs_root *sub_root = root; 3232 struct btrfs_root *sub_root = root;
3220 struct btrfs_key location; 3233 struct btrfs_key location;
3221 int ret, new; 3234 int ret;
3222 3235
3223 if (dentry->d_name.len > BTRFS_NAME_LEN) 3236 if (dentry->d_name.len > BTRFS_NAME_LEN)
3224 return ERR_PTR(-ENAMETOOLONG); 3237 return ERR_PTR(-ENAMETOOLONG);
@@ -3236,7 +3249,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3236 return ERR_PTR(ret); 3249 return ERR_PTR(ret);
3237 if (ret > 0) 3250 if (ret > 0)
3238 return ERR_PTR(-ENOENT); 3251 return ERR_PTR(-ENOENT);
3239 inode = btrfs_iget(dir->i_sb, &location, sub_root, &new); 3252 inode = btrfs_iget(dir->i_sb, &location, sub_root);
3240 if (IS_ERR(inode)) 3253 if (IS_ERR(inode))
3241 return ERR_CAST(inode); 3254 return ERR_CAST(inode);
3242 } 3255 }
@@ -3574,9 +3587,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3574 btrfs_find_block_group(root, 0, alloc_hint, owner); 3587 btrfs_find_block_group(root, 0, alloc_hint, owner);
3575 if ((mode & S_IFREG)) { 3588 if ((mode & S_IFREG)) {
3576 if (btrfs_test_opt(root, NODATASUM)) 3589 if (btrfs_test_opt(root, NODATASUM))
3577 btrfs_set_flag(inode, NODATASUM); 3590 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
3578 if (btrfs_test_opt(root, NODATACOW)) 3591 if (btrfs_test_opt(root, NODATACOW))
3579 btrfs_set_flag(inode, NODATACOW); 3592 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
3580 } 3593 }
3581 3594
3582 key[0].objectid = objectid; 3595 key[0].objectid = objectid;
@@ -3630,7 +3643,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3630 location->offset = 0; 3643 location->offset = 0;
3631 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); 3644 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
3632 3645
3646 btrfs_inherit_iflags(inode, dir);
3647
3633 insert_inode_hash(inode); 3648 insert_inode_hash(inode);
3649 inode_tree_add(inode);
3634 return inode; 3650 return inode;
3635fail: 3651fail:
3636 if (dir) 3652 if (dir)
@@ -3750,7 +3766,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3750 init_special_inode(inode, inode->i_mode, rdev); 3766 init_special_inode(inode, inode->i_mode, rdev);
3751 btrfs_update_inode(trans, root, inode); 3767 btrfs_update_inode(trans, root, inode);
3752 } 3768 }
3753 dir->i_sb->s_dirt = 1;
3754 btrfs_update_inode_block_group(trans, inode); 3769 btrfs_update_inode_block_group(trans, inode);
3755 btrfs_update_inode_block_group(trans, dir); 3770 btrfs_update_inode_block_group(trans, dir);
3756out_unlock: 3771out_unlock:
@@ -3815,7 +3830,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
3815 inode->i_op = &btrfs_file_inode_operations; 3830 inode->i_op = &btrfs_file_inode_operations;
3816 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 3831 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3817 } 3832 }
3818 dir->i_sb->s_dirt = 1;
3819 btrfs_update_inode_block_group(trans, inode); 3833 btrfs_update_inode_block_group(trans, inode);
3820 btrfs_update_inode_block_group(trans, dir); 3834 btrfs_update_inode_block_group(trans, dir);
3821out_unlock: 3835out_unlock:
@@ -3862,7 +3876,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3862 if (err) 3876 if (err)
3863 drop_inode = 1; 3877 drop_inode = 1;
3864 3878
3865 dir->i_sb->s_dirt = 1;
3866 btrfs_update_inode_block_group(trans, dir); 3879 btrfs_update_inode_block_group(trans, dir);
3867 err = btrfs_update_inode(trans, root, inode); 3880 err = btrfs_update_inode(trans, root, inode);
3868 3881
@@ -3944,7 +3957,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3944 3957
3945 d_instantiate(dentry, inode); 3958 d_instantiate(dentry, inode);
3946 drop_on_err = 0; 3959 drop_on_err = 0;
3947 dir->i_sb->s_dirt = 1;
3948 btrfs_update_inode_block_group(trans, inode); 3960 btrfs_update_inode_block_group(trans, inode);
3949 btrfs_update_inode_block_group(trans, dir); 3961 btrfs_update_inode_block_group(trans, dir);
3950 3962
@@ -4683,6 +4695,7 @@ void btrfs_destroy_inode(struct inode *inode)
4683 btrfs_put_ordered_extent(ordered); 4695 btrfs_put_ordered_extent(ordered);
4684 } 4696 }
4685 } 4697 }
4698 inode_tree_del(inode);
4686 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 4699 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
4687 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 4700 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
4688} 4701}
@@ -4972,7 +4985,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4972 inode->i_op = &btrfs_file_inode_operations; 4985 inode->i_op = &btrfs_file_inode_operations;
4973 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 4986 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4974 } 4987 }
4975 dir->i_sb->s_dirt = 1;
4976 btrfs_update_inode_block_group(trans, inode); 4988 btrfs_update_inode_block_group(trans, inode);
4977 btrfs_update_inode_block_group(trans, dir); 4989 btrfs_update_inode_block_group(trans, dir);
4978 if (drop_inode) 4990 if (drop_inode)
@@ -5061,7 +5073,7 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
5061out: 5073out:
5062 if (cur_offset > start) { 5074 if (cur_offset > start) {
5063 inode->i_ctime = CURRENT_TIME; 5075 inode->i_ctime = CURRENT_TIME;
5064 btrfs_set_flag(inode, PREALLOC); 5076 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
5065 if (!(mode & FALLOC_FL_KEEP_SIZE) && 5077 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
5066 cur_offset > i_size_read(inode)) 5078 cur_offset > i_size_read(inode))
5067 btrfs_i_size_write(inode, cur_offset); 5079 btrfs_i_size_write(inode, cur_offset);
@@ -5182,7 +5194,7 @@ static int btrfs_set_page_dirty(struct page *page)
5182 5194
5183static int btrfs_permission(struct inode *inode, int mask) 5195static int btrfs_permission(struct inode *inode, int mask)
5184{ 5196{
5185 if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE)) 5197 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
5186 return -EACCES; 5198 return -EACCES;
5187 return generic_permission(inode, mask, btrfs_check_acl); 5199 return generic_permission(inode, mask, btrfs_check_acl);
5188} 5200}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2624b53ea783..eff18f5b5362 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -50,7 +50,177 @@
50#include "volumes.h" 50#include "volumes.h"
51#include "locking.h" 51#include "locking.h"
52 52
53/* Mask out flags that are inappropriate for the given type of inode. */
54static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
55{
56 if (S_ISDIR(mode))
57 return flags;
58 else if (S_ISREG(mode))
59 return flags & ~FS_DIRSYNC_FL;
60 else
61 return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
62}
63
64/*
65 * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl.
66 */
67static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
68{
69 unsigned int iflags = 0;
70
71 if (flags & BTRFS_INODE_SYNC)
72 iflags |= FS_SYNC_FL;
73 if (flags & BTRFS_INODE_IMMUTABLE)
74 iflags |= FS_IMMUTABLE_FL;
75 if (flags & BTRFS_INODE_APPEND)
76 iflags |= FS_APPEND_FL;
77 if (flags & BTRFS_INODE_NODUMP)
78 iflags |= FS_NODUMP_FL;
79 if (flags & BTRFS_INODE_NOATIME)
80 iflags |= FS_NOATIME_FL;
81 if (flags & BTRFS_INODE_DIRSYNC)
82 iflags |= FS_DIRSYNC_FL;
83
84 return iflags;
85}
86
87/*
88 * Update inode->i_flags based on the btrfs internal flags.
89 */
90void btrfs_update_iflags(struct inode *inode)
91{
92 struct btrfs_inode *ip = BTRFS_I(inode);
93
94 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
95
96 if (ip->flags & BTRFS_INODE_SYNC)
97 inode->i_flags |= S_SYNC;
98 if (ip->flags & BTRFS_INODE_IMMUTABLE)
99 inode->i_flags |= S_IMMUTABLE;
100 if (ip->flags & BTRFS_INODE_APPEND)
101 inode->i_flags |= S_APPEND;
102 if (ip->flags & BTRFS_INODE_NOATIME)
103 inode->i_flags |= S_NOATIME;
104 if (ip->flags & BTRFS_INODE_DIRSYNC)
105 inode->i_flags |= S_DIRSYNC;
106}
107
108/*
109 * Inherit flags from the parent inode.
110 *
111 * Unlike extN we don't have any flags we don't want to inherit currently.
112 */
113void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
114{
115 unsigned int flags;
116
117 if (!dir)
118 return;
119
120 flags = BTRFS_I(dir)->flags;
121
122 if (S_ISREG(inode->i_mode))
123 flags &= ~BTRFS_INODE_DIRSYNC;
124 else if (!S_ISDIR(inode->i_mode))
125 flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME);
126
127 BTRFS_I(inode)->flags = flags;
128 btrfs_update_iflags(inode);
129}
130
131static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
132{
133 struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode);
134 unsigned int flags = btrfs_flags_to_ioctl(ip->flags);
135
136 if (copy_to_user(arg, &flags, sizeof(flags)))
137 return -EFAULT;
138 return 0;
139}
140
141static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
142{
143 struct inode *inode = file->f_path.dentry->d_inode;
144 struct btrfs_inode *ip = BTRFS_I(inode);
145 struct btrfs_root *root = ip->root;
146 struct btrfs_trans_handle *trans;
147 unsigned int flags, oldflags;
148 int ret;
149
150 if (copy_from_user(&flags, arg, sizeof(flags)))
151 return -EFAULT;
152
153 if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
154 FS_NOATIME_FL | FS_NODUMP_FL | \
155 FS_SYNC_FL | FS_DIRSYNC_FL))
156 return -EOPNOTSUPP;
53 157
158 if (!is_owner_or_cap(inode))
159 return -EACCES;
160
161 mutex_lock(&inode->i_mutex);
162
163 flags = btrfs_mask_flags(inode->i_mode, flags);
164 oldflags = btrfs_flags_to_ioctl(ip->flags);
165 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
166 if (!capable(CAP_LINUX_IMMUTABLE)) {
167 ret = -EPERM;
168 goto out_unlock;
169 }
170 }
171
172 ret = mnt_want_write(file->f_path.mnt);
173 if (ret)
174 goto out_unlock;
175
176 if (flags & FS_SYNC_FL)
177 ip->flags |= BTRFS_INODE_SYNC;
178 else
179 ip->flags &= ~BTRFS_INODE_SYNC;
180 if (flags & FS_IMMUTABLE_FL)
181 ip->flags |= BTRFS_INODE_IMMUTABLE;
182 else
183 ip->flags &= ~BTRFS_INODE_IMMUTABLE;
184 if (flags & FS_APPEND_FL)
185 ip->flags |= BTRFS_INODE_APPEND;
186 else
187 ip->flags &= ~BTRFS_INODE_APPEND;
188 if (flags & FS_NODUMP_FL)
189 ip->flags |= BTRFS_INODE_NODUMP;
190 else
191 ip->flags &= ~BTRFS_INODE_NODUMP;
192 if (flags & FS_NOATIME_FL)
193 ip->flags |= BTRFS_INODE_NOATIME;
194 else
195 ip->flags &= ~BTRFS_INODE_NOATIME;
196 if (flags & FS_DIRSYNC_FL)
197 ip->flags |= BTRFS_INODE_DIRSYNC;
198 else
199 ip->flags &= ~BTRFS_INODE_DIRSYNC;
200
201
202 trans = btrfs_join_transaction(root, 1);
203 BUG_ON(!trans);
204
205 ret = btrfs_update_inode(trans, root, inode);
206 BUG_ON(ret);
207
208 btrfs_update_iflags(inode);
209 inode->i_ctime = CURRENT_TIME;
210 btrfs_end_transaction(trans, root);
211
212 mnt_drop_write(file->f_path.mnt);
213 out_unlock:
214 mutex_unlock(&inode->i_mutex);
215 return 0;
216}
217
218static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
219{
220 struct inode *inode = file->f_path.dentry->d_inode;
221
222 return put_user(inode->i_generation, arg);
223}
54 224
55static noinline int create_subvol(struct btrfs_root *root, 225static noinline int create_subvol(struct btrfs_root *root,
56 struct dentry *dentry, 226 struct dentry *dentry,
@@ -82,22 +252,25 @@ static noinline int create_subvol(struct btrfs_root *root,
82 if (ret) 252 if (ret)
83 goto fail; 253 goto fail;
84 254
85 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 255 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
86 objectid, trans->transid, 0, 0, 0); 256 0, objectid, NULL, 0, 0, 0);
87 if (IS_ERR(leaf)) { 257 if (IS_ERR(leaf)) {
88 ret = PTR_ERR(leaf); 258 ret = PTR_ERR(leaf);
89 goto fail; 259 goto fail;
90 } 260 }
91 261
92 btrfs_set_header_nritems(leaf, 0); 262 memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
93 btrfs_set_header_level(leaf, 0);
94 btrfs_set_header_bytenr(leaf, leaf->start); 263 btrfs_set_header_bytenr(leaf, leaf->start);
95 btrfs_set_header_generation(leaf, trans->transid); 264 btrfs_set_header_generation(leaf, trans->transid);
265 btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
96 btrfs_set_header_owner(leaf, objectid); 266 btrfs_set_header_owner(leaf, objectid);
97 267
98 write_extent_buffer(leaf, root->fs_info->fsid, 268 write_extent_buffer(leaf, root->fs_info->fsid,
99 (unsigned long)btrfs_header_fsid(leaf), 269 (unsigned long)btrfs_header_fsid(leaf),
100 BTRFS_FSID_SIZE); 270 BTRFS_FSID_SIZE);
271 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
272 (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
273 BTRFS_UUID_SIZE);
101 btrfs_mark_buffer_dirty(leaf); 274 btrfs_mark_buffer_dirty(leaf);
102 275
103 inode_item = &root_item.inode; 276 inode_item = &root_item.inode;
@@ -125,7 +298,7 @@ static noinline int create_subvol(struct btrfs_root *root,
125 btrfs_set_root_dirid(&root_item, new_dirid); 298 btrfs_set_root_dirid(&root_item, new_dirid);
126 299
127 key.objectid = objectid; 300 key.objectid = objectid;
128 key.offset = 1; 301 key.offset = 0;
129 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 302 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
130 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 303 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
131 &root_item); 304 &root_item);
@@ -911,10 +1084,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
911 if (disko) { 1084 if (disko) {
912 inode_add_bytes(inode, datal); 1085 inode_add_bytes(inode, datal);
913 ret = btrfs_inc_extent_ref(trans, root, 1086 ret = btrfs_inc_extent_ref(trans, root,
914 disko, diskl, leaf->start, 1087 disko, diskl, 0,
915 root->root_key.objectid, 1088 root->root_key.objectid,
916 trans->transid, 1089 inode->i_ino,
917 inode->i_ino); 1090 new_key.offset - datao);
918 BUG_ON(ret); 1091 BUG_ON(ret);
919 } 1092 }
920 } else if (type == BTRFS_FILE_EXTENT_INLINE) { 1093 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
@@ -1074,6 +1247,12 @@ long btrfs_ioctl(struct file *file, unsigned int
1074 void __user *argp = (void __user *)arg; 1247 void __user *argp = (void __user *)arg;
1075 1248
1076 switch (cmd) { 1249 switch (cmd) {
1250 case FS_IOC_GETFLAGS:
1251 return btrfs_ioctl_getflags(file, argp);
1252 case FS_IOC_SETFLAGS:
1253 return btrfs_ioctl_setflags(file, argp);
1254 case FS_IOC_GETVERSION:
1255 return btrfs_ioctl_getversion(file, argp);
1077 case BTRFS_IOC_SNAP_CREATE: 1256 case BTRFS_IOC_SNAP_CREATE:
1078 return btrfs_ioctl_snap_create(file, argp, 0); 1257 return btrfs_ioctl_snap_create(file, argp, 0);
1079 case BTRFS_IOC_SUBVOL_CREATE: 1258 case BTRFS_IOC_SUBVOL_CREATE:
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 5f8f218c1005..6d6523da0a30 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -45,22 +45,132 @@ static void print_dev_item(struct extent_buffer *eb,
45 (unsigned long long)btrfs_device_total_bytes(eb, dev_item), 45 (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
46 (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); 46 (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
47} 47}
48static void print_extent_data_ref(struct extent_buffer *eb,
49 struct btrfs_extent_data_ref *ref)
50{
51 printk(KERN_INFO "\t\textent data backref root %llu "
52 "objectid %llu offset %llu count %u\n",
53 (unsigned long long)btrfs_extent_data_ref_root(eb, ref),
54 (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref),
55 (unsigned long long)btrfs_extent_data_ref_offset(eb, ref),
56 btrfs_extent_data_ref_count(eb, ref));
57}
58
59static void print_extent_item(struct extent_buffer *eb, int slot)
60{
61 struct btrfs_extent_item *ei;
62 struct btrfs_extent_inline_ref *iref;
63 struct btrfs_extent_data_ref *dref;
64 struct btrfs_shared_data_ref *sref;
65 struct btrfs_disk_key key;
66 unsigned long end;
67 unsigned long ptr;
68 int type;
69 u32 item_size = btrfs_item_size_nr(eb, slot);
70 u64 flags;
71 u64 offset;
72
73 if (item_size < sizeof(*ei)) {
74#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
75 struct btrfs_extent_item_v0 *ei0;
76 BUG_ON(item_size != sizeof(*ei0));
77 ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0);
78 printk(KERN_INFO "\t\textent refs %u\n",
79 btrfs_extent_refs_v0(eb, ei0));
80 return;
81#else
82 BUG();
83#endif
84 }
85
86 ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
87 flags = btrfs_extent_flags(eb, ei);
88
89 printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n",
90 (unsigned long long)btrfs_extent_refs(eb, ei),
91 (unsigned long long)btrfs_extent_generation(eb, ei),
92 (unsigned long long)flags);
93
94 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
95 struct btrfs_tree_block_info *info;
96 info = (struct btrfs_tree_block_info *)(ei + 1);
97 btrfs_tree_block_key(eb, info, &key);
98 printk(KERN_INFO "\t\ttree block key (%llu %x %llu) "
99 "level %d\n",
100 (unsigned long long)btrfs_disk_key_objectid(&key),
101 key.type,
102 (unsigned long long)btrfs_disk_key_offset(&key),
103 btrfs_tree_block_level(eb, info));
104 iref = (struct btrfs_extent_inline_ref *)(info + 1);
105 } else {
106 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
107 }
108
109 ptr = (unsigned long)iref;
110 end = (unsigned long)ei + item_size;
111 while (ptr < end) {
112 iref = (struct btrfs_extent_inline_ref *)ptr;
113 type = btrfs_extent_inline_ref_type(eb, iref);
114 offset = btrfs_extent_inline_ref_offset(eb, iref);
115 switch (type) {
116 case BTRFS_TREE_BLOCK_REF_KEY:
117 printk(KERN_INFO "\t\ttree block backref "
118 "root %llu\n", (unsigned long long)offset);
119 break;
120 case BTRFS_SHARED_BLOCK_REF_KEY:
121 printk(KERN_INFO "\t\tshared block backref "
122 "parent %llu\n", (unsigned long long)offset);
123 break;
124 case BTRFS_EXTENT_DATA_REF_KEY:
125 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
126 print_extent_data_ref(eb, dref);
127 break;
128 case BTRFS_SHARED_DATA_REF_KEY:
129 sref = (struct btrfs_shared_data_ref *)(iref + 1);
130 printk(KERN_INFO "\t\tshared data backref "
131 "parent %llu count %u\n",
132 (unsigned long long)offset,
133 btrfs_shared_data_ref_count(eb, sref));
134 break;
135 default:
136 BUG();
137 }
138 ptr += btrfs_extent_inline_ref_size(type);
139 }
140 WARN_ON(ptr > end);
141}
142
143#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
144static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
145{
146 struct btrfs_extent_ref_v0 *ref0;
147
148 ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0);
149 printk("\t\textent back ref root %llu gen %llu "
150 "owner %llu num_refs %lu\n",
151 (unsigned long long)btrfs_ref_root_v0(eb, ref0),
152 (unsigned long long)btrfs_ref_generation_v0(eb, ref0),
153 (unsigned long long)btrfs_ref_objectid_v0(eb, ref0),
154 (unsigned long)btrfs_ref_count_v0(eb, ref0));
155}
156#endif
157
48void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) 158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
49{ 159{
50 int i; 160 int i;
161 u32 type;
51 u32 nr = btrfs_header_nritems(l); 162 u32 nr = btrfs_header_nritems(l);
52 struct btrfs_item *item; 163 struct btrfs_item *item;
53 struct btrfs_extent_item *ei;
54 struct btrfs_root_item *ri; 164 struct btrfs_root_item *ri;
55 struct btrfs_dir_item *di; 165 struct btrfs_dir_item *di;
56 struct btrfs_inode_item *ii; 166 struct btrfs_inode_item *ii;
57 struct btrfs_block_group_item *bi; 167 struct btrfs_block_group_item *bi;
58 struct btrfs_file_extent_item *fi; 168 struct btrfs_file_extent_item *fi;
169 struct btrfs_extent_data_ref *dref;
170 struct btrfs_shared_data_ref *sref;
171 struct btrfs_dev_extent *dev_extent;
59 struct btrfs_key key; 172 struct btrfs_key key;
60 struct btrfs_key found_key; 173 struct btrfs_key found_key;
61 struct btrfs_extent_ref *ref;
62 struct btrfs_dev_extent *dev_extent;
63 u32 type;
64 174
65 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", 175 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
66 (unsigned long long)btrfs_header_bytenr(l), nr, 176 (unsigned long long)btrfs_header_bytenr(l), nr,
@@ -100,20 +210,25 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
100 btrfs_disk_root_refs(l, ri)); 210 btrfs_disk_root_refs(l, ri));
101 break; 211 break;
102 case BTRFS_EXTENT_ITEM_KEY: 212 case BTRFS_EXTENT_ITEM_KEY:
103 ei = btrfs_item_ptr(l, i, struct btrfs_extent_item); 213 print_extent_item(l, i);
104 printk(KERN_INFO "\t\textent data refs %u\n", 214 break;
105 btrfs_extent_refs(l, ei)); 215 case BTRFS_TREE_BLOCK_REF_KEY:
106 break; 216 printk(KERN_INFO "\t\ttree block backref\n");
107 case BTRFS_EXTENT_REF_KEY: 217 break;
108 ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref); 218 case BTRFS_SHARED_BLOCK_REF_KEY:
109 printk(KERN_INFO "\t\textent back ref root %llu " 219 printk(KERN_INFO "\t\tshared block backref\n");
110 "gen %llu owner %llu num_refs %lu\n", 220 break;
111 (unsigned long long)btrfs_ref_root(l, ref), 221 case BTRFS_EXTENT_DATA_REF_KEY:
112 (unsigned long long)btrfs_ref_generation(l, ref), 222 dref = btrfs_item_ptr(l, i,
113 (unsigned long long)btrfs_ref_objectid(l, ref), 223 struct btrfs_extent_data_ref);
114 (unsigned long)btrfs_ref_num_refs(l, ref)); 224 print_extent_data_ref(l, dref);
225 break;
226 case BTRFS_SHARED_DATA_REF_KEY:
227 sref = btrfs_item_ptr(l, i,
228 struct btrfs_shared_data_ref);
229 printk(KERN_INFO "\t\tshared data backref count %u\n",
230 btrfs_shared_data_ref_count(l, sref));
115 break; 231 break;
116
117 case BTRFS_EXTENT_DATA_KEY: 232 case BTRFS_EXTENT_DATA_KEY:
118 fi = btrfs_item_ptr(l, i, 233 fi = btrfs_item_ptr(l, i,
119 struct btrfs_file_extent_item); 234 struct btrfs_file_extent_item);
@@ -139,6 +254,12 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
139 (unsigned long long) 254 (unsigned long long)
140 btrfs_file_extent_ram_bytes(l, fi)); 255 btrfs_file_extent_ram_bytes(l, fi));
141 break; 256 break;
257 case BTRFS_EXTENT_REF_V0_KEY:
258#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
259 print_extent_ref_v0(l, i);
260#else
261 BUG();
262#endif
142 case BTRFS_BLOCK_GROUP_ITEM_KEY: 263 case BTRFS_BLOCK_GROUP_ITEM_KEY:
143 bi = btrfs_item_ptr(l, i, 264 bi = btrfs_item_ptr(l, i,
144 struct btrfs_block_group_item); 265 struct btrfs_block_group_item);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
new file mode 100644
index 000000000000..b23dc209ae10
--- /dev/null
+++ b/fs/btrfs/relocation.c
@@ -0,0 +1,3711 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/blkdev.h>
23#include <linux/rbtree.h>
24#include "ctree.h"
25#include "disk-io.h"
26#include "transaction.h"
27#include "volumes.h"
28#include "locking.h"
29#include "btrfs_inode.h"
30#include "async-thread.h"
31
32/*
33 * backref_node, mapping_node and tree_block start with this
34 */
35struct tree_entry {
36 struct rb_node rb_node;
37 u64 bytenr;
38};
39
40/*
41 * present a tree block in the backref cache
42 */
43struct backref_node {
44 struct rb_node rb_node;
45 u64 bytenr;
46 /* objectid tree block owner */
47 u64 owner;
48 /* list of upper level blocks reference this block */
49 struct list_head upper;
50 /* list of child blocks in the cache */
51 struct list_head lower;
52 /* NULL if this node is not tree root */
53 struct btrfs_root *root;
54 /* extent buffer got by COW the block */
55 struct extent_buffer *eb;
56 /* level of tree block */
57 unsigned int level:8;
58 /* 1 if the block is root of old snapshot */
59 unsigned int old_root:1;
60 /* 1 if no child blocks in the cache */
61 unsigned int lowest:1;
62 /* is the extent buffer locked */
63 unsigned int locked:1;
64 /* has the block been processed */
65 unsigned int processed:1;
66 /* have backrefs of this block been checked */
67 unsigned int checked:1;
68};
69
70/*
71 * present a block pointer in the backref cache
72 */
73struct backref_edge {
74 struct list_head list[2];
75 struct backref_node *node[2];
76 u64 blockptr;
77};
78
79#define LOWER 0
80#define UPPER 1
81
82struct backref_cache {
83 /* red black tree of all backref nodes in the cache */
84 struct rb_root rb_root;
85 /* list of backref nodes with no child block in the cache */
86 struct list_head pending[BTRFS_MAX_LEVEL];
87 spinlock_t lock;
88};
89
90/*
91 * map address of tree root to tree
92 */
93struct mapping_node {
94 struct rb_node rb_node;
95 u64 bytenr;
96 void *data;
97};
98
99struct mapping_tree {
100 struct rb_root rb_root;
101 spinlock_t lock;
102};
103
104/*
105 * present a tree block to process
106 */
107struct tree_block {
108 struct rb_node rb_node;
109 u64 bytenr;
110 struct btrfs_key key;
111 unsigned int level:8;
112 unsigned int key_ready:1;
113};
114
115/* inode vector */
116#define INODEVEC_SIZE 16
117
118struct inodevec {
119 struct list_head list;
120 struct inode *inode[INODEVEC_SIZE];
121 int nr;
122};
123
124struct reloc_control {
125 /* block group to relocate */
126 struct btrfs_block_group_cache *block_group;
127 /* extent tree */
128 struct btrfs_root *extent_root;
129 /* inode for moving data */
130 struct inode *data_inode;
131 struct btrfs_workers workers;
132 /* tree blocks have been processed */
133 struct extent_io_tree processed_blocks;
134 /* map start of tree root to corresponding reloc tree */
135 struct mapping_tree reloc_root_tree;
136 /* list of reloc trees */
137 struct list_head reloc_roots;
138 u64 search_start;
139 u64 extents_found;
140 u64 extents_skipped;
141 int stage;
142 int create_reloc_root;
143 unsigned int found_file_extent:1;
144 unsigned int found_old_snapshot:1;
145};
146
147/* stages of data relocation */
148#define MOVE_DATA_EXTENTS 0
149#define UPDATE_DATA_PTRS 1
150
151/*
152 * merge reloc tree to corresponding fs tree in worker threads
153 */
154struct async_merge {
155 struct btrfs_work work;
156 struct reloc_control *rc;
157 struct btrfs_root *root;
158 struct completion *done;
159 atomic_t *num_pending;
160};
161
162static void mapping_tree_init(struct mapping_tree *tree)
163{
164 tree->rb_root.rb_node = NULL;
165 spin_lock_init(&tree->lock);
166}
167
168static void backref_cache_init(struct backref_cache *cache)
169{
170 int i;
171 cache->rb_root.rb_node = NULL;
172 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
173 INIT_LIST_HEAD(&cache->pending[i]);
174 spin_lock_init(&cache->lock);
175}
176
177static void backref_node_init(struct backref_node *node)
178{
179 memset(node, 0, sizeof(*node));
180 INIT_LIST_HEAD(&node->upper);
181 INIT_LIST_HEAD(&node->lower);
182 RB_CLEAR_NODE(&node->rb_node);
183}
184
185static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
186 struct rb_node *node)
187{
188 struct rb_node **p = &root->rb_node;
189 struct rb_node *parent = NULL;
190 struct tree_entry *entry;
191
192 while (*p) {
193 parent = *p;
194 entry = rb_entry(parent, struct tree_entry, rb_node);
195
196 if (bytenr < entry->bytenr)
197 p = &(*p)->rb_left;
198 else if (bytenr > entry->bytenr)
199 p = &(*p)->rb_right;
200 else
201 return parent;
202 }
203
204 rb_link_node(node, parent, p);
205 rb_insert_color(node, root);
206 return NULL;
207}
208
209static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
210{
211 struct rb_node *n = root->rb_node;
212 struct tree_entry *entry;
213
214 while (n) {
215 entry = rb_entry(n, struct tree_entry, rb_node);
216
217 if (bytenr < entry->bytenr)
218 n = n->rb_left;
219 else if (bytenr > entry->bytenr)
220 n = n->rb_right;
221 else
222 return n;
223 }
224 return NULL;
225}
226
227/*
228 * walk up backref nodes until reach node presents tree root
229 */
230static struct backref_node *walk_up_backref(struct backref_node *node,
231 struct backref_edge *edges[],
232 int *index)
233{
234 struct backref_edge *edge;
235 int idx = *index;
236
237 while (!list_empty(&node->upper)) {
238 edge = list_entry(node->upper.next,
239 struct backref_edge, list[LOWER]);
240 edges[idx++] = edge;
241 node = edge->node[UPPER];
242 }
243 *index = idx;
244 return node;
245}
246
247/*
248 * walk down backref nodes to find start of next reference path
249 */
250static struct backref_node *walk_down_backref(struct backref_edge *edges[],
251 int *index)
252{
253 struct backref_edge *edge;
254 struct backref_node *lower;
255 int idx = *index;
256
257 while (idx > 0) {
258 edge = edges[idx - 1];
259 lower = edge->node[LOWER];
260 if (list_is_last(&edge->list[LOWER], &lower->upper)) {
261 idx--;
262 continue;
263 }
264 edge = list_entry(edge->list[LOWER].next,
265 struct backref_edge, list[LOWER]);
266 edges[idx - 1] = edge;
267 *index = idx;
268 return edge->node[UPPER];
269 }
270 *index = 0;
271 return NULL;
272}
273
274static void drop_node_buffer(struct backref_node *node)
275{
276 if (node->eb) {
277 if (node->locked) {
278 btrfs_tree_unlock(node->eb);
279 node->locked = 0;
280 }
281 free_extent_buffer(node->eb);
282 node->eb = NULL;
283 }
284}
285
286static void drop_backref_node(struct backref_cache *tree,
287 struct backref_node *node)
288{
289 BUG_ON(!node->lowest);
290 BUG_ON(!list_empty(&node->upper));
291
292 drop_node_buffer(node);
293 list_del(&node->lower);
294
295 rb_erase(&node->rb_node, &tree->rb_root);
296 kfree(node);
297}
298
299/*
300 * remove a backref node from the backref cache
301 */
302static void remove_backref_node(struct backref_cache *cache,
303 struct backref_node *node)
304{
305 struct backref_node *upper;
306 struct backref_edge *edge;
307
308 if (!node)
309 return;
310
311 BUG_ON(!node->lowest);
312 while (!list_empty(&node->upper)) {
313 edge = list_entry(node->upper.next, struct backref_edge,
314 list[LOWER]);
315 upper = edge->node[UPPER];
316 list_del(&edge->list[LOWER]);
317 list_del(&edge->list[UPPER]);
318 kfree(edge);
319 /*
320 * add the node to pending list if no other
321 * child block cached.
322 */
323 if (list_empty(&upper->lower)) {
324 list_add_tail(&upper->lower,
325 &cache->pending[upper->level]);
326 upper->lowest = 1;
327 }
328 }
329 drop_backref_node(cache, node);
330}
331
332/*
333 * find reloc tree by address of tree root
334 */
335static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
336 u64 bytenr)
337{
338 struct rb_node *rb_node;
339 struct mapping_node *node;
340 struct btrfs_root *root = NULL;
341
342 spin_lock(&rc->reloc_root_tree.lock);
343 rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
344 if (rb_node) {
345 node = rb_entry(rb_node, struct mapping_node, rb_node);
346 root = (struct btrfs_root *)node->data;
347 }
348 spin_unlock(&rc->reloc_root_tree.lock);
349 return root;
350}
351
352static int is_cowonly_root(u64 root_objectid)
353{
354 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
355 root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
356 root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
357 root_objectid == BTRFS_DEV_TREE_OBJECTID ||
358 root_objectid == BTRFS_TREE_LOG_OBJECTID ||
359 root_objectid == BTRFS_CSUM_TREE_OBJECTID)
360 return 1;
361 return 0;
362}
363
364static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
365 u64 root_objectid)
366{
367 struct btrfs_key key;
368
369 key.objectid = root_objectid;
370 key.type = BTRFS_ROOT_ITEM_KEY;
371 if (is_cowonly_root(root_objectid))
372 key.offset = 0;
373 else
374 key.offset = (u64)-1;
375
376 return btrfs_read_fs_root_no_name(fs_info, &key);
377}
378
379#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
380static noinline_for_stack
381struct btrfs_root *find_tree_root(struct reloc_control *rc,
382 struct extent_buffer *leaf,
383 struct btrfs_extent_ref_v0 *ref0)
384{
385 struct btrfs_root *root;
386 u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
387 u64 generation = btrfs_ref_generation_v0(leaf, ref0);
388
389 BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
390
391 root = read_fs_root(rc->extent_root->fs_info, root_objectid);
392 BUG_ON(IS_ERR(root));
393
394 if (root->ref_cows &&
395 generation != btrfs_root_generation(&root->root_item))
396 return NULL;
397
398 return root;
399}
400#endif
401
402static noinline_for_stack
403int find_inline_backref(struct extent_buffer *leaf, int slot,
404 unsigned long *ptr, unsigned long *end)
405{
406 struct btrfs_extent_item *ei;
407 struct btrfs_tree_block_info *bi;
408 u32 item_size;
409
410 item_size = btrfs_item_size_nr(leaf, slot);
411#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
412 if (item_size < sizeof(*ei)) {
413 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
414 return 1;
415 }
416#endif
417 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
418 WARN_ON(!(btrfs_extent_flags(leaf, ei) &
419 BTRFS_EXTENT_FLAG_TREE_BLOCK));
420
421 if (item_size <= sizeof(*ei) + sizeof(*bi)) {
422 WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
423 return 1;
424 }
425
426 bi = (struct btrfs_tree_block_info *)(ei + 1);
427 *ptr = (unsigned long)(bi + 1);
428 *end = (unsigned long)ei + item_size;
429 return 0;
430}
431
432/*
433 * build backref tree for a given tree block. root of the backref tree
434 * corresponds the tree block, leaves of the backref tree correspond
435 * roots of b-trees that reference the tree block.
436 *
437 * the basic idea of this function is check backrefs of a given block
438 * to find upper level blocks that refernece the block, and then check
439 * bakcrefs of these upper level blocks recursively. the recursion stop
440 * when tree root is reached or backrefs for the block is cached.
441 *
442 * NOTE: if we find backrefs for a block are cached, we know backrefs
443 * for all upper level blocks that directly/indirectly reference the
444 * block are also cached.
445 */
446static struct backref_node *build_backref_tree(struct reloc_control *rc,
447 struct backref_cache *cache,
448 struct btrfs_key *node_key,
449 int level, u64 bytenr)
450{
451 struct btrfs_path *path1;
452 struct btrfs_path *path2;
453 struct extent_buffer *eb;
454 struct btrfs_root *root;
455 struct backref_node *cur;
456 struct backref_node *upper;
457 struct backref_node *lower;
458 struct backref_node *node = NULL;
459 struct backref_node *exist = NULL;
460 struct backref_edge *edge;
461 struct rb_node *rb_node;
462 struct btrfs_key key;
463 unsigned long end;
464 unsigned long ptr;
465 LIST_HEAD(list);
466 int ret;
467 int err = 0;
468
469 path1 = btrfs_alloc_path();
470 path2 = btrfs_alloc_path();
471 if (!path1 || !path2) {
472 err = -ENOMEM;
473 goto out;
474 }
475
476 node = kmalloc(sizeof(*node), GFP_NOFS);
477 if (!node) {
478 err = -ENOMEM;
479 goto out;
480 }
481
482 backref_node_init(node);
483 node->bytenr = bytenr;
484 node->owner = 0;
485 node->level = level;
486 node->lowest = 1;
487 cur = node;
488again:
489 end = 0;
490 ptr = 0;
491 key.objectid = cur->bytenr;
492 key.type = BTRFS_EXTENT_ITEM_KEY;
493 key.offset = (u64)-1;
494
495 path1->search_commit_root = 1;
496 path1->skip_locking = 1;
497 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
498 0, 0);
499 if (ret < 0) {
500 err = ret;
501 goto out;
502 }
503 BUG_ON(!ret || !path1->slots[0]);
504
505 path1->slots[0]--;
506
507 WARN_ON(cur->checked);
508 if (!list_empty(&cur->upper)) {
509 /*
510 * the backref was added previously when processsing
511 * backref of type BTRFS_TREE_BLOCK_REF_KEY
512 */
513 BUG_ON(!list_is_singular(&cur->upper));
514 edge = list_entry(cur->upper.next, struct backref_edge,
515 list[LOWER]);
516 BUG_ON(!list_empty(&edge->list[UPPER]));
517 exist = edge->node[UPPER];
518 /*
519 * add the upper level block to pending list if we need
520 * check its backrefs
521 */
522 if (!exist->checked)
523 list_add_tail(&edge->list[UPPER], &list);
524 } else {
525 exist = NULL;
526 }
527
528 while (1) {
529 cond_resched();
530 eb = path1->nodes[0];
531
532 if (ptr >= end) {
533 if (path1->slots[0] >= btrfs_header_nritems(eb)) {
534 ret = btrfs_next_leaf(rc->extent_root, path1);
535 if (ret < 0) {
536 err = ret;
537 goto out;
538 }
539 if (ret > 0)
540 break;
541 eb = path1->nodes[0];
542 }
543
544 btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
545 if (key.objectid != cur->bytenr) {
546 WARN_ON(exist);
547 break;
548 }
549
550 if (key.type == BTRFS_EXTENT_ITEM_KEY) {
551 ret = find_inline_backref(eb, path1->slots[0],
552 &ptr, &end);
553 if (ret)
554 goto next;
555 }
556 }
557
558 if (ptr < end) {
559 /* update key for inline back ref */
560 struct btrfs_extent_inline_ref *iref;
561 iref = (struct btrfs_extent_inline_ref *)ptr;
562 key.type = btrfs_extent_inline_ref_type(eb, iref);
563 key.offset = btrfs_extent_inline_ref_offset(eb, iref);
564 WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
565 key.type != BTRFS_SHARED_BLOCK_REF_KEY);
566 }
567
568 if (exist &&
569 ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
570 exist->owner == key.offset) ||
571 (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
572 exist->bytenr == key.offset))) {
573 exist = NULL;
574 goto next;
575 }
576
577#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
578 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
579 key.type == BTRFS_EXTENT_REF_V0_KEY) {
580 if (key.objectid == key.offset &&
581 key.type == BTRFS_EXTENT_REF_V0_KEY) {
582 struct btrfs_extent_ref_v0 *ref0;
583 ref0 = btrfs_item_ptr(eb, path1->slots[0],
584 struct btrfs_extent_ref_v0);
585 root = find_tree_root(rc, eb, ref0);
586 if (root)
587 cur->root = root;
588 else
589 cur->old_root = 1;
590 break;
591 }
592#else
593 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
594 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
595#endif
596 if (key.objectid == key.offset) {
597 /*
598 * only root blocks of reloc trees use
599 * backref of this type.
600 */
601 root = find_reloc_root(rc, cur->bytenr);
602 BUG_ON(!root);
603 cur->root = root;
604 break;
605 }
606
607 edge = kzalloc(sizeof(*edge), GFP_NOFS);
608 if (!edge) {
609 err = -ENOMEM;
610 goto out;
611 }
612 rb_node = tree_search(&cache->rb_root, key.offset);
613 if (!rb_node) {
614 upper = kmalloc(sizeof(*upper), GFP_NOFS);
615 if (!upper) {
616 kfree(edge);
617 err = -ENOMEM;
618 goto out;
619 }
620 backref_node_init(upper);
621 upper->bytenr = key.offset;
622 upper->owner = 0;
623 upper->level = cur->level + 1;
624 /*
625 * backrefs for the upper level block isn't
626 * cached, add the block to pending list
627 */
628 list_add_tail(&edge->list[UPPER], &list);
629 } else {
630 upper = rb_entry(rb_node, struct backref_node,
631 rb_node);
632 INIT_LIST_HEAD(&edge->list[UPPER]);
633 }
634 list_add(&edge->list[LOWER], &cur->upper);
635 edge->node[UPPER] = upper;
636 edge->node[LOWER] = cur;
637
638 goto next;
639 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
640 goto next;
641 }
642
643 /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
644 root = read_fs_root(rc->extent_root->fs_info, key.offset);
645 if (IS_ERR(root)) {
646 err = PTR_ERR(root);
647 goto out;
648 }
649
650 if (btrfs_root_level(&root->root_item) == cur->level) {
651 /* tree root */
652 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
653 cur->bytenr);
654 cur->root = root;
655 break;
656 }
657
658 level = cur->level + 1;
659
660 /*
661 * searching the tree to find upper level blocks
662 * reference the block.
663 */
664 path2->search_commit_root = 1;
665 path2->skip_locking = 1;
666 path2->lowest_level = level;
667 ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
668 path2->lowest_level = 0;
669 if (ret < 0) {
670 err = ret;
671 goto out;
672 }
673
674 eb = path2->nodes[level];
675 WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
676 cur->bytenr);
677
678 lower = cur;
679 for (; level < BTRFS_MAX_LEVEL; level++) {
680 if (!path2->nodes[level]) {
681 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
682 lower->bytenr);
683 lower->root = root;
684 break;
685 }
686
687 edge = kzalloc(sizeof(*edge), GFP_NOFS);
688 if (!edge) {
689 err = -ENOMEM;
690 goto out;
691 }
692
693 eb = path2->nodes[level];
694 rb_node = tree_search(&cache->rb_root, eb->start);
695 if (!rb_node) {
696 upper = kmalloc(sizeof(*upper), GFP_NOFS);
697 if (!upper) {
698 kfree(edge);
699 err = -ENOMEM;
700 goto out;
701 }
702 backref_node_init(upper);
703 upper->bytenr = eb->start;
704 upper->owner = btrfs_header_owner(eb);
705 upper->level = lower->level + 1;
706
707 /*
708 * if we know the block isn't shared
709 * we can void checking its backrefs.
710 */
711 if (btrfs_block_can_be_shared(root, eb))
712 upper->checked = 0;
713 else
714 upper->checked = 1;
715
716 /*
717 * add the block to pending list if we
718 * need check its backrefs. only block
719 * at 'cur->level + 1' is added to the
720 * tail of pending list. this guarantees
721 * we check backrefs from lower level
722 * blocks to upper level blocks.
723 */
724 if (!upper->checked &&
725 level == cur->level + 1) {
726 list_add_tail(&edge->list[UPPER],
727 &list);
728 } else
729 INIT_LIST_HEAD(&edge->list[UPPER]);
730 } else {
731 upper = rb_entry(rb_node, struct backref_node,
732 rb_node);
733 BUG_ON(!upper->checked);
734 INIT_LIST_HEAD(&edge->list[UPPER]);
735 }
736 list_add_tail(&edge->list[LOWER], &lower->upper);
737 edge->node[UPPER] = upper;
738 edge->node[LOWER] = lower;
739
740 if (rb_node)
741 break;
742 lower = upper;
743 upper = NULL;
744 }
745 btrfs_release_path(root, path2);
746next:
747 if (ptr < end) {
748 ptr += btrfs_extent_inline_ref_size(key.type);
749 if (ptr >= end) {
750 WARN_ON(ptr > end);
751 ptr = 0;
752 end = 0;
753 }
754 }
755 if (ptr >= end)
756 path1->slots[0]++;
757 }
758 btrfs_release_path(rc->extent_root, path1);
759
760 cur->checked = 1;
761 WARN_ON(exist);
762
763 /* the pending list isn't empty, take the first block to process */
764 if (!list_empty(&list)) {
765 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
766 list_del_init(&edge->list[UPPER]);
767 cur = edge->node[UPPER];
768 goto again;
769 }
770
771 /*
772 * everything goes well, connect backref nodes and insert backref nodes
773 * into the cache.
774 */
775 BUG_ON(!node->checked);
776 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
777 BUG_ON(rb_node);
778
779 list_for_each_entry(edge, &node->upper, list[LOWER])
780 list_add_tail(&edge->list[UPPER], &list);
781
782 while (!list_empty(&list)) {
783 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
784 list_del_init(&edge->list[UPPER]);
785 upper = edge->node[UPPER];
786
787 if (!RB_EMPTY_NODE(&upper->rb_node)) {
788 if (upper->lowest) {
789 list_del_init(&upper->lower);
790 upper->lowest = 0;
791 }
792
793 list_add_tail(&edge->list[UPPER], &upper->lower);
794 continue;
795 }
796
797 BUG_ON(!upper->checked);
798 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
799 &upper->rb_node);
800 BUG_ON(rb_node);
801
802 list_add_tail(&edge->list[UPPER], &upper->lower);
803
804 list_for_each_entry(edge, &upper->upper, list[LOWER])
805 list_add_tail(&edge->list[UPPER], &list);
806 }
807out:
808 btrfs_free_path(path1);
809 btrfs_free_path(path2);
810 if (err) {
811 INIT_LIST_HEAD(&list);
812 upper = node;
813 while (upper) {
814 if (RB_EMPTY_NODE(&upper->rb_node)) {
815 list_splice_tail(&upper->upper, &list);
816 kfree(upper);
817 }
818
819 if (list_empty(&list))
820 break;
821
822 edge = list_entry(list.next, struct backref_edge,
823 list[LOWER]);
824 upper = edge->node[UPPER];
825 kfree(edge);
826 }
827 return ERR_PTR(err);
828 }
829 return node;
830}
831
832/*
833 * helper to add 'address of tree root -> reloc tree' mapping
834 */
835static int __add_reloc_root(struct btrfs_root *root)
836{
837 struct rb_node *rb_node;
838 struct mapping_node *node;
839 struct reloc_control *rc = root->fs_info->reloc_ctl;
840
841 node = kmalloc(sizeof(*node), GFP_NOFS);
842 BUG_ON(!node);
843
844 node->bytenr = root->node->start;
845 node->data = root;
846
847 spin_lock(&rc->reloc_root_tree.lock);
848 rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
849 node->bytenr, &node->rb_node);
850 spin_unlock(&rc->reloc_root_tree.lock);
851 BUG_ON(rb_node);
852
853 list_add_tail(&root->root_list, &rc->reloc_roots);
854 return 0;
855}
856
857/*
858 * helper to update/delete the 'address of tree root -> reloc tree'
859 * mapping
860 */
861static int __update_reloc_root(struct btrfs_root *root, int del)
862{
863 struct rb_node *rb_node;
864 struct mapping_node *node = NULL;
865 struct reloc_control *rc = root->fs_info->reloc_ctl;
866
867 spin_lock(&rc->reloc_root_tree.lock);
868 rb_node = tree_search(&rc->reloc_root_tree.rb_root,
869 root->commit_root->start);
870 if (rb_node) {
871 node = rb_entry(rb_node, struct mapping_node, rb_node);
872 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
873 }
874 spin_unlock(&rc->reloc_root_tree.lock);
875
876 BUG_ON((struct btrfs_root *)node->data != root);
877
878 if (!del) {
879 spin_lock(&rc->reloc_root_tree.lock);
880 node->bytenr = root->node->start;
881 rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
882 node->bytenr, &node->rb_node);
883 spin_unlock(&rc->reloc_root_tree.lock);
884 BUG_ON(rb_node);
885 } else {
886 list_del_init(&root->root_list);
887 kfree(node);
888 }
889 return 0;
890}
891
892/*
893 * create reloc tree for a given fs tree. reloc tree is just a
894 * snapshot of the fs tree with special root objectid.
895 */
896int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
897 struct btrfs_root *root)
898{
899 struct btrfs_root *reloc_root;
900 struct extent_buffer *eb;
901 struct btrfs_root_item *root_item;
902 struct btrfs_key root_key;
903 int ret;
904
905 if (root->reloc_root) {
906 reloc_root = root->reloc_root;
907 reloc_root->last_trans = trans->transid;
908 return 0;
909 }
910
911 if (!root->fs_info->reloc_ctl ||
912 !root->fs_info->reloc_ctl->create_reloc_root ||
913 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
914 return 0;
915
916 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
917 BUG_ON(!root_item);
918
919 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
920 root_key.type = BTRFS_ROOT_ITEM_KEY;
921 root_key.offset = root->root_key.objectid;
922
923 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
924 BTRFS_TREE_RELOC_OBJECTID);
925 BUG_ON(ret);
926
927 btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
928 memcpy(root_item, &root->root_item, sizeof(*root_item));
929 btrfs_set_root_refs(root_item, 1);
930 btrfs_set_root_bytenr(root_item, eb->start);
931 btrfs_set_root_level(root_item, btrfs_header_level(eb));
932 btrfs_set_root_generation(root_item, trans->transid);
933 memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key));
934 root_item->drop_level = 0;
935
936 btrfs_tree_unlock(eb);
937 free_extent_buffer(eb);
938
939 ret = btrfs_insert_root(trans, root->fs_info->tree_root,
940 &root_key, root_item);
941 BUG_ON(ret);
942 kfree(root_item);
943
944 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
945 &root_key);
946 BUG_ON(IS_ERR(reloc_root));
947 reloc_root->last_trans = trans->transid;
948
949 __add_reloc_root(reloc_root);
950 root->reloc_root = reloc_root;
951 return 0;
952}
953
954/*
955 * update root item of reloc tree
956 */
957int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
958 struct btrfs_root *root)
959{
960 struct btrfs_root *reloc_root;
961 struct btrfs_root_item *root_item;
962 int del = 0;
963 int ret;
964
965 if (!root->reloc_root)
966 return 0;
967
968 reloc_root = root->reloc_root;
969 root_item = &reloc_root->root_item;
970
971 if (btrfs_root_refs(root_item) == 0) {
972 root->reloc_root = NULL;
973 del = 1;
974 }
975
976 __update_reloc_root(reloc_root, del);
977
978 if (reloc_root->commit_root != reloc_root->node) {
979 btrfs_set_root_node(root_item, reloc_root->node);
980 free_extent_buffer(reloc_root->commit_root);
981 reloc_root->commit_root = btrfs_root_node(reloc_root);
982 }
983
984 ret = btrfs_update_root(trans, root->fs_info->tree_root,
985 &reloc_root->root_key, root_item);
986 BUG_ON(ret);
987 return 0;
988}
989
990/*
991 * helper to find first cached inode with inode number >= objectid
992 * in a subvolume
993 */
994static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
995{
996 struct rb_node *node;
997 struct rb_node *prev;
998 struct btrfs_inode *entry;
999 struct inode *inode;
1000
1001 spin_lock(&root->inode_lock);
1002again:
1003 node = root->inode_tree.rb_node;
1004 prev = NULL;
1005 while (node) {
1006 prev = node;
1007 entry = rb_entry(node, struct btrfs_inode, rb_node);
1008
1009 if (objectid < entry->vfs_inode.i_ino)
1010 node = node->rb_left;
1011 else if (objectid > entry->vfs_inode.i_ino)
1012 node = node->rb_right;
1013 else
1014 break;
1015 }
1016 if (!node) {
1017 while (prev) {
1018 entry = rb_entry(prev, struct btrfs_inode, rb_node);
1019 if (objectid <= entry->vfs_inode.i_ino) {
1020 node = prev;
1021 break;
1022 }
1023 prev = rb_next(prev);
1024 }
1025 }
1026 while (node) {
1027 entry = rb_entry(node, struct btrfs_inode, rb_node);
1028 inode = igrab(&entry->vfs_inode);
1029 if (inode) {
1030 spin_unlock(&root->inode_lock);
1031 return inode;
1032 }
1033
1034 objectid = entry->vfs_inode.i_ino + 1;
1035 if (cond_resched_lock(&root->inode_lock))
1036 goto again;
1037
1038 node = rb_next(node);
1039 }
1040 spin_unlock(&root->inode_lock);
1041 return NULL;
1042}
1043
1044static int in_block_group(u64 bytenr,
1045 struct btrfs_block_group_cache *block_group)
1046{
1047 if (bytenr >= block_group->key.objectid &&
1048 bytenr < block_group->key.objectid + block_group->key.offset)
1049 return 1;
1050 return 0;
1051}
1052
1053/*
1054 * get new location of data
1055 */
1056static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1057 u64 bytenr, u64 num_bytes)
1058{
1059 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1060 struct btrfs_path *path;
1061 struct btrfs_file_extent_item *fi;
1062 struct extent_buffer *leaf;
1063 int ret;
1064
1065 path = btrfs_alloc_path();
1066 if (!path)
1067 return -ENOMEM;
1068
1069 bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1070 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
1071 bytenr, 0);
1072 if (ret < 0)
1073 goto out;
1074 if (ret > 0) {
1075 ret = -ENOENT;
1076 goto out;
1077 }
1078
1079 leaf = path->nodes[0];
1080 fi = btrfs_item_ptr(leaf, path->slots[0],
1081 struct btrfs_file_extent_item);
1082
1083 BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1084 btrfs_file_extent_compression(leaf, fi) ||
1085 btrfs_file_extent_encryption(leaf, fi) ||
1086 btrfs_file_extent_other_encoding(leaf, fi));
1087
1088 if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1089 ret = 1;
1090 goto out;
1091 }
1092
1093 if (new_bytenr)
1094 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1095 ret = 0;
1096out:
1097 btrfs_free_path(path);
1098 return ret;
1099}
1100
1101/*
1102 * update file extent items in the tree leaf to point to
1103 * the new locations.
1104 */
1105static int replace_file_extents(struct btrfs_trans_handle *trans,
1106 struct reloc_control *rc,
1107 struct btrfs_root *root,
1108 struct extent_buffer *leaf,
1109 struct list_head *inode_list)
1110{
1111 struct btrfs_key key;
1112 struct btrfs_file_extent_item *fi;
1113 struct inode *inode = NULL;
1114 struct inodevec *ivec = NULL;
1115 u64 parent;
1116 u64 bytenr;
1117 u64 new_bytenr;
1118 u64 num_bytes;
1119 u64 end;
1120 u32 nritems;
1121 u32 i;
1122 int ret;
1123 int first = 1;
1124 int dirty = 0;
1125
1126 if (rc->stage != UPDATE_DATA_PTRS)
1127 return 0;
1128
1129 /* reloc trees always use full backref */
1130 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1131 parent = leaf->start;
1132 else
1133 parent = 0;
1134
1135 nritems = btrfs_header_nritems(leaf);
1136 for (i = 0; i < nritems; i++) {
1137 cond_resched();
1138 btrfs_item_key_to_cpu(leaf, &key, i);
1139 if (key.type != BTRFS_EXTENT_DATA_KEY)
1140 continue;
1141 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1142 if (btrfs_file_extent_type(leaf, fi) ==
1143 BTRFS_FILE_EXTENT_INLINE)
1144 continue;
1145 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1146 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1147 if (bytenr == 0)
1148 continue;
1149 if (!in_block_group(bytenr, rc->block_group))
1150 continue;
1151
1152 /*
1153 * if we are modifying block in fs tree, wait for readpage
1154 * to complete and drop the extent cache
1155 */
1156 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1157 if (!ivec || ivec->nr == INODEVEC_SIZE) {
1158 ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1159 BUG_ON(!ivec);
1160 ivec->nr = 0;
1161 list_add_tail(&ivec->list, inode_list);
1162 }
1163 if (first) {
1164 inode = find_next_inode(root, key.objectid);
1165 if (inode)
1166 ivec->inode[ivec->nr++] = inode;
1167 first = 0;
1168 } else if (inode && inode->i_ino < key.objectid) {
1169 inode = find_next_inode(root, key.objectid);
1170 if (inode)
1171 ivec->inode[ivec->nr++] = inode;
1172 }
1173 if (inode && inode->i_ino == key.objectid) {
1174 end = key.offset +
1175 btrfs_file_extent_num_bytes(leaf, fi);
1176 WARN_ON(!IS_ALIGNED(key.offset,
1177 root->sectorsize));
1178 WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1179 end--;
1180 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1181 key.offset, end,
1182 GFP_NOFS);
1183 if (!ret)
1184 continue;
1185
1186 btrfs_drop_extent_cache(inode, key.offset, end,
1187 1);
1188 unlock_extent(&BTRFS_I(inode)->io_tree,
1189 key.offset, end, GFP_NOFS);
1190 }
1191 }
1192
1193 ret = get_new_location(rc->data_inode, &new_bytenr,
1194 bytenr, num_bytes);
1195 if (ret > 0)
1196 continue;
1197 BUG_ON(ret < 0);
1198
1199 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1200 dirty = 1;
1201
1202 key.offset -= btrfs_file_extent_offset(leaf, fi);
1203 ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1204 num_bytes, parent,
1205 btrfs_header_owner(leaf),
1206 key.objectid, key.offset);
1207 BUG_ON(ret);
1208
1209 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1210 parent, btrfs_header_owner(leaf),
1211 key.objectid, key.offset);
1212 BUG_ON(ret);
1213 }
1214 if (dirty)
1215 btrfs_mark_buffer_dirty(leaf);
1216 return 0;
1217}
1218
1219static noinline_for_stack
1220int memcmp_node_keys(struct extent_buffer *eb, int slot,
1221 struct btrfs_path *path, int level)
1222{
1223 struct btrfs_disk_key key1;
1224 struct btrfs_disk_key key2;
1225 btrfs_node_key(eb, &key1, slot);
1226 btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1227 return memcmp(&key1, &key2, sizeof(key1));
1228}
1229
1230/*
1231 * try to replace tree blocks in fs tree with the new blocks
1232 * in reloc tree. tree blocks haven't been modified since the
1233 * reloc tree was create can be replaced.
1234 *
1235 * if a block was replaced, level of the block + 1 is returned.
1236 * if no block got replaced, 0 is returned. if there are other
1237 * errors, a negative error number is returned.
1238 */
1239static int replace_path(struct btrfs_trans_handle *trans,
1240 struct btrfs_root *dest, struct btrfs_root *src,
1241 struct btrfs_path *path, struct btrfs_key *next_key,
1242 struct extent_buffer **leaf,
1243 int lowest_level, int max_level)
1244{
1245 struct extent_buffer *eb;
1246 struct extent_buffer *parent;
1247 struct btrfs_key key;
1248 u64 old_bytenr;
1249 u64 new_bytenr;
1250 u64 old_ptr_gen;
1251 u64 new_ptr_gen;
1252 u64 last_snapshot;
1253 u32 blocksize;
1254 int level;
1255 int ret;
1256 int slot;
1257
1258 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1259 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1260 BUG_ON(lowest_level > 1 && leaf);
1261
1262 last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1263
1264 slot = path->slots[lowest_level];
1265 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1266
1267 eb = btrfs_lock_root_node(dest);
1268 btrfs_set_lock_blocking(eb);
1269 level = btrfs_header_level(eb);
1270
1271 if (level < lowest_level) {
1272 btrfs_tree_unlock(eb);
1273 free_extent_buffer(eb);
1274 return 0;
1275 }
1276
1277 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1278 BUG_ON(ret);
1279 btrfs_set_lock_blocking(eb);
1280
1281 if (next_key) {
1282 next_key->objectid = (u64)-1;
1283 next_key->type = (u8)-1;
1284 next_key->offset = (u64)-1;
1285 }
1286
1287 parent = eb;
1288 while (1) {
1289 level = btrfs_header_level(parent);
1290 BUG_ON(level < lowest_level);
1291
1292 ret = btrfs_bin_search(parent, &key, level, &slot);
1293 if (ret && slot > 0)
1294 slot--;
1295
1296 if (next_key && slot + 1 < btrfs_header_nritems(parent))
1297 btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1298
1299 old_bytenr = btrfs_node_blockptr(parent, slot);
1300 blocksize = btrfs_level_size(dest, level - 1);
1301 old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1302
1303 if (level <= max_level) {
1304 eb = path->nodes[level];
1305 new_bytenr = btrfs_node_blockptr(eb,
1306 path->slots[level]);
1307 new_ptr_gen = btrfs_node_ptr_generation(eb,
1308 path->slots[level]);
1309 } else {
1310 new_bytenr = 0;
1311 new_ptr_gen = 0;
1312 }
1313
1314 if (new_bytenr > 0 && new_bytenr == old_bytenr) {
1315 WARN_ON(1);
1316 ret = level;
1317 break;
1318 }
1319
1320 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1321 memcmp_node_keys(parent, slot, path, level)) {
1322 if (level <= lowest_level && !leaf) {
1323 ret = 0;
1324 break;
1325 }
1326
1327 eb = read_tree_block(dest, old_bytenr, blocksize,
1328 old_ptr_gen);
1329 btrfs_tree_lock(eb);
1330 ret = btrfs_cow_block(trans, dest, eb, parent,
1331 slot, &eb);
1332 BUG_ON(ret);
1333 btrfs_set_lock_blocking(eb);
1334
1335 if (level <= lowest_level) {
1336 *leaf = eb;
1337 ret = 0;
1338 break;
1339 }
1340
1341 btrfs_tree_unlock(parent);
1342 free_extent_buffer(parent);
1343
1344 parent = eb;
1345 continue;
1346 }
1347
1348 btrfs_node_key_to_cpu(path->nodes[level], &key,
1349 path->slots[level]);
1350 btrfs_release_path(src, path);
1351
1352 path->lowest_level = level;
1353 ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1354 path->lowest_level = 0;
1355 BUG_ON(ret);
1356
1357 /*
1358 * swap blocks in fs tree and reloc tree.
1359 */
1360 btrfs_set_node_blockptr(parent, slot, new_bytenr);
1361 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1362 btrfs_mark_buffer_dirty(parent);
1363
1364 btrfs_set_node_blockptr(path->nodes[level],
1365 path->slots[level], old_bytenr);
1366 btrfs_set_node_ptr_generation(path->nodes[level],
1367 path->slots[level], old_ptr_gen);
1368 btrfs_mark_buffer_dirty(path->nodes[level]);
1369
1370 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1371 path->nodes[level]->start,
1372 src->root_key.objectid, level - 1, 0);
1373 BUG_ON(ret);
1374 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1375 0, dest->root_key.objectid, level - 1,
1376 0);
1377 BUG_ON(ret);
1378
1379 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1380 path->nodes[level]->start,
1381 src->root_key.objectid, level - 1, 0);
1382 BUG_ON(ret);
1383
1384 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1385 0, dest->root_key.objectid, level - 1,
1386 0);
1387 BUG_ON(ret);
1388
1389 btrfs_unlock_up_safe(path, 0);
1390
1391 ret = level;
1392 break;
1393 }
1394 btrfs_tree_unlock(parent);
1395 free_extent_buffer(parent);
1396 return ret;
1397}
1398
1399/*
1400 * helper to find next relocated block in reloc tree
1401 */
1402static noinline_for_stack
1403int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1404 int *level)
1405{
1406 struct extent_buffer *eb;
1407 int i;
1408 u64 last_snapshot;
1409 u32 nritems;
1410
1411 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1412
1413 for (i = 0; i < *level; i++) {
1414 free_extent_buffer(path->nodes[i]);
1415 path->nodes[i] = NULL;
1416 }
1417
1418 for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1419 eb = path->nodes[i];
1420 nritems = btrfs_header_nritems(eb);
1421 while (path->slots[i] + 1 < nritems) {
1422 path->slots[i]++;
1423 if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1424 last_snapshot)
1425 continue;
1426
1427 *level = i;
1428 return 0;
1429 }
1430 free_extent_buffer(path->nodes[i]);
1431 path->nodes[i] = NULL;
1432 }
1433 return 1;
1434}
1435
1436/*
1437 * walk down reloc tree to find relocated block of lowest level
1438 */
1439static noinline_for_stack
1440int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1441 int *level)
1442{
1443 struct extent_buffer *eb = NULL;
1444 int i;
1445 u64 bytenr;
1446 u64 ptr_gen = 0;
1447 u64 last_snapshot;
1448 u32 blocksize;
1449 u32 nritems;
1450
1451 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1452
1453 for (i = *level; i > 0; i--) {
1454 eb = path->nodes[i];
1455 nritems = btrfs_header_nritems(eb);
1456 while (path->slots[i] < nritems) {
1457 ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1458 if (ptr_gen > last_snapshot)
1459 break;
1460 path->slots[i]++;
1461 }
1462 if (path->slots[i] >= nritems) {
1463 if (i == *level)
1464 break;
1465 *level = i + 1;
1466 return 0;
1467 }
1468 if (i == 1) {
1469 *level = i;
1470 return 0;
1471 }
1472
1473 bytenr = btrfs_node_blockptr(eb, path->slots[i]);
1474 blocksize = btrfs_level_size(root, i - 1);
1475 eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
1476 BUG_ON(btrfs_header_level(eb) != i - 1);
1477 path->nodes[i - 1] = eb;
1478 path->slots[i - 1] = 0;
1479 }
1480 return 1;
1481}
1482
1483/*
1484 * invalidate extent cache for file extents whose key in range of
1485 * [min_key, max_key)
1486 */
1487static int invalidate_extent_cache(struct btrfs_root *root,
1488 struct btrfs_key *min_key,
1489 struct btrfs_key *max_key)
1490{
1491 struct inode *inode = NULL;
1492 u64 objectid;
1493 u64 start, end;
1494
1495 objectid = min_key->objectid;
1496 while (1) {
1497 cond_resched();
1498 iput(inode);
1499
1500 if (objectid > max_key->objectid)
1501 break;
1502
1503 inode = find_next_inode(root, objectid);
1504 if (!inode)
1505 break;
1506
1507 if (inode->i_ino > max_key->objectid) {
1508 iput(inode);
1509 break;
1510 }
1511
1512 objectid = inode->i_ino + 1;
1513 if (!S_ISREG(inode->i_mode))
1514 continue;
1515
1516 if (unlikely(min_key->objectid == inode->i_ino)) {
1517 if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1518 continue;
1519 if (min_key->type < BTRFS_EXTENT_DATA_KEY)
1520 start = 0;
1521 else {
1522 start = min_key->offset;
1523 WARN_ON(!IS_ALIGNED(start, root->sectorsize));
1524 }
1525 } else {
1526 start = 0;
1527 }
1528
1529 if (unlikely(max_key->objectid == inode->i_ino)) {
1530 if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1531 continue;
1532 if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
1533 end = (u64)-1;
1534 } else {
1535 if (max_key->offset == 0)
1536 continue;
1537 end = max_key->offset;
1538 WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1539 end--;
1540 }
1541 } else {
1542 end = (u64)-1;
1543 }
1544
1545 /* the lock_extent waits for readpage to complete */
1546 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1547 btrfs_drop_extent_cache(inode, start, end, 1);
1548 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1549 }
1550 return 0;
1551}
1552
1553static int find_next_key(struct btrfs_path *path, int level,
1554 struct btrfs_key *key)
1555
1556{
1557 while (level < BTRFS_MAX_LEVEL) {
1558 if (!path->nodes[level])
1559 break;
1560 if (path->slots[level] + 1 <
1561 btrfs_header_nritems(path->nodes[level])) {
1562 btrfs_node_key_to_cpu(path->nodes[level], key,
1563 path->slots[level] + 1);
1564 return 0;
1565 }
1566 level++;
1567 }
1568 return 1;
1569}
1570
1571/*
1572 * merge the relocated tree blocks in reloc tree with corresponding
1573 * fs tree.
1574 */
1575static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1576 struct btrfs_root *root)
1577{
1578 LIST_HEAD(inode_list);
1579 struct btrfs_key key;
1580 struct btrfs_key next_key;
1581 struct btrfs_trans_handle *trans;
1582 struct btrfs_root *reloc_root;
1583 struct btrfs_root_item *root_item;
1584 struct btrfs_path *path;
1585 struct extent_buffer *leaf = NULL;
1586 unsigned long nr;
1587 int level;
1588 int max_level;
1589 int replaced = 0;
1590 int ret;
1591 int err = 0;
1592
1593 path = btrfs_alloc_path();
1594 if (!path)
1595 return -ENOMEM;
1596
1597 reloc_root = root->reloc_root;
1598 root_item = &reloc_root->root_item;
1599
1600 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
1601 level = btrfs_root_level(root_item);
1602 extent_buffer_get(reloc_root->node);
1603 path->nodes[level] = reloc_root->node;
1604 path->slots[level] = 0;
1605 } else {
1606 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
1607
1608 level = root_item->drop_level;
1609 BUG_ON(level == 0);
1610 path->lowest_level = level;
1611 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
1612 if (ret < 0) {
1613 btrfs_free_path(path);
1614 return ret;
1615 }
1616
1617 btrfs_node_key_to_cpu(path->nodes[level], &next_key,
1618 path->slots[level]);
1619 WARN_ON(memcmp(&key, &next_key, sizeof(key)));
1620
1621 btrfs_unlock_up_safe(path, 0);
1622 }
1623
1624 if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
1625 trans = btrfs_start_transaction(root, 1);
1626
1627 leaf = path->nodes[0];
1628 btrfs_item_key_to_cpu(leaf, &key, 0);
1629 btrfs_release_path(reloc_root, path);
1630
1631 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1632 if (ret < 0) {
1633 err = ret;
1634 goto out;
1635 }
1636
1637 leaf = path->nodes[0];
1638 btrfs_unlock_up_safe(path, 1);
1639 ret = replace_file_extents(trans, rc, root, leaf,
1640 &inode_list);
1641 if (ret < 0)
1642 err = ret;
1643 goto out;
1644 }
1645
1646 memset(&next_key, 0, sizeof(next_key));
1647
1648 while (1) {
1649 leaf = NULL;
1650 replaced = 0;
1651 trans = btrfs_start_transaction(root, 1);
1652 max_level = level;
1653
1654 ret = walk_down_reloc_tree(reloc_root, path, &level);
1655 if (ret < 0) {
1656 err = ret;
1657 goto out;
1658 }
1659 if (ret > 0)
1660 break;
1661
1662 if (!find_next_key(path, level, &key) &&
1663 btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1664 ret = 0;
1665 } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1666 ret = replace_path(trans, root, reloc_root,
1667 path, &next_key, &leaf,
1668 level, max_level);
1669 } else {
1670 ret = replace_path(trans, root, reloc_root,
1671 path, &next_key, NULL,
1672 level, max_level);
1673 }
1674 if (ret < 0) {
1675 err = ret;
1676 goto out;
1677 }
1678
1679 if (ret > 0) {
1680 level = ret;
1681 btrfs_node_key_to_cpu(path->nodes[level], &key,
1682 path->slots[level]);
1683 replaced = 1;
1684 } else if (leaf) {
1685 /*
1686 * no block got replaced, try replacing file extents
1687 */
1688 btrfs_item_key_to_cpu(leaf, &key, 0);
1689 ret = replace_file_extents(trans, rc, root, leaf,
1690 &inode_list);
1691 btrfs_tree_unlock(leaf);
1692 free_extent_buffer(leaf);
1693 BUG_ON(ret < 0);
1694 }
1695
1696 ret = walk_up_reloc_tree(reloc_root, path, &level);
1697 if (ret > 0)
1698 break;
1699
1700 BUG_ON(level == 0);
1701 /*
1702 * save the merging progress in the drop_progress.
1703 * this is OK since root refs == 1 in this case.
1704 */
1705 btrfs_node_key(path->nodes[level], &root_item->drop_progress,
1706 path->slots[level]);
1707 root_item->drop_level = level;
1708
1709 nr = trans->blocks_used;
1710 btrfs_end_transaction(trans, root);
1711
1712 btrfs_btree_balance_dirty(root, nr);
1713
1714 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1715 invalidate_extent_cache(root, &key, &next_key);
1716 }
1717
1718 /*
1719 * handle the case only one block in the fs tree need to be
1720 * relocated and the block is tree root.
1721 */
1722 leaf = btrfs_lock_root_node(root);
1723 ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
1724 btrfs_tree_unlock(leaf);
1725 free_extent_buffer(leaf);
1726 if (ret < 0)
1727 err = ret;
1728out:
1729 btrfs_free_path(path);
1730
1731 if (err == 0) {
1732 memset(&root_item->drop_progress, 0,
1733 sizeof(root_item->drop_progress));
1734 root_item->drop_level = 0;
1735 btrfs_set_root_refs(root_item, 0);
1736 }
1737
1738 nr = trans->blocks_used;
1739 btrfs_end_transaction(trans, root);
1740
1741 btrfs_btree_balance_dirty(root, nr);
1742
1743 /*
1744 * put inodes while we aren't holding the tree locks
1745 */
1746 while (!list_empty(&inode_list)) {
1747 struct inodevec *ivec;
1748 ivec = list_entry(inode_list.next, struct inodevec, list);
1749 list_del(&ivec->list);
1750 while (ivec->nr > 0) {
1751 ivec->nr--;
1752 iput(ivec->inode[ivec->nr]);
1753 }
1754 kfree(ivec);
1755 }
1756
1757 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1758 invalidate_extent_cache(root, &key, &next_key);
1759
1760 return err;
1761}
1762
1763/*
1764 * callback for the work threads.
1765 * this function merges reloc tree with corresponding fs tree,
1766 * and then drops the reloc tree.
1767 */
1768static void merge_func(struct btrfs_work *work)
1769{
1770 struct btrfs_trans_handle *trans;
1771 struct btrfs_root *root;
1772 struct btrfs_root *reloc_root;
1773 struct async_merge *async;
1774
1775 async = container_of(work, struct async_merge, work);
1776 reloc_root = async->root;
1777
1778 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1779 root = read_fs_root(reloc_root->fs_info,
1780 reloc_root->root_key.offset);
1781 BUG_ON(IS_ERR(root));
1782 BUG_ON(root->reloc_root != reloc_root);
1783
1784 merge_reloc_root(async->rc, root);
1785
1786 trans = btrfs_start_transaction(root, 1);
1787 btrfs_update_reloc_root(trans, root);
1788 btrfs_end_transaction(trans, root);
1789 }
1790
1791 btrfs_drop_dead_root(reloc_root);
1792
1793 if (atomic_dec_and_test(async->num_pending))
1794 complete(async->done);
1795
1796 kfree(async);
1797}
1798
1799static int merge_reloc_roots(struct reloc_control *rc)
1800{
1801 struct async_merge *async;
1802 struct btrfs_root *root;
1803 struct completion done;
1804 atomic_t num_pending;
1805
1806 init_completion(&done);
1807 atomic_set(&num_pending, 1);
1808
1809 while (!list_empty(&rc->reloc_roots)) {
1810 root = list_entry(rc->reloc_roots.next,
1811 struct btrfs_root, root_list);
1812 list_del_init(&root->root_list);
1813
1814 async = kmalloc(sizeof(*async), GFP_NOFS);
1815 BUG_ON(!async);
1816 async->work.func = merge_func;
1817 async->work.flags = 0;
1818 async->rc = rc;
1819 async->root = root;
1820 async->done = &done;
1821 async->num_pending = &num_pending;
1822 atomic_inc(&num_pending);
1823 btrfs_queue_worker(&rc->workers, &async->work);
1824 }
1825
1826 if (!atomic_dec_and_test(&num_pending))
1827 wait_for_completion(&done);
1828
1829 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1830 return 0;
1831}
1832
1833static void free_block_list(struct rb_root *blocks)
1834{
1835 struct tree_block *block;
1836 struct rb_node *rb_node;
1837 while ((rb_node = rb_first(blocks))) {
1838 block = rb_entry(rb_node, struct tree_block, rb_node);
1839 rb_erase(rb_node, blocks);
1840 kfree(block);
1841 }
1842}
1843
1844static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1845 struct btrfs_root *reloc_root)
1846{
1847 struct btrfs_root *root;
1848
1849 if (reloc_root->last_trans == trans->transid)
1850 return 0;
1851
1852 root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
1853 BUG_ON(IS_ERR(root));
1854 BUG_ON(root->reloc_root != reloc_root);
1855
1856 return btrfs_record_root_in_trans(trans, root);
1857}
1858
1859/*
1860 * select one tree from trees that references the block.
1861 * for blocks in refernce counted trees, we preper reloc tree.
1862 * if no reloc tree found and reloc_only is true, NULL is returned.
1863 */
1864static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1865 struct backref_node *node,
1866 struct backref_edge *edges[],
1867 int *nr, int reloc_only)
1868{
1869 struct backref_node *next;
1870 struct btrfs_root *root;
1871 int index;
1872 int loop = 0;
1873again:
1874 index = 0;
1875 next = node;
1876 while (1) {
1877 cond_resched();
1878 next = walk_up_backref(next, edges, &index);
1879 root = next->root;
1880 if (!root) {
1881 BUG_ON(!node->old_root);
1882 goto skip;
1883 }
1884
1885 /* no other choice for non-refernce counted tree */
1886 if (!root->ref_cows) {
1887 BUG_ON(reloc_only);
1888 break;
1889 }
1890
1891 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1892 record_reloc_root_in_trans(trans, root);
1893 break;
1894 }
1895
1896 if (loop) {
1897 btrfs_record_root_in_trans(trans, root);
1898 break;
1899 }
1900
1901 if (reloc_only || next != node) {
1902 if (!root->reloc_root)
1903 btrfs_record_root_in_trans(trans, root);
1904 root = root->reloc_root;
1905 /*
1906 * if the reloc tree was created in current
1907 * transation, there is no node in backref tree
1908 * corresponds to the root of the reloc tree.
1909 */
1910 if (btrfs_root_last_snapshot(&root->root_item) ==
1911 trans->transid - 1)
1912 break;
1913 }
1914skip:
1915 root = NULL;
1916 next = walk_down_backref(edges, &index);
1917 if (!next || next->level <= node->level)
1918 break;
1919 }
1920
1921 if (!root && !loop && !reloc_only) {
1922 loop = 1;
1923 goto again;
1924 }
1925
1926 if (root)
1927 *nr = index;
1928 else
1929 *nr = 0;
1930
1931 return root;
1932}
1933
1934static noinline_for_stack
1935struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1936 struct backref_node *node)
1937{
1938 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1939 int nr;
1940 return __select_one_root(trans, node, edges, &nr, 0);
1941}
1942
1943static noinline_for_stack
1944struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1945 struct backref_node *node,
1946 struct backref_edge *edges[], int *nr)
1947{
1948 return __select_one_root(trans, node, edges, nr, 1);
1949}
1950
1951static void grab_path_buffers(struct btrfs_path *path,
1952 struct backref_node *node,
1953 struct backref_edge *edges[], int nr)
1954{
1955 int i = 0;
1956 while (1) {
1957 drop_node_buffer(node);
1958 node->eb = path->nodes[node->level];
1959 BUG_ON(!node->eb);
1960 if (path->locks[node->level])
1961 node->locked = 1;
1962 path->nodes[node->level] = NULL;
1963 path->locks[node->level] = 0;
1964
1965 if (i >= nr)
1966 break;
1967
1968 edges[i]->blockptr = node->eb->start;
1969 node = edges[i]->node[UPPER];
1970 i++;
1971 }
1972}
1973
1974/*
1975 * relocate a block tree, and then update pointers in upper level
1976 * blocks that reference the block to point to the new location.
1977 *
1978 * if called by link_to_upper, the block has already been relocated.
1979 * in that case this function just updates pointers.
1980 */
1981static int do_relocation(struct btrfs_trans_handle *trans,
1982 struct backref_node *node,
1983 struct btrfs_key *key,
1984 struct btrfs_path *path, int lowest)
1985{
1986 struct backref_node *upper;
1987 struct backref_edge *edge;
1988 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1989 struct btrfs_root *root;
1990 struct extent_buffer *eb;
1991 u32 blocksize;
1992 u64 bytenr;
1993 u64 generation;
1994 int nr;
1995 int slot;
1996 int ret;
1997 int err = 0;
1998
1999 BUG_ON(lowest && node->eb);
2000
2001 path->lowest_level = node->level + 1;
2002 list_for_each_entry(edge, &node->upper, list[LOWER]) {
2003 cond_resched();
2004 if (node->eb && node->eb->start == edge->blockptr)
2005 continue;
2006
2007 upper = edge->node[UPPER];
2008 root = select_reloc_root(trans, upper, edges, &nr);
2009 if (!root)
2010 continue;
2011
2012 if (upper->eb && !upper->locked)
2013 drop_node_buffer(upper);
2014
2015 if (!upper->eb) {
2016 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2017 if (ret < 0) {
2018 err = ret;
2019 break;
2020 }
2021 BUG_ON(ret > 0);
2022
2023 slot = path->slots[upper->level];
2024
2025 btrfs_unlock_up_safe(path, upper->level + 1);
2026 grab_path_buffers(path, upper, edges, nr);
2027
2028 btrfs_release_path(NULL, path);
2029 } else {
2030 ret = btrfs_bin_search(upper->eb, key, upper->level,
2031 &slot);
2032 BUG_ON(ret);
2033 }
2034
2035 bytenr = btrfs_node_blockptr(upper->eb, slot);
2036 if (!lowest) {
2037 if (node->eb->start == bytenr) {
2038 btrfs_tree_unlock(upper->eb);
2039 upper->locked = 0;
2040 continue;
2041 }
2042 } else {
2043 BUG_ON(node->bytenr != bytenr);
2044 }
2045
2046 blocksize = btrfs_level_size(root, node->level);
2047 generation = btrfs_node_ptr_generation(upper->eb, slot);
2048 eb = read_tree_block(root, bytenr, blocksize, generation);
2049 btrfs_tree_lock(eb);
2050 btrfs_set_lock_blocking(eb);
2051
2052 if (!node->eb) {
2053 ret = btrfs_cow_block(trans, root, eb, upper->eb,
2054 slot, &eb);
2055 if (ret < 0) {
2056 err = ret;
2057 break;
2058 }
2059 btrfs_set_lock_blocking(eb);
2060 node->eb = eb;
2061 node->locked = 1;
2062 } else {
2063 btrfs_set_node_blockptr(upper->eb, slot,
2064 node->eb->start);
2065 btrfs_set_node_ptr_generation(upper->eb, slot,
2066 trans->transid);
2067 btrfs_mark_buffer_dirty(upper->eb);
2068
2069 ret = btrfs_inc_extent_ref(trans, root,
2070 node->eb->start, blocksize,
2071 upper->eb->start,
2072 btrfs_header_owner(upper->eb),
2073 node->level, 0);
2074 BUG_ON(ret);
2075
2076 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2077 BUG_ON(ret);
2078
2079 btrfs_tree_unlock(eb);
2080 free_extent_buffer(eb);
2081 }
2082 if (!lowest) {
2083 btrfs_tree_unlock(upper->eb);
2084 upper->locked = 0;
2085 }
2086 }
2087 path->lowest_level = 0;
2088 return err;
2089}
2090
2091static int link_to_upper(struct btrfs_trans_handle *trans,
2092 struct backref_node *node,
2093 struct btrfs_path *path)
2094{
2095 struct btrfs_key key;
2096 if (!node->eb || list_empty(&node->upper))
2097 return 0;
2098
2099 btrfs_node_key_to_cpu(node->eb, &key, 0);
2100 return do_relocation(trans, node, &key, path, 0);
2101}
2102
2103static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2104 struct backref_cache *cache,
2105 struct btrfs_path *path)
2106{
2107 struct backref_node *node;
2108 int level;
2109 int ret;
2110 int err = 0;
2111
2112 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2113 while (!list_empty(&cache->pending[level])) {
2114 node = list_entry(cache->pending[level].next,
2115 struct backref_node, lower);
2116 BUG_ON(node->level != level);
2117
2118 ret = link_to_upper(trans, node, path);
2119 if (ret < 0)
2120 err = ret;
2121 /*
2122 * this remove the node from the pending list and
2123 * may add some other nodes to the level + 1
2124 * pending list
2125 */
2126 remove_backref_node(cache, node);
2127 }
2128 }
2129 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2130 return err;
2131}
2132
2133static void mark_block_processed(struct reloc_control *rc,
2134 struct backref_node *node)
2135{
2136 u32 blocksize;
2137 if (node->level == 0 ||
2138 in_block_group(node->bytenr, rc->block_group)) {
2139 blocksize = btrfs_level_size(rc->extent_root, node->level);
2140 set_extent_bits(&rc->processed_blocks, node->bytenr,
2141 node->bytenr + blocksize - 1, EXTENT_DIRTY,
2142 GFP_NOFS);
2143 }
2144 node->processed = 1;
2145}
2146
2147/*
2148 * mark a block and all blocks directly/indirectly reference the block
2149 * as processed.
2150 */
2151static void update_processed_blocks(struct reloc_control *rc,
2152 struct backref_node *node)
2153{
2154 struct backref_node *next = node;
2155 struct backref_edge *edge;
2156 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2157 int index = 0;
2158
2159 while (next) {
2160 cond_resched();
2161 while (1) {
2162 if (next->processed)
2163 break;
2164
2165 mark_block_processed(rc, next);
2166
2167 if (list_empty(&next->upper))
2168 break;
2169
2170 edge = list_entry(next->upper.next,
2171 struct backref_edge, list[LOWER]);
2172 edges[index++] = edge;
2173 next = edge->node[UPPER];
2174 }
2175 next = walk_down_backref(edges, &index);
2176 }
2177}
2178
2179static int tree_block_processed(u64 bytenr, u32 blocksize,
2180 struct reloc_control *rc)
2181{
2182 if (test_range_bit(&rc->processed_blocks, bytenr,
2183 bytenr + blocksize - 1, EXTENT_DIRTY, 1))
2184 return 1;
2185 return 0;
2186}
2187
2188/*
2189 * check if there are any file extent pointers in the leaf point to
2190 * data require processing
2191 */
2192static int check_file_extents(struct reloc_control *rc,
2193 u64 bytenr, u32 blocksize, u64 ptr_gen)
2194{
2195 struct btrfs_key found_key;
2196 struct btrfs_file_extent_item *fi;
2197 struct extent_buffer *leaf;
2198 u32 nritems;
2199 int i;
2200 int ret = 0;
2201
2202 leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2203
2204 nritems = btrfs_header_nritems(leaf);
2205 for (i = 0; i < nritems; i++) {
2206 cond_resched();
2207 btrfs_item_key_to_cpu(leaf, &found_key, i);
2208 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2209 continue;
2210 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2211 if (btrfs_file_extent_type(leaf, fi) ==
2212 BTRFS_FILE_EXTENT_INLINE)
2213 continue;
2214 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2215 if (bytenr == 0)
2216 continue;
2217 if (in_block_group(bytenr, rc->block_group)) {
2218 ret = 1;
2219 break;
2220 }
2221 }
2222 free_extent_buffer(leaf);
2223 return ret;
2224}
2225
2226/*
2227 * scan child blocks of a given block to find blocks require processing
2228 */
2229static int add_child_blocks(struct btrfs_trans_handle *trans,
2230 struct reloc_control *rc,
2231 struct backref_node *node,
2232 struct rb_root *blocks)
2233{
2234 struct tree_block *block;
2235 struct rb_node *rb_node;
2236 u64 bytenr;
2237 u64 ptr_gen;
2238 u32 blocksize;
2239 u32 nritems;
2240 int i;
2241 int err = 0;
2242
2243 nritems = btrfs_header_nritems(node->eb);
2244 blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2245 for (i = 0; i < nritems; i++) {
2246 cond_resched();
2247 bytenr = btrfs_node_blockptr(node->eb, i);
2248 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2249 if (ptr_gen == trans->transid)
2250 continue;
2251 if (!in_block_group(bytenr, rc->block_group) &&
2252 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2253 continue;
2254 if (tree_block_processed(bytenr, blocksize, rc))
2255 continue;
2256
2257 readahead_tree_block(rc->extent_root,
2258 bytenr, blocksize, ptr_gen);
2259 }
2260
2261 for (i = 0; i < nritems; i++) {
2262 cond_resched();
2263 bytenr = btrfs_node_blockptr(node->eb, i);
2264 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2265 if (ptr_gen == trans->transid)
2266 continue;
2267 if (!in_block_group(bytenr, rc->block_group) &&
2268 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2269 continue;
2270 if (tree_block_processed(bytenr, blocksize, rc))
2271 continue;
2272 if (!in_block_group(bytenr, rc->block_group) &&
2273 !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2274 continue;
2275
2276 block = kmalloc(sizeof(*block), GFP_NOFS);
2277 if (!block) {
2278 err = -ENOMEM;
2279 break;
2280 }
2281 block->bytenr = bytenr;
2282 btrfs_node_key_to_cpu(node->eb, &block->key, i);
2283 block->level = node->level - 1;
2284 block->key_ready = 1;
2285 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2286 BUG_ON(rb_node);
2287 }
2288 if (err)
2289 free_block_list(blocks);
2290 return err;
2291}
2292
2293/*
2294 * find adjacent blocks require processing
2295 */
2296static noinline_for_stack
2297int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2298 struct reloc_control *rc,
2299 struct backref_cache *cache,
2300 struct rb_root *blocks, int level,
2301 struct backref_node **upper)
2302{
2303 struct backref_node *node;
2304 int ret = 0;
2305
2306 WARN_ON(!list_empty(&cache->pending[level]));
2307
2308 if (list_empty(&cache->pending[level + 1]))
2309 return 1;
2310
2311 node = list_entry(cache->pending[level + 1].next,
2312 struct backref_node, lower);
2313 if (node->eb)
2314 ret = add_child_blocks(trans, rc, node, blocks);
2315
2316 *upper = node;
2317 return ret;
2318}
2319
2320static int get_tree_block_key(struct reloc_control *rc,
2321 struct tree_block *block)
2322{
2323 struct extent_buffer *eb;
2324
2325 BUG_ON(block->key_ready);
2326 eb = read_tree_block(rc->extent_root, block->bytenr,
2327 block->key.objectid, block->key.offset);
2328 WARN_ON(btrfs_header_level(eb) != block->level);
2329 if (block->level == 0)
2330 btrfs_item_key_to_cpu(eb, &block->key, 0);
2331 else
2332 btrfs_node_key_to_cpu(eb, &block->key, 0);
2333 free_extent_buffer(eb);
2334 block->key_ready = 1;
2335 return 0;
2336}
2337
2338static int reada_tree_block(struct reloc_control *rc,
2339 struct tree_block *block)
2340{
2341 BUG_ON(block->key_ready);
2342 readahead_tree_block(rc->extent_root, block->bytenr,
2343 block->key.objectid, block->key.offset);
2344 return 0;
2345}
2346
2347/*
2348 * helper function to relocate a tree block
2349 */
2350static int relocate_tree_block(struct btrfs_trans_handle *trans,
2351 struct reloc_control *rc,
2352 struct backref_node *node,
2353 struct btrfs_key *key,
2354 struct btrfs_path *path)
2355{
2356 struct btrfs_root *root;
2357 int ret;
2358
2359 root = select_one_root(trans, node);
2360 if (unlikely(!root)) {
2361 rc->found_old_snapshot = 1;
2362 update_processed_blocks(rc, node);
2363 return 0;
2364 }
2365
2366 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2367 ret = do_relocation(trans, node, key, path, 1);
2368 if (ret < 0)
2369 goto out;
2370 if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2371 ret = replace_file_extents(trans, rc, root,
2372 node->eb, NULL);
2373 if (ret < 0)
2374 goto out;
2375 }
2376 drop_node_buffer(node);
2377 } else if (!root->ref_cows) {
2378 path->lowest_level = node->level;
2379 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2380 btrfs_release_path(root, path);
2381 if (ret < 0)
2382 goto out;
2383 } else if (root != node->root) {
2384 WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2385 }
2386
2387 update_processed_blocks(rc, node);
2388 ret = 0;
2389out:
2390 drop_node_buffer(node);
2391 return ret;
2392}
2393
2394/*
2395 * relocate a list of blocks
2396 */
2397static noinline_for_stack
2398int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2399 struct reloc_control *rc, struct rb_root *blocks)
2400{
2401 struct backref_cache *cache;
2402 struct backref_node *node;
2403 struct btrfs_path *path;
2404 struct tree_block *block;
2405 struct rb_node *rb_node;
2406 int level = -1;
2407 int ret;
2408 int err = 0;
2409
2410 path = btrfs_alloc_path();
2411 if (!path)
2412 return -ENOMEM;
2413
2414 cache = kmalloc(sizeof(*cache), GFP_NOFS);
2415 if (!cache) {
2416 btrfs_free_path(path);
2417 return -ENOMEM;
2418 }
2419
2420 backref_cache_init(cache);
2421
2422 rb_node = rb_first(blocks);
2423 while (rb_node) {
2424 block = rb_entry(rb_node, struct tree_block, rb_node);
2425 if (level == -1)
2426 level = block->level;
2427 else
2428 BUG_ON(level != block->level);
2429 if (!block->key_ready)
2430 reada_tree_block(rc, block);
2431 rb_node = rb_next(rb_node);
2432 }
2433
2434 rb_node = rb_first(blocks);
2435 while (rb_node) {
2436 block = rb_entry(rb_node, struct tree_block, rb_node);
2437 if (!block->key_ready)
2438 get_tree_block_key(rc, block);
2439 rb_node = rb_next(rb_node);
2440 }
2441
2442 rb_node = rb_first(blocks);
2443 while (rb_node) {
2444 block = rb_entry(rb_node, struct tree_block, rb_node);
2445
2446 node = build_backref_tree(rc, cache, &block->key,
2447 block->level, block->bytenr);
2448 if (IS_ERR(node)) {
2449 err = PTR_ERR(node);
2450 goto out;
2451 }
2452
2453 ret = relocate_tree_block(trans, rc, node, &block->key,
2454 path);
2455 if (ret < 0) {
2456 err = ret;
2457 goto out;
2458 }
2459 remove_backref_node(cache, node);
2460 rb_node = rb_next(rb_node);
2461 }
2462
2463 if (level > 0)
2464 goto out;
2465
2466 free_block_list(blocks);
2467
2468 /*
2469 * now backrefs of some upper level tree blocks have been cached,
2470 * try relocating blocks referenced by these upper level blocks.
2471 */
2472 while (1) {
2473 struct backref_node *upper = NULL;
2474 if (trans->transaction->in_commit ||
2475 trans->transaction->delayed_refs.flushing)
2476 break;
2477
2478 ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
2479 &upper);
2480 if (ret < 0)
2481 err = ret;
2482 if (ret != 0)
2483 break;
2484
2485 rb_node = rb_first(blocks);
2486 while (rb_node) {
2487 block = rb_entry(rb_node, struct tree_block, rb_node);
2488 if (trans->transaction->in_commit ||
2489 trans->transaction->delayed_refs.flushing)
2490 goto out;
2491 BUG_ON(!block->key_ready);
2492 node = build_backref_tree(rc, cache, &block->key,
2493 level, block->bytenr);
2494 if (IS_ERR(node)) {
2495 err = PTR_ERR(node);
2496 goto out;
2497 }
2498
2499 ret = relocate_tree_block(trans, rc, node,
2500 &block->key, path);
2501 if (ret < 0) {
2502 err = ret;
2503 goto out;
2504 }
2505 remove_backref_node(cache, node);
2506 rb_node = rb_next(rb_node);
2507 }
2508 free_block_list(blocks);
2509
2510 if (upper) {
2511 ret = link_to_upper(trans, upper, path);
2512 if (ret < 0) {
2513 err = ret;
2514 break;
2515 }
2516 remove_backref_node(cache, upper);
2517 }
2518 }
2519out:
2520 free_block_list(blocks);
2521
2522 ret = finish_pending_nodes(trans, cache, path);
2523 if (ret < 0)
2524 err = ret;
2525
2526 kfree(cache);
2527 btrfs_free_path(path);
2528 return err;
2529}
2530
2531static noinline_for_stack
2532int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
2533{
2534 u64 page_start;
2535 u64 page_end;
2536 unsigned long i;
2537 unsigned long first_index;
2538 unsigned long last_index;
2539 unsigned int total_read = 0;
2540 unsigned int total_dirty = 0;
2541 struct page *page;
2542 struct file_ra_state *ra;
2543 struct btrfs_ordered_extent *ordered;
2544 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2545 int ret = 0;
2546
2547 ra = kzalloc(sizeof(*ra), GFP_NOFS);
2548 if (!ra)
2549 return -ENOMEM;
2550
2551 mutex_lock(&inode->i_mutex);
2552 first_index = start >> PAGE_CACHE_SHIFT;
2553 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
2554
2555 /* make sure the dirty trick played by the caller work */
2556 ret = invalidate_inode_pages2_range(inode->i_mapping,
2557 first_index, last_index);
2558 if (ret)
2559 goto out_unlock;
2560
2561 file_ra_state_init(ra, inode->i_mapping);
2562
2563 for (i = first_index ; i <= last_index; i++) {
2564 if (total_read % ra->ra_pages == 0) {
2565 btrfs_force_ra(inode->i_mapping, ra, NULL, i,
2566 min(last_index, ra->ra_pages + i - 1));
2567 }
2568 total_read++;
2569again:
2570 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
2571 BUG_ON(1);
2572 page = grab_cache_page(inode->i_mapping, i);
2573 if (!page) {
2574 ret = -ENOMEM;
2575 goto out_unlock;
2576 }
2577 if (!PageUptodate(page)) {
2578 btrfs_readpage(NULL, page);
2579 lock_page(page);
2580 if (!PageUptodate(page)) {
2581 unlock_page(page);
2582 page_cache_release(page);
2583 ret = -EIO;
2584 goto out_unlock;
2585 }
2586 }
2587 wait_on_page_writeback(page);
2588
2589 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2590 page_end = page_start + PAGE_CACHE_SIZE - 1;
2591 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
2592
2593 ordered = btrfs_lookup_ordered_extent(inode, page_start);
2594 if (ordered) {
2595 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2596 unlock_page(page);
2597 page_cache_release(page);
2598 btrfs_start_ordered_extent(inode, ordered, 1);
2599 btrfs_put_ordered_extent(ordered);
2600 goto again;
2601 }
2602 set_page_extent_mapped(page);
2603
2604 if (i == first_index)
2605 set_extent_bits(io_tree, page_start, page_end,
2606 EXTENT_BOUNDARY, GFP_NOFS);
2607 btrfs_set_extent_delalloc(inode, page_start, page_end);
2608
2609 set_page_dirty(page);
2610 total_dirty++;
2611
2612 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2613 unlock_page(page);
2614 page_cache_release(page);
2615 }
2616out_unlock:
2617 mutex_unlock(&inode->i_mutex);
2618 kfree(ra);
2619 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
2620 return ret;
2621}
2622
2623static noinline_for_stack
2624int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key)
2625{
2626 struct btrfs_root *root = BTRFS_I(inode)->root;
2627 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2628 struct extent_map *em;
2629 u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt;
2630 u64 end = start + extent_key->offset - 1;
2631
2632 em = alloc_extent_map(GFP_NOFS);
2633 em->start = start;
2634 em->len = extent_key->offset;
2635 em->block_len = extent_key->offset;
2636 em->block_start = extent_key->objectid;
2637 em->bdev = root->fs_info->fs_devices->latest_bdev;
2638 set_bit(EXTENT_FLAG_PINNED, &em->flags);
2639
2640 /* setup extent map to cheat btrfs_readpage */
2641 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2642 while (1) {
2643 int ret;
2644 spin_lock(&em_tree->lock);
2645 ret = add_extent_mapping(em_tree, em);
2646 spin_unlock(&em_tree->lock);
2647 if (ret != -EEXIST) {
2648 free_extent_map(em);
2649 break;
2650 }
2651 btrfs_drop_extent_cache(inode, start, end, 0);
2652 }
2653 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2654
2655 return relocate_inode_pages(inode, start, extent_key->offset);
2656}
2657
2658#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2659static int get_ref_objectid_v0(struct reloc_control *rc,
2660 struct btrfs_path *path,
2661 struct btrfs_key *extent_key,
2662 u64 *ref_objectid, int *path_change)
2663{
2664 struct btrfs_key key;
2665 struct extent_buffer *leaf;
2666 struct btrfs_extent_ref_v0 *ref0;
2667 int ret;
2668 int slot;
2669
2670 leaf = path->nodes[0];
2671 slot = path->slots[0];
2672 while (1) {
2673 if (slot >= btrfs_header_nritems(leaf)) {
2674 ret = btrfs_next_leaf(rc->extent_root, path);
2675 if (ret < 0)
2676 return ret;
2677 BUG_ON(ret > 0);
2678 leaf = path->nodes[0];
2679 slot = path->slots[0];
2680 if (path_change)
2681 *path_change = 1;
2682 }
2683 btrfs_item_key_to_cpu(leaf, &key, slot);
2684 if (key.objectid != extent_key->objectid)
2685 return -ENOENT;
2686
2687 if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
2688 slot++;
2689 continue;
2690 }
2691 ref0 = btrfs_item_ptr(leaf, slot,
2692 struct btrfs_extent_ref_v0);
2693 *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
2694 break;
2695 }
2696 return 0;
2697}
2698#endif
2699
2700/*
2701 * helper to add a tree block to the list.
2702 * the major work is getting the generation and level of the block
2703 */
2704static int add_tree_block(struct reloc_control *rc,
2705 struct btrfs_key *extent_key,
2706 struct btrfs_path *path,
2707 struct rb_root *blocks)
2708{
2709 struct extent_buffer *eb;
2710 struct btrfs_extent_item *ei;
2711 struct btrfs_tree_block_info *bi;
2712 struct tree_block *block;
2713 struct rb_node *rb_node;
2714 u32 item_size;
2715 int level = -1;
2716 int generation;
2717
2718 eb = path->nodes[0];
2719 item_size = btrfs_item_size_nr(eb, path->slots[0]);
2720
2721 if (item_size >= sizeof(*ei) + sizeof(*bi)) {
2722 ei = btrfs_item_ptr(eb, path->slots[0],
2723 struct btrfs_extent_item);
2724 bi = (struct btrfs_tree_block_info *)(ei + 1);
2725 generation = btrfs_extent_generation(eb, ei);
2726 level = btrfs_tree_block_level(eb, bi);
2727 } else {
2728#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2729 u64 ref_owner;
2730 int ret;
2731
2732 BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2733 ret = get_ref_objectid_v0(rc, path, extent_key,
2734 &ref_owner, NULL);
2735 BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
2736 level = (int)ref_owner;
2737 /* FIXME: get real generation */
2738 generation = 0;
2739#else
2740 BUG();
2741#endif
2742 }
2743
2744 btrfs_release_path(rc->extent_root, path);
2745
2746 BUG_ON(level == -1);
2747
2748 block = kmalloc(sizeof(*block), GFP_NOFS);
2749 if (!block)
2750 return -ENOMEM;
2751
2752 block->bytenr = extent_key->objectid;
2753 block->key.objectid = extent_key->offset;
2754 block->key.offset = generation;
2755 block->level = level;
2756 block->key_ready = 0;
2757
2758 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2759 BUG_ON(rb_node);
2760
2761 return 0;
2762}
2763
2764/*
2765 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
2766 */
2767static int __add_tree_block(struct reloc_control *rc,
2768 u64 bytenr, u32 blocksize,
2769 struct rb_root *blocks)
2770{
2771 struct btrfs_path *path;
2772 struct btrfs_key key;
2773 int ret;
2774
2775 if (tree_block_processed(bytenr, blocksize, rc))
2776 return 0;
2777
2778 if (tree_search(blocks, bytenr))
2779 return 0;
2780
2781 path = btrfs_alloc_path();
2782 if (!path)
2783 return -ENOMEM;
2784
2785 key.objectid = bytenr;
2786 key.type = BTRFS_EXTENT_ITEM_KEY;
2787 key.offset = blocksize;
2788
2789 path->search_commit_root = 1;
2790 path->skip_locking = 1;
2791 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
2792 if (ret < 0)
2793 goto out;
2794 BUG_ON(ret);
2795
2796 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2797 ret = add_tree_block(rc, &key, path, blocks);
2798out:
2799 btrfs_free_path(path);
2800 return ret;
2801}
2802
2803/*
2804 * helper to check if the block use full backrefs for pointers in it
2805 */
2806static int block_use_full_backref(struct reloc_control *rc,
2807 struct extent_buffer *eb)
2808{
2809 struct btrfs_path *path;
2810 struct btrfs_extent_item *ei;
2811 struct btrfs_key key;
2812 u64 flags;
2813 int ret;
2814
2815 if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
2816 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2817 return 1;
2818
2819 path = btrfs_alloc_path();
2820 BUG_ON(!path);
2821
2822 key.objectid = eb->start;
2823 key.type = BTRFS_EXTENT_ITEM_KEY;
2824 key.offset = eb->len;
2825
2826 path->search_commit_root = 1;
2827 path->skip_locking = 1;
2828 ret = btrfs_search_slot(NULL, rc->extent_root,
2829 &key, path, 0, 0);
2830 BUG_ON(ret);
2831
2832 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2833 struct btrfs_extent_item);
2834 flags = btrfs_extent_flags(path->nodes[0], ei);
2835 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2836 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2837 ret = 1;
2838 else
2839 ret = 0;
2840 btrfs_free_path(path);
2841 return ret;
2842}
2843
2844/*
2845 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
2846 * this function scans fs tree to find blocks reference the data extent
2847 */
2848static int find_data_references(struct reloc_control *rc,
2849 struct btrfs_key *extent_key,
2850 struct extent_buffer *leaf,
2851 struct btrfs_extent_data_ref *ref,
2852 struct rb_root *blocks)
2853{
2854 struct btrfs_path *path;
2855 struct tree_block *block;
2856 struct btrfs_root *root;
2857 struct btrfs_file_extent_item *fi;
2858 struct rb_node *rb_node;
2859 struct btrfs_key key;
2860 u64 ref_root;
2861 u64 ref_objectid;
2862 u64 ref_offset;
2863 u32 ref_count;
2864 u32 nritems;
2865 int err = 0;
2866 int added = 0;
2867 int counted;
2868 int ret;
2869
2870 path = btrfs_alloc_path();
2871 if (!path)
2872 return -ENOMEM;
2873
2874 ref_root = btrfs_extent_data_ref_root(leaf, ref);
2875 ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
2876 ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
2877 ref_count = btrfs_extent_data_ref_count(leaf, ref);
2878
2879 root = read_fs_root(rc->extent_root->fs_info, ref_root);
2880 if (IS_ERR(root)) {
2881 err = PTR_ERR(root);
2882 goto out;
2883 }
2884
2885 key.objectid = ref_objectid;
2886 key.offset = ref_offset;
2887 key.type = BTRFS_EXTENT_DATA_KEY;
2888
2889 path->search_commit_root = 1;
2890 path->skip_locking = 1;
2891 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2892 if (ret < 0) {
2893 err = ret;
2894 goto out;
2895 }
2896
2897 leaf = path->nodes[0];
2898 nritems = btrfs_header_nritems(leaf);
2899 /*
2900 * the references in tree blocks that use full backrefs
2901 * are not counted in
2902 */
2903 if (block_use_full_backref(rc, leaf))
2904 counted = 0;
2905 else
2906 counted = 1;
2907 rb_node = tree_search(blocks, leaf->start);
2908 if (rb_node) {
2909 if (counted)
2910 added = 1;
2911 else
2912 path->slots[0] = nritems;
2913 }
2914
2915 while (ref_count > 0) {
2916 while (path->slots[0] >= nritems) {
2917 ret = btrfs_next_leaf(root, path);
2918 if (ret < 0) {
2919 err = ret;
2920 goto out;
2921 }
2922 if (ret > 0) {
2923 WARN_ON(1);
2924 goto out;
2925 }
2926
2927 leaf = path->nodes[0];
2928 nritems = btrfs_header_nritems(leaf);
2929 added = 0;
2930
2931 if (block_use_full_backref(rc, leaf))
2932 counted = 0;
2933 else
2934 counted = 1;
2935 rb_node = tree_search(blocks, leaf->start);
2936 if (rb_node) {
2937 if (counted)
2938 added = 1;
2939 else
2940 path->slots[0] = nritems;
2941 }
2942 }
2943
2944 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2945 if (key.objectid != ref_objectid ||
2946 key.type != BTRFS_EXTENT_DATA_KEY) {
2947 WARN_ON(1);
2948 break;
2949 }
2950
2951 fi = btrfs_item_ptr(leaf, path->slots[0],
2952 struct btrfs_file_extent_item);
2953
2954 if (btrfs_file_extent_type(leaf, fi) ==
2955 BTRFS_FILE_EXTENT_INLINE)
2956 goto next;
2957
2958 if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
2959 extent_key->objectid)
2960 goto next;
2961
2962 key.offset -= btrfs_file_extent_offset(leaf, fi);
2963 if (key.offset != ref_offset)
2964 goto next;
2965
2966 if (counted)
2967 ref_count--;
2968 if (added)
2969 goto next;
2970
2971 if (!tree_block_processed(leaf->start, leaf->len, rc)) {
2972 block = kmalloc(sizeof(*block), GFP_NOFS);
2973 if (!block) {
2974 err = -ENOMEM;
2975 break;
2976 }
2977 block->bytenr = leaf->start;
2978 btrfs_item_key_to_cpu(leaf, &block->key, 0);
2979 block->level = 0;
2980 block->key_ready = 1;
2981 rb_node = tree_insert(blocks, block->bytenr,
2982 &block->rb_node);
2983 BUG_ON(rb_node);
2984 }
2985 if (counted)
2986 added = 1;
2987 else
2988 path->slots[0] = nritems;
2989next:
2990 path->slots[0]++;
2991
2992 }
2993out:
2994 btrfs_free_path(path);
2995 return err;
2996}
2997
2998/*
2999 * hepler to find all tree blocks that reference a given data extent
3000 */
3001static noinline_for_stack
3002int add_data_references(struct reloc_control *rc,
3003 struct btrfs_key *extent_key,
3004 struct btrfs_path *path,
3005 struct rb_root *blocks)
3006{
3007 struct btrfs_key key;
3008 struct extent_buffer *eb;
3009 struct btrfs_extent_data_ref *dref;
3010 struct btrfs_extent_inline_ref *iref;
3011 unsigned long ptr;
3012 unsigned long end;
3013 u32 blocksize;
3014 int ret;
3015 int err = 0;
3016
3017 ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3018 extent_key->offset);
3019 BUG_ON(ret < 0);
3020 if (ret > 0) {
3021 /* the relocated data is fragmented */
3022 rc->extents_skipped++;
3023 btrfs_release_path(rc->extent_root, path);
3024 return 0;
3025 }
3026
3027 blocksize = btrfs_level_size(rc->extent_root, 0);
3028
3029 eb = path->nodes[0];
3030 ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3031 end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3032#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3033 if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3034 ptr = end;
3035 else
3036#endif
3037 ptr += sizeof(struct btrfs_extent_item);
3038
3039 while (ptr < end) {
3040 iref = (struct btrfs_extent_inline_ref *)ptr;
3041 key.type = btrfs_extent_inline_ref_type(eb, iref);
3042 if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3043 key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3044 ret = __add_tree_block(rc, key.offset, blocksize,
3045 blocks);
3046 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3047 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3048 ret = find_data_references(rc, extent_key,
3049 eb, dref, blocks);
3050 } else {
3051 BUG();
3052 }
3053 ptr += btrfs_extent_inline_ref_size(key.type);
3054 }
3055 WARN_ON(ptr > end);
3056
3057 while (1) {
3058 cond_resched();
3059 eb = path->nodes[0];
3060 if (path->slots[0] >= btrfs_header_nritems(eb)) {
3061 ret = btrfs_next_leaf(rc->extent_root, path);
3062 if (ret < 0) {
3063 err = ret;
3064 break;
3065 }
3066 if (ret > 0)
3067 break;
3068 eb = path->nodes[0];
3069 }
3070
3071 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3072 if (key.objectid != extent_key->objectid)
3073 break;
3074
3075#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3076 if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3077 key.type == BTRFS_EXTENT_REF_V0_KEY) {
3078#else
3079 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3080 if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3081#endif
3082 ret = __add_tree_block(rc, key.offset, blocksize,
3083 blocks);
3084 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3085 dref = btrfs_item_ptr(eb, path->slots[0],
3086 struct btrfs_extent_data_ref);
3087 ret = find_data_references(rc, extent_key,
3088 eb, dref, blocks);
3089 } else {
3090 ret = 0;
3091 }
3092 if (ret) {
3093 err = ret;
3094 break;
3095 }
3096 path->slots[0]++;
3097 }
3098 btrfs_release_path(rc->extent_root, path);
3099 if (err)
3100 free_block_list(blocks);
3101 return err;
3102}
3103
3104/*
3105 * hepler to find next unprocessed extent
3106 */
3107static noinline_for_stack
3108int find_next_extent(struct btrfs_trans_handle *trans,
3109 struct reloc_control *rc, struct btrfs_path *path)
3110{
3111 struct btrfs_key key;
3112 struct extent_buffer *leaf;
3113 u64 start, end, last;
3114 int ret;
3115
3116 last = rc->block_group->key.objectid + rc->block_group->key.offset;
3117 while (1) {
3118 cond_resched();
3119 if (rc->search_start >= last) {
3120 ret = 1;
3121 break;
3122 }
3123
3124 key.objectid = rc->search_start;
3125 key.type = BTRFS_EXTENT_ITEM_KEY;
3126 key.offset = 0;
3127
3128 path->search_commit_root = 1;
3129 path->skip_locking = 1;
3130 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3131 0, 0);
3132 if (ret < 0)
3133 break;
3134next:
3135 leaf = path->nodes[0];
3136 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3137 ret = btrfs_next_leaf(rc->extent_root, path);
3138 if (ret != 0)
3139 break;
3140 leaf = path->nodes[0];
3141 }
3142
3143 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3144 if (key.objectid >= last) {
3145 ret = 1;
3146 break;
3147 }
3148
3149 if (key.type != BTRFS_EXTENT_ITEM_KEY ||
3150 key.objectid + key.offset <= rc->search_start) {
3151 path->slots[0]++;
3152 goto next;
3153 }
3154
3155 ret = find_first_extent_bit(&rc->processed_blocks,
3156 key.objectid, &start, &end,
3157 EXTENT_DIRTY);
3158
3159 if (ret == 0 && start <= key.objectid) {
3160 btrfs_release_path(rc->extent_root, path);
3161 rc->search_start = end + 1;
3162 } else {
3163 rc->search_start = key.objectid + key.offset;
3164 return 0;
3165 }
3166 }
3167 btrfs_release_path(rc->extent_root, path);
3168 return ret;
3169}
3170
3171static void set_reloc_control(struct reloc_control *rc)
3172{
3173 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3174 mutex_lock(&fs_info->trans_mutex);
3175 fs_info->reloc_ctl = rc;
3176 mutex_unlock(&fs_info->trans_mutex);
3177}
3178
3179static void unset_reloc_control(struct reloc_control *rc)
3180{
3181 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3182 mutex_lock(&fs_info->trans_mutex);
3183 fs_info->reloc_ctl = NULL;
3184 mutex_unlock(&fs_info->trans_mutex);
3185}
3186
3187static int check_extent_flags(u64 flags)
3188{
3189 if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3190 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3191 return 1;
3192 if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3193 !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3194 return 1;
3195 if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3196 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3197 return 1;
3198 return 0;
3199}
3200
3201static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3202{
3203 struct rb_root blocks = RB_ROOT;
3204 struct btrfs_key key;
3205 struct btrfs_trans_handle *trans = NULL;
3206 struct btrfs_path *path;
3207 struct btrfs_extent_item *ei;
3208 unsigned long nr;
3209 u64 flags;
3210 u32 item_size;
3211 int ret;
3212 int err = 0;
3213
3214 path = btrfs_alloc_path();
3215 if (!path)
3216 return -ENOMEM;
3217
3218 rc->search_start = rc->block_group->key.objectid;
3219 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3220 GFP_NOFS);
3221
3222 rc->create_reloc_root = 1;
3223 set_reloc_control(rc);
3224
3225 trans = btrfs_start_transaction(rc->extent_root, 1);
3226 btrfs_commit_transaction(trans, rc->extent_root);
3227
3228 while (1) {
3229 trans = btrfs_start_transaction(rc->extent_root, 1);
3230
3231 ret = find_next_extent(trans, rc, path);
3232 if (ret < 0)
3233 err = ret;
3234 if (ret != 0)
3235 break;
3236
3237 rc->extents_found++;
3238
3239 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3240 struct btrfs_extent_item);
3241 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3242 item_size = btrfs_item_size_nr(path->nodes[0],
3243 path->slots[0]);
3244 if (item_size >= sizeof(*ei)) {
3245 flags = btrfs_extent_flags(path->nodes[0], ei);
3246 ret = check_extent_flags(flags);
3247 BUG_ON(ret);
3248
3249 } else {
3250#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3251 u64 ref_owner;
3252 int path_change = 0;
3253
3254 BUG_ON(item_size !=
3255 sizeof(struct btrfs_extent_item_v0));
3256 ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
3257 &path_change);
3258 if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
3259 flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
3260 else
3261 flags = BTRFS_EXTENT_FLAG_DATA;
3262
3263 if (path_change) {
3264 btrfs_release_path(rc->extent_root, path);
3265
3266 path->search_commit_root = 1;
3267 path->skip_locking = 1;
3268 ret = btrfs_search_slot(NULL, rc->extent_root,
3269 &key, path, 0, 0);
3270 if (ret < 0) {
3271 err = ret;
3272 break;
3273 }
3274 BUG_ON(ret > 0);
3275 }
3276#else
3277 BUG();
3278#endif
3279 }
3280
3281 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3282 ret = add_tree_block(rc, &key, path, &blocks);
3283 } else if (rc->stage == UPDATE_DATA_PTRS &&
3284 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3285 ret = add_data_references(rc, &key, path, &blocks);
3286 } else {
3287 btrfs_release_path(rc->extent_root, path);
3288 ret = 0;
3289 }
3290 if (ret < 0) {
3291 err = 0;
3292 break;
3293 }
3294
3295 if (!RB_EMPTY_ROOT(&blocks)) {
3296 ret = relocate_tree_blocks(trans, rc, &blocks);
3297 if (ret < 0) {
3298 err = ret;
3299 break;
3300 }
3301 }
3302
3303 nr = trans->blocks_used;
3304 btrfs_end_transaction_throttle(trans, rc->extent_root);
3305 trans = NULL;
3306 btrfs_btree_balance_dirty(rc->extent_root, nr);
3307
3308 if (rc->stage == MOVE_DATA_EXTENTS &&
3309 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3310 rc->found_file_extent = 1;
3311 ret = relocate_data_extent(rc->data_inode, &key);
3312 if (ret < 0) {
3313 err = ret;
3314 break;
3315 }
3316 }
3317 }
3318 btrfs_free_path(path);
3319
3320 if (trans) {
3321 nr = trans->blocks_used;
3322 btrfs_end_transaction(trans, rc->extent_root);
3323 btrfs_btree_balance_dirty(rc->extent_root, nr);
3324 }
3325
3326 rc->create_reloc_root = 0;
3327 smp_mb();
3328
3329 if (rc->extents_found > 0) {
3330 trans = btrfs_start_transaction(rc->extent_root, 1);
3331 btrfs_commit_transaction(trans, rc->extent_root);
3332 }
3333
3334 merge_reloc_roots(rc);
3335
3336 unset_reloc_control(rc);
3337
3338 /* get rid of pinned extents */
3339 trans = btrfs_start_transaction(rc->extent_root, 1);
3340 btrfs_commit_transaction(trans, rc->extent_root);
3341
3342 return err;
3343}
3344
3345static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3346 struct btrfs_root *root,
3347 u64 objectid, u64 size)
3348{
3349 struct btrfs_path *path;
3350 struct btrfs_inode_item *item;
3351 struct extent_buffer *leaf;
3352 int ret;
3353
3354 path = btrfs_alloc_path();
3355 if (!path)
3356 return -ENOMEM;
3357
3358 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
3359 if (ret)
3360 goto out;
3361
3362 leaf = path->nodes[0];
3363 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3364 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
3365 btrfs_set_inode_generation(leaf, item, 1);
3366 btrfs_set_inode_size(leaf, item, size);
3367 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3368 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
3369 btrfs_mark_buffer_dirty(leaf);
3370 btrfs_release_path(root, path);
3371out:
3372 btrfs_free_path(path);
3373 return ret;
3374}
3375
3376/*
3377 * helper to create inode for data relocation.
3378 * the inode is in data relocation tree and its link count is 0
3379 */
3380static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3381 struct btrfs_block_group_cache *group)
3382{
3383 struct inode *inode = NULL;
3384 struct btrfs_trans_handle *trans;
3385 struct btrfs_root *root;
3386 struct btrfs_key key;
3387 unsigned long nr;
3388 u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
3389 int err = 0;
3390
3391 root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
3392 if (IS_ERR(root))
3393 return ERR_CAST(root);
3394
3395 trans = btrfs_start_transaction(root, 1);
3396 BUG_ON(!trans);
3397
3398 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3399 if (err)
3400 goto out;
3401
3402 err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
3403 BUG_ON(err);
3404
3405 err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
3406 group->key.offset, 0, group->key.offset,
3407 0, 0, 0);
3408 BUG_ON(err);
3409
3410 key.objectid = objectid;
3411 key.type = BTRFS_INODE_ITEM_KEY;
3412 key.offset = 0;
3413 inode = btrfs_iget(root->fs_info->sb, &key, root);
3414 BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
3415 BTRFS_I(inode)->index_cnt = group->key.objectid;
3416
3417 err = btrfs_orphan_add(trans, inode);
3418out:
3419 nr = trans->blocks_used;
3420 btrfs_end_transaction(trans, root);
3421
3422 btrfs_btree_balance_dirty(root, nr);
3423 if (err) {
3424 if (inode)
3425 iput(inode);
3426 inode = ERR_PTR(err);
3427 }
3428 return inode;
3429}
3430
3431/*
3432 * function to relocate all extents in a block group.
3433 */
3434int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3435{
3436 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3437 struct reloc_control *rc;
3438 int ret;
3439 int err = 0;
3440
3441 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3442 if (!rc)
3443 return -ENOMEM;
3444
3445 mapping_tree_init(&rc->reloc_root_tree);
3446 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3447 INIT_LIST_HEAD(&rc->reloc_roots);
3448
3449 rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3450 BUG_ON(!rc->block_group);
3451
3452 btrfs_init_workers(&rc->workers, "relocate",
3453 fs_info->thread_pool_size);
3454
3455 rc->extent_root = extent_root;
3456 btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
3457
3458 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3459 if (IS_ERR(rc->data_inode)) {
3460 err = PTR_ERR(rc->data_inode);
3461 rc->data_inode = NULL;
3462 goto out;
3463 }
3464
3465 printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
3466 (unsigned long long)rc->block_group->key.objectid,
3467 (unsigned long long)rc->block_group->flags);
3468
3469 btrfs_start_delalloc_inodes(fs_info->tree_root);
3470 btrfs_wait_ordered_extents(fs_info->tree_root, 0);
3471
3472 while (1) {
3473 mutex_lock(&fs_info->cleaner_mutex);
3474 btrfs_clean_old_snapshots(fs_info->tree_root);
3475 mutex_unlock(&fs_info->cleaner_mutex);
3476
3477 rc->extents_found = 0;
3478 rc->extents_skipped = 0;
3479
3480 ret = relocate_block_group(rc);
3481 if (ret < 0) {
3482 err = ret;
3483 break;
3484 }
3485
3486 if (rc->extents_found == 0)
3487 break;
3488
3489 printk(KERN_INFO "btrfs: found %llu extents\n",
3490 (unsigned long long)rc->extents_found);
3491
3492 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
3493 btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
3494 invalidate_mapping_pages(rc->data_inode->i_mapping,
3495 0, -1);
3496 rc->stage = UPDATE_DATA_PTRS;
3497 } else if (rc->stage == UPDATE_DATA_PTRS &&
3498 rc->extents_skipped >= rc->extents_found) {
3499 iput(rc->data_inode);
3500 rc->data_inode = create_reloc_inode(fs_info,
3501 rc->block_group);
3502 if (IS_ERR(rc->data_inode)) {
3503 err = PTR_ERR(rc->data_inode);
3504 rc->data_inode = NULL;
3505 break;
3506 }
3507 rc->stage = MOVE_DATA_EXTENTS;
3508 rc->found_file_extent = 0;
3509 }
3510 }
3511
3512 filemap_fdatawrite_range(fs_info->btree_inode->i_mapping,
3513 rc->block_group->key.objectid,
3514 rc->block_group->key.objectid +
3515 rc->block_group->key.offset - 1);
3516
3517 WARN_ON(rc->block_group->pinned > 0);
3518 WARN_ON(rc->block_group->reserved > 0);
3519 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3520out:
3521 iput(rc->data_inode);
3522 btrfs_stop_workers(&rc->workers);
3523 btrfs_put_block_group(rc->block_group);
3524 kfree(rc);
3525 return err;
3526}
3527
3528/*
3529 * recover relocation interrupted by system crash.
3530 *
3531 * this function resumes merging reloc trees with corresponding fs trees.
3532 * this is important for keeping the sharing of tree blocks
3533 */
3534int btrfs_recover_relocation(struct btrfs_root *root)
3535{
3536 LIST_HEAD(reloc_roots);
3537 struct btrfs_key key;
3538 struct btrfs_root *fs_root;
3539 struct btrfs_root *reloc_root;
3540 struct btrfs_path *path;
3541 struct extent_buffer *leaf;
3542 struct reloc_control *rc = NULL;
3543 struct btrfs_trans_handle *trans;
3544 int ret;
3545 int err = 0;
3546
3547 path = btrfs_alloc_path();
3548 if (!path)
3549 return -ENOMEM;
3550
3551 key.objectid = BTRFS_TREE_RELOC_OBJECTID;
3552 key.type = BTRFS_ROOT_ITEM_KEY;
3553 key.offset = (u64)-1;
3554
3555 while (1) {
3556 ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
3557 path, 0, 0);
3558 if (ret < 0) {
3559 err = ret;
3560 goto out;
3561 }
3562 if (ret > 0) {
3563 if (path->slots[0] == 0)
3564 break;
3565 path->slots[0]--;
3566 }
3567 leaf = path->nodes[0];
3568 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3569 btrfs_release_path(root->fs_info->tree_root, path);
3570
3571 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
3572 key.type != BTRFS_ROOT_ITEM_KEY)
3573 break;
3574
3575 reloc_root = btrfs_read_fs_root_no_radix(root, &key);
3576 if (IS_ERR(reloc_root)) {
3577 err = PTR_ERR(reloc_root);
3578 goto out;
3579 }
3580
3581 list_add(&reloc_root->root_list, &reloc_roots);
3582
3583 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
3584 fs_root = read_fs_root(root->fs_info,
3585 reloc_root->root_key.offset);
3586 if (IS_ERR(fs_root)) {
3587 err = PTR_ERR(fs_root);
3588 goto out;
3589 }
3590 }
3591
3592 if (key.offset == 0)
3593 break;
3594
3595 key.offset--;
3596 }
3597 btrfs_release_path(root->fs_info->tree_root, path);
3598
3599 if (list_empty(&reloc_roots))
3600 goto out;
3601
3602 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3603 if (!rc) {
3604 err = -ENOMEM;
3605 goto out;
3606 }
3607
3608 mapping_tree_init(&rc->reloc_root_tree);
3609 INIT_LIST_HEAD(&rc->reloc_roots);
3610 btrfs_init_workers(&rc->workers, "relocate",
3611 root->fs_info->thread_pool_size);
3612 rc->extent_root = root->fs_info->extent_root;
3613
3614 set_reloc_control(rc);
3615
3616 while (!list_empty(&reloc_roots)) {
3617 reloc_root = list_entry(reloc_roots.next,
3618 struct btrfs_root, root_list);
3619 list_del(&reloc_root->root_list);
3620
3621 if (btrfs_root_refs(&reloc_root->root_item) == 0) {
3622 list_add_tail(&reloc_root->root_list,
3623 &rc->reloc_roots);
3624 continue;
3625 }
3626
3627 fs_root = read_fs_root(root->fs_info,
3628 reloc_root->root_key.offset);
3629 BUG_ON(IS_ERR(fs_root));
3630
3631 __add_reloc_root(reloc_root);
3632 fs_root->reloc_root = reloc_root;
3633 }
3634
3635 trans = btrfs_start_transaction(rc->extent_root, 1);
3636 btrfs_commit_transaction(trans, rc->extent_root);
3637
3638 merge_reloc_roots(rc);
3639
3640 unset_reloc_control(rc);
3641
3642 trans = btrfs_start_transaction(rc->extent_root, 1);
3643 btrfs_commit_transaction(trans, rc->extent_root);
3644out:
3645 if (rc) {
3646 btrfs_stop_workers(&rc->workers);
3647 kfree(rc);
3648 }
3649 while (!list_empty(&reloc_roots)) {
3650 reloc_root = list_entry(reloc_roots.next,
3651 struct btrfs_root, root_list);
3652 list_del(&reloc_root->root_list);
3653 free_extent_buffer(reloc_root->node);
3654 free_extent_buffer(reloc_root->commit_root);
3655 kfree(reloc_root);
3656 }
3657 btrfs_free_path(path);
3658
3659 if (err == 0) {
3660 /* cleanup orphan inode in data relocation tree */
3661 fs_root = read_fs_root(root->fs_info,
3662 BTRFS_DATA_RELOC_TREE_OBJECTID);
3663 if (IS_ERR(fs_root))
3664 err = PTR_ERR(fs_root);
3665 }
3666 return err;
3667}
3668
3669/*
3670 * helper to add ordered checksum for data relocation.
3671 *
3672 * cloning checksum properly handles the nodatasum extents.
3673 * it also saves CPU time to re-calculate the checksum.
3674 */
3675int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3676{
3677 struct btrfs_ordered_sum *sums;
3678 struct btrfs_sector_sum *sector_sum;
3679 struct btrfs_ordered_extent *ordered;
3680 struct btrfs_root *root = BTRFS_I(inode)->root;
3681 size_t offset;
3682 int ret;
3683 u64 disk_bytenr;
3684 LIST_HEAD(list);
3685
3686 ordered = btrfs_lookup_ordered_extent(inode, file_pos);
3687 BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
3688
3689 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
3690 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
3691 disk_bytenr + len - 1, &list);
3692
3693 while (!list_empty(&list)) {
3694 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
3695 list_del_init(&sums->list);
3696
3697 sector_sum = sums->sums;
3698 sums->bytenr = ordered->start;
3699
3700 offset = 0;
3701 while (offset < sums->len) {
3702 sector_sum->bytenr += ordered->start - disk_bytenr;
3703 sector_sum++;
3704 offset += root->sectorsize;
3705 }
3706
3707 btrfs_add_ordered_sum(inode, ordered, sums);
3708 }
3709 btrfs_put_ordered_extent(ordered);
3710 return 0;
3711}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index b48650de4472..0ddc6d61c55a 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -111,6 +111,15 @@ out:
111 return ret; 111 return ret;
112} 112}
113 113
114int btrfs_set_root_node(struct btrfs_root_item *item,
115 struct extent_buffer *node)
116{
117 btrfs_set_root_bytenr(item, node->start);
118 btrfs_set_root_level(item, btrfs_header_level(node));
119 btrfs_set_root_generation(item, btrfs_header_generation(node));
120 return 0;
121}
122
114/* 123/*
115 * copy the data in 'item' into the btree 124 * copy the data in 'item' into the btree
116 */ 125 */
@@ -164,8 +173,7 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
164 * offset lower than the latest root. They need to be queued for deletion to 173 * offset lower than the latest root. They need to be queued for deletion to
165 * finish what was happening when we crashed. 174 * finish what was happening when we crashed.
166 */ 175 */
167int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, 176int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
168 struct btrfs_root *latest)
169{ 177{
170 struct btrfs_root *dead_root; 178 struct btrfs_root *dead_root;
171 struct btrfs_item *item; 179 struct btrfs_item *item;
@@ -227,10 +235,7 @@ again:
227 goto err; 235 goto err;
228 } 236 }
229 237
230 if (objectid == BTRFS_TREE_RELOC_OBJECTID) 238 ret = btrfs_add_dead_root(dead_root);
231 ret = btrfs_add_dead_reloc_root(dead_root);
232 else
233 ret = btrfs_add_dead_root(dead_root, latest);
234 if (ret) 239 if (ret)
235 goto err; 240 goto err;
236 goto again; 241 goto again;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2ff7cd2db25f..9f179d4832d5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -52,7 +52,6 @@
52#include "export.h" 52#include "export.h"
53#include "compression.h" 53#include "compression.h"
54 54
55
56static struct super_operations btrfs_super_ops; 55static struct super_operations btrfs_super_ops;
57 56
58static void btrfs_put_super(struct super_block *sb) 57static void btrfs_put_super(struct super_block *sb)
@@ -67,8 +66,8 @@ static void btrfs_put_super(struct super_block *sb)
67enum { 66enum {
68 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 67 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
69 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 68 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
70 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog, 69 Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl,
71 Opt_ratio, Opt_flushoncommit, Opt_err, 70 Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err,
72}; 71};
73 72
74static match_table_t tokens = { 73static match_table_t tokens = {
@@ -84,6 +83,8 @@ static match_table_t tokens = {
84 {Opt_thread_pool, "thread_pool=%d"}, 83 {Opt_thread_pool, "thread_pool=%d"},
85 {Opt_compress, "compress"}, 84 {Opt_compress, "compress"},
86 {Opt_ssd, "ssd"}, 85 {Opt_ssd, "ssd"},
86 {Opt_ssd_spread, "ssd_spread"},
87 {Opt_nossd, "nossd"},
87 {Opt_noacl, "noacl"}, 88 {Opt_noacl, "noacl"},
88 {Opt_notreelog, "notreelog"}, 89 {Opt_notreelog, "notreelog"},
89 {Opt_flushoncommit, "flushoncommit"}, 90 {Opt_flushoncommit, "flushoncommit"},
@@ -158,7 +159,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
158 */ 159 */
159 break; 160 break;
160 case Opt_nodatasum: 161 case Opt_nodatasum:
161 printk(KERN_INFO "btrfs: setting nodatacsum\n"); 162 printk(KERN_INFO "btrfs: setting nodatasum\n");
162 btrfs_set_opt(info->mount_opt, NODATASUM); 163 btrfs_set_opt(info->mount_opt, NODATASUM);
163 break; 164 break;
164 case Opt_nodatacow: 165 case Opt_nodatacow:
@@ -174,6 +175,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
174 printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); 175 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
175 btrfs_set_opt(info->mount_opt, SSD); 176 btrfs_set_opt(info->mount_opt, SSD);
176 break; 177 break;
178 case Opt_ssd_spread:
179 printk(KERN_INFO "btrfs: use spread ssd "
180 "allocation scheme\n");
181 btrfs_set_opt(info->mount_opt, SSD);
182 btrfs_set_opt(info->mount_opt, SSD_SPREAD);
183 break;
184 case Opt_nossd:
185 printk(KERN_INFO "btrfs: not using ssd allocation "
186 "scheme\n");
187 btrfs_set_opt(info->mount_opt, NOSSD);
188 btrfs_clear_opt(info->mount_opt, SSD);
189 btrfs_clear_opt(info->mount_opt, SSD_SPREAD);
190 break;
177 case Opt_nobarrier: 191 case Opt_nobarrier:
178 printk(KERN_INFO "btrfs: turning off barriers\n"); 192 printk(KERN_INFO "btrfs: turning off barriers\n");
179 btrfs_set_opt(info->mount_opt, NOBARRIER); 193 btrfs_set_opt(info->mount_opt, NOBARRIER);
@@ -322,7 +336,7 @@ static int btrfs_fill_super(struct super_block *sb,
322 struct dentry *root_dentry; 336 struct dentry *root_dentry;
323 struct btrfs_super_block *disk_super; 337 struct btrfs_super_block *disk_super;
324 struct btrfs_root *tree_root; 338 struct btrfs_root *tree_root;
325 struct btrfs_inode *bi; 339 struct btrfs_key key;
326 int err; 340 int err;
327 341
328 sb->s_maxbytes = MAX_LFS_FILESIZE; 342 sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -341,23 +355,15 @@ static int btrfs_fill_super(struct super_block *sb,
341 } 355 }
342 sb->s_fs_info = tree_root; 356 sb->s_fs_info = tree_root;
343 disk_super = &tree_root->fs_info->super_copy; 357 disk_super = &tree_root->fs_info->super_copy;
344 inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID,
345 tree_root->fs_info->fs_root);
346 bi = BTRFS_I(inode);
347 bi->location.objectid = inode->i_ino;
348 bi->location.offset = 0;
349 bi->root = tree_root->fs_info->fs_root;
350
351 btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
352 358
353 if (!inode) { 359 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
354 err = -ENOMEM; 360 key.type = BTRFS_INODE_ITEM_KEY;
361 key.offset = 0;
362 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root);
363 if (IS_ERR(inode)) {
364 err = PTR_ERR(inode);
355 goto fail_close; 365 goto fail_close;
356 } 366 }
357 if (inode->i_state & I_NEW) {
358 btrfs_read_locked_inode(inode);
359 unlock_new_inode(inode);
360 }
361 367
362 root_dentry = d_alloc_root(inode); 368 root_dentry = d_alloc_root(inode);
363 if (!root_dentry) { 369 if (!root_dentry) {
@@ -388,10 +394,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
388 struct btrfs_root *root = btrfs_sb(sb); 394 struct btrfs_root *root = btrfs_sb(sb);
389 int ret; 395 int ret;
390 396
391 if (sb->s_flags & MS_RDONLY)
392 return 0;
393
394 sb->s_dirt = 0;
395 if (!wait) { 397 if (!wait) {
396 filemap_flush(root->fs_info->btree_inode->i_mapping); 398 filemap_flush(root->fs_info->btree_inode->i_mapping);
397 return 0; 399 return 0;
@@ -402,7 +404,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
402 404
403 trans = btrfs_start_transaction(root, 1); 405 trans = btrfs_start_transaction(root, 1);
404 ret = btrfs_commit_transaction(trans, root); 406 ret = btrfs_commit_transaction(trans, root);
405 sb->s_dirt = 0;
406 return ret; 407 return ret;
407} 408}
408 409
@@ -433,7 +434,11 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
433 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); 434 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
434 if (btrfs_test_opt(root, COMPRESS)) 435 if (btrfs_test_opt(root, COMPRESS))
435 seq_puts(seq, ",compress"); 436 seq_puts(seq, ",compress");
436 if (btrfs_test_opt(root, SSD)) 437 if (btrfs_test_opt(root, NOSSD))
438 seq_puts(seq, ",nossd");
439 if (btrfs_test_opt(root, SSD_SPREAD))
440 seq_puts(seq, ",ssd_spread");
441 else if (btrfs_test_opt(root, SSD))
437 seq_puts(seq, ",ssd"); 442 seq_puts(seq, ",ssd");
438 if (btrfs_test_opt(root, NOTREELOG)) 443 if (btrfs_test_opt(root, NOTREELOG))
439 seq_puts(seq, ",notreelog"); 444 seq_puts(seq, ",notreelog");
@@ -444,11 +449,6 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
444 return 0; 449 return 0;
445} 450}
446 451
447static void btrfs_write_super(struct super_block *sb)
448{
449 sb->s_dirt = 0;
450}
451
452static int btrfs_test_super(struct super_block *s, void *data) 452static int btrfs_test_super(struct super_block *s, void *data)
453{ 453{
454 struct btrfs_fs_devices *test_fs_devices = data; 454 struct btrfs_fs_devices *test_fs_devices = data;
@@ -584,7 +584,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
584 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 584 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
585 return -EINVAL; 585 return -EINVAL;
586 586
587 ret = btrfs_cleanup_reloc_trees(root); 587 /* recover relocation */
588 ret = btrfs_recover_relocation(root);
588 WARN_ON(ret); 589 WARN_ON(ret);
589 590
590 ret = btrfs_cleanup_fs_roots(root->fs_info); 591 ret = btrfs_cleanup_fs_roots(root->fs_info);
@@ -678,7 +679,6 @@ static int btrfs_unfreeze(struct super_block *sb)
678static struct super_operations btrfs_super_ops = { 679static struct super_operations btrfs_super_ops = {
679 .delete_inode = btrfs_delete_inode, 680 .delete_inode = btrfs_delete_inode,
680 .put_super = btrfs_put_super, 681 .put_super = btrfs_put_super,
681 .write_super = btrfs_write_super,
682 .sync_fs = btrfs_sync_fs, 682 .sync_fs = btrfs_sync_fs,
683 .show_options = btrfs_show_options, 683 .show_options = btrfs_show_options,
684 .write_inode = btrfs_write_inode, 684 .write_inode = btrfs_write_inode,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 01b143605ec1..2e177d7f4bb9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -25,7 +25,6 @@
25#include "disk-io.h" 25#include "disk-io.h"
26#include "transaction.h" 26#include "transaction.h"
27#include "locking.h" 27#include "locking.h"
28#include "ref-cache.h"
29#include "tree-log.h" 28#include "tree-log.h"
30 29
31#define BTRFS_ROOT_TRANS_TAG 0 30#define BTRFS_ROOT_TRANS_TAG 0
@@ -94,45 +93,37 @@ static noinline int join_transaction(struct btrfs_root *root)
94 * to make sure the old root from before we joined the transaction is deleted 93 * to make sure the old root from before we joined the transaction is deleted
95 * when the transaction commits 94 * when the transaction commits
96 */ 95 */
97noinline int btrfs_record_root_in_trans(struct btrfs_root *root) 96static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
97 struct btrfs_root *root)
98{ 98{
99 struct btrfs_dirty_root *dirty; 99 if (root->ref_cows && root->last_trans < trans->transid) {
100 u64 running_trans_id = root->fs_info->running_transaction->transid;
101 if (root->ref_cows && root->last_trans < running_trans_id) {
102 WARN_ON(root == root->fs_info->extent_root); 100 WARN_ON(root == root->fs_info->extent_root);
103 if (root->root_item.refs != 0) { 101 WARN_ON(root->root_item.refs == 0);
104 radix_tree_tag_set(&root->fs_info->fs_roots_radix, 102 WARN_ON(root->commit_root != root->node);
105 (unsigned long)root->root_key.objectid, 103
106 BTRFS_ROOT_TRANS_TAG); 104 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
107 105 (unsigned long)root->root_key.objectid,
108 dirty = kmalloc(sizeof(*dirty), GFP_NOFS); 106 BTRFS_ROOT_TRANS_TAG);
109 BUG_ON(!dirty); 107 root->last_trans = trans->transid;
110 dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS); 108 btrfs_init_reloc_root(trans, root);
111 BUG_ON(!dirty->root); 109 }
112 dirty->latest_root = root; 110 return 0;
113 INIT_LIST_HEAD(&dirty->list); 111}
114
115 root->commit_root = btrfs_root_node(root);
116
117 memcpy(dirty->root, root, sizeof(*root));
118 spin_lock_init(&dirty->root->node_lock);
119 spin_lock_init(&dirty->root->list_lock);
120 mutex_init(&dirty->root->objectid_mutex);
121 mutex_init(&dirty->root->log_mutex);
122 INIT_LIST_HEAD(&dirty->root->dead_list);
123 dirty->root->node = root->commit_root;
124 dirty->root->commit_root = NULL;
125 112
126 spin_lock(&root->list_lock); 113int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
127 list_add(&dirty->root->dead_list, &root->dead_list); 114 struct btrfs_root *root)
128 spin_unlock(&root->list_lock); 115{
116 if (!root->ref_cows)
117 return 0;
129 118
130 root->dirty_root = dirty; 119 mutex_lock(&root->fs_info->trans_mutex);
131 } else { 120 if (root->last_trans == trans->transid) {
132 WARN_ON(1); 121 mutex_unlock(&root->fs_info->trans_mutex);
133 } 122 return 0;
134 root->last_trans = running_trans_id;
135 } 123 }
124
125 record_root_in_trans(trans, root);
126 mutex_unlock(&root->fs_info->trans_mutex);
136 return 0; 127 return 0;
137} 128}
138 129
@@ -181,7 +172,6 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
181 ret = join_transaction(root); 172 ret = join_transaction(root);
182 BUG_ON(ret); 173 BUG_ON(ret);
183 174
184 btrfs_record_root_in_trans(root);
185 h->transid = root->fs_info->running_transaction->transid; 175 h->transid = root->fs_info->running_transaction->transid;
186 h->transaction = root->fs_info->running_transaction; 176 h->transaction = root->fs_info->running_transaction;
187 h->blocks_reserved = num_blocks; 177 h->blocks_reserved = num_blocks;
@@ -192,6 +182,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
192 h->delayed_ref_updates = 0; 182 h->delayed_ref_updates = 0;
193 183
194 root->fs_info->running_transaction->use_count++; 184 root->fs_info->running_transaction->use_count++;
185 record_root_in_trans(h, root);
195 mutex_unlock(&root->fs_info->trans_mutex); 186 mutex_unlock(&root->fs_info->trans_mutex);
196 return h; 187 return h;
197} 188}
@@ -233,6 +224,7 @@ static noinline int wait_for_commit(struct btrfs_root *root,
233 return 0; 224 return 0;
234} 225}
235 226
227#if 0
236/* 228/*
237 * rate limit against the drop_snapshot code. This helps to slow down new 229 * rate limit against the drop_snapshot code. This helps to slow down new
238 * operations if the drop_snapshot code isn't able to keep up. 230 * operations if the drop_snapshot code isn't able to keep up.
@@ -273,6 +265,7 @@ harder:
273 goto harder; 265 goto harder;
274 } 266 }
275} 267}
268#endif
276 269
277void btrfs_throttle(struct btrfs_root *root) 270void btrfs_throttle(struct btrfs_root *root)
278{ 271{
@@ -280,7 +273,6 @@ void btrfs_throttle(struct btrfs_root *root)
280 if (!root->fs_info->open_ioctl_trans) 273 if (!root->fs_info->open_ioctl_trans)
281 wait_current_trans(root); 274 wait_current_trans(root);
282 mutex_unlock(&root->fs_info->trans_mutex); 275 mutex_unlock(&root->fs_info->trans_mutex);
283 throttle_on_drops(root);
284} 276}
285 277
286static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 278static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
@@ -323,9 +315,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
323 memset(trans, 0, sizeof(*trans)); 315 memset(trans, 0, sizeof(*trans));
324 kmem_cache_free(btrfs_trans_handle_cachep, trans); 316 kmem_cache_free(btrfs_trans_handle_cachep, trans);
325 317
326 if (throttle)
327 throttle_on_drops(root);
328
329 return 0; 318 return 0;
330} 319}
331 320
@@ -462,12 +451,8 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
462 old_root_bytenr = btrfs_root_bytenr(&root->root_item); 451 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
463 if (old_root_bytenr == root->node->start) 452 if (old_root_bytenr == root->node->start)
464 break; 453 break;
465 btrfs_set_root_bytenr(&root->root_item,
466 root->node->start);
467 btrfs_set_root_level(&root->root_item,
468 btrfs_header_level(root->node));
469 btrfs_set_root_generation(&root->root_item, trans->transid);
470 454
455 btrfs_set_root_node(&root->root_item, root->node);
471 ret = btrfs_update_root(trans, tree_root, 456 ret = btrfs_update_root(trans, tree_root,
472 &root->root_key, 457 &root->root_key,
473 &root->root_item); 458 &root->root_item);
@@ -477,14 +462,16 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
477 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 462 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
478 BUG_ON(ret); 463 BUG_ON(ret);
479 } 464 }
465 free_extent_buffer(root->commit_root);
466 root->commit_root = btrfs_root_node(root);
480 return 0; 467 return 0;
481} 468}
482 469
483/* 470/*
484 * update all the cowonly tree roots on disk 471 * update all the cowonly tree roots on disk
485 */ 472 */
486int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 473static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
487 struct btrfs_root *root) 474 struct btrfs_root *root)
488{ 475{
489 struct btrfs_fs_info *fs_info = root->fs_info; 476 struct btrfs_fs_info *fs_info = root->fs_info;
490 struct list_head *next; 477 struct list_head *next;
@@ -520,118 +507,54 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
520 * a dirty root struct and adds it into the list of dead roots that need to 507 * a dirty root struct and adds it into the list of dead roots that need to
521 * be deleted 508 * be deleted
522 */ 509 */
523int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest) 510int btrfs_add_dead_root(struct btrfs_root *root)
524{ 511{
525 struct btrfs_dirty_root *dirty;
526
527 dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
528 if (!dirty)
529 return -ENOMEM;
530 dirty->root = root;
531 dirty->latest_root = latest;
532
533 mutex_lock(&root->fs_info->trans_mutex); 512 mutex_lock(&root->fs_info->trans_mutex);
534 list_add(&dirty->list, &latest->fs_info->dead_roots); 513 list_add(&root->root_list, &root->fs_info->dead_roots);
535 mutex_unlock(&root->fs_info->trans_mutex); 514 mutex_unlock(&root->fs_info->trans_mutex);
536 return 0; 515 return 0;
537} 516}
538 517
539/* 518/*
540 * at transaction commit time we need to schedule the old roots for 519 * update all the cowonly tree roots on disk
541 * deletion via btrfs_drop_snapshot. This runs through all the
542 * reference counted roots that were modified in the current
543 * transaction and puts them into the drop list
544 */ 520 */
545static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, 521static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
546 struct radix_tree_root *radix, 522 struct btrfs_root *root)
547 struct list_head *list)
548{ 523{
549 struct btrfs_dirty_root *dirty;
550 struct btrfs_root *gang[8]; 524 struct btrfs_root *gang[8];
551 struct btrfs_root *root; 525 struct btrfs_fs_info *fs_info = root->fs_info;
552 int i; 526 int i;
553 int ret; 527 int ret;
554 int err = 0; 528 int err = 0;
555 u32 refs;
556 529
557 while (1) { 530 while (1) {
558 ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0, 531 ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
532 (void **)gang, 0,
559 ARRAY_SIZE(gang), 533 ARRAY_SIZE(gang),
560 BTRFS_ROOT_TRANS_TAG); 534 BTRFS_ROOT_TRANS_TAG);
561 if (ret == 0) 535 if (ret == 0)
562 break; 536 break;
563 for (i = 0; i < ret; i++) { 537 for (i = 0; i < ret; i++) {
564 root = gang[i]; 538 root = gang[i];
565 radix_tree_tag_clear(radix, 539 radix_tree_tag_clear(&fs_info->fs_roots_radix,
566 (unsigned long)root->root_key.objectid, 540 (unsigned long)root->root_key.objectid,
567 BTRFS_ROOT_TRANS_TAG); 541 BTRFS_ROOT_TRANS_TAG);
568
569 BUG_ON(!root->ref_tree);
570 dirty = root->dirty_root;
571 542
572 btrfs_free_log(trans, root); 543 btrfs_free_log(trans, root);
573 btrfs_free_reloc_root(trans, root); 544 btrfs_update_reloc_root(trans, root);
574
575 if (root->commit_root == root->node) {
576 WARN_ON(root->node->start !=
577 btrfs_root_bytenr(&root->root_item));
578
579 free_extent_buffer(root->commit_root);
580 root->commit_root = NULL;
581 root->dirty_root = NULL;
582
583 spin_lock(&root->list_lock);
584 list_del_init(&dirty->root->dead_list);
585 spin_unlock(&root->list_lock);
586 545
587 kfree(dirty->root); 546 if (root->commit_root == root->node)
588 kfree(dirty);
589
590 /* make sure to update the root on disk
591 * so we get any updates to the block used
592 * counts
593 */
594 err = btrfs_update_root(trans,
595 root->fs_info->tree_root,
596 &root->root_key,
597 &root->root_item);
598 continue; 547 continue;
599 }
600 548
601 memset(&root->root_item.drop_progress, 0, 549 free_extent_buffer(root->commit_root);
602 sizeof(struct btrfs_disk_key)); 550 root->commit_root = btrfs_root_node(root);
603 root->root_item.drop_level = 0; 551
604 root->commit_root = NULL; 552 btrfs_set_root_node(&root->root_item, root->node);
605 root->dirty_root = NULL; 553 err = btrfs_update_root(trans, fs_info->tree_root,
606 root->root_key.offset = root->fs_info->generation;
607 btrfs_set_root_bytenr(&root->root_item,
608 root->node->start);
609 btrfs_set_root_level(&root->root_item,
610 btrfs_header_level(root->node));
611 btrfs_set_root_generation(&root->root_item,
612 root->root_key.offset);
613
614 err = btrfs_insert_root(trans, root->fs_info->tree_root,
615 &root->root_key, 554 &root->root_key,
616 &root->root_item); 555 &root->root_item);
617 if (err) 556 if (err)
618 break; 557 break;
619
620 refs = btrfs_root_refs(&dirty->root->root_item);
621 btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
622 err = btrfs_update_root(trans, root->fs_info->tree_root,
623 &dirty->root->root_key,
624 &dirty->root->root_item);
625
626 BUG_ON(err);
627 if (refs == 1) {
628 list_add(&dirty->list, list);
629 } else {
630 WARN_ON(1);
631 free_extent_buffer(dirty->root->node);
632 kfree(dirty->root);
633 kfree(dirty);
634 }
635 } 558 }
636 } 559 }
637 return err; 560 return err;
@@ -688,12 +611,8 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
688 TASK_UNINTERRUPTIBLE); 611 TASK_UNINTERRUPTIBLE);
689 mutex_unlock(&info->trans_mutex); 612 mutex_unlock(&info->trans_mutex);
690 613
691 atomic_dec(&info->throttles);
692 wake_up(&info->transaction_throttle);
693
694 schedule(); 614 schedule();
695 615
696 atomic_inc(&info->throttles);
697 mutex_lock(&info->trans_mutex); 616 mutex_lock(&info->trans_mutex);
698 finish_wait(&info->transaction_wait, &wait); 617 finish_wait(&info->transaction_wait, &wait);
699 } 618 }
@@ -705,111 +624,61 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
705 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on 624 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
706 * all of them 625 * all of them
707 */ 626 */
708static noinline int drop_dirty_roots(struct btrfs_root *tree_root, 627int btrfs_drop_dead_root(struct btrfs_root *root)
709 struct list_head *list)
710{ 628{
711 struct btrfs_dirty_root *dirty;
712 struct btrfs_trans_handle *trans; 629 struct btrfs_trans_handle *trans;
630 struct btrfs_root *tree_root = root->fs_info->tree_root;
713 unsigned long nr; 631 unsigned long nr;
714 u64 num_bytes; 632 int ret;
715 u64 bytes_used;
716 u64 max_useless;
717 int ret = 0;
718 int err;
719
720 while (!list_empty(list)) {
721 struct btrfs_root *root;
722
723 dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
724 list_del_init(&dirty->list);
725
726 num_bytes = btrfs_root_used(&dirty->root->root_item);
727 root = dirty->latest_root;
728 atomic_inc(&root->fs_info->throttles);
729
730 while (1) {
731 /*
732 * we don't want to jump in and create a bunch of
733 * delayed refs if the transaction is starting to close
734 */
735 wait_transaction_pre_flush(tree_root->fs_info);
736 trans = btrfs_start_transaction(tree_root, 1);
737
738 /*
739 * we've joined a transaction, make sure it isn't
740 * closing right now
741 */
742 if (trans->transaction->delayed_refs.flushing) {
743 btrfs_end_transaction(trans, tree_root);
744 continue;
745 }
746
747 mutex_lock(&root->fs_info->drop_mutex);
748 ret = btrfs_drop_snapshot(trans, dirty->root);
749 if (ret != -EAGAIN)
750 break;
751 mutex_unlock(&root->fs_info->drop_mutex);
752 633
753 err = btrfs_update_root(trans, 634 while (1) {
754 tree_root, 635 /*
755 &dirty->root->root_key, 636 * we don't want to jump in and create a bunch of
756 &dirty->root->root_item); 637 * delayed refs if the transaction is starting to close
757 if (err) 638 */
758 ret = err; 639 wait_transaction_pre_flush(tree_root->fs_info);
759 nr = trans->blocks_used; 640 trans = btrfs_start_transaction(tree_root, 1);
760 ret = btrfs_end_transaction(trans, tree_root);
761 BUG_ON(ret);
762 641
763 btrfs_btree_balance_dirty(tree_root, nr); 642 /*
764 cond_resched(); 643 * we've joined a transaction, make sure it isn't
644 * closing right now
645 */
646 if (trans->transaction->delayed_refs.flushing) {
647 btrfs_end_transaction(trans, tree_root);
648 continue;
765 } 649 }
766 BUG_ON(ret);
767 atomic_dec(&root->fs_info->throttles);
768 wake_up(&root->fs_info->transaction_throttle);
769 650
770 num_bytes -= btrfs_root_used(&dirty->root->root_item); 651 ret = btrfs_drop_snapshot(trans, root);
771 bytes_used = btrfs_root_used(&root->root_item); 652 if (ret != -EAGAIN)
772 if (num_bytes) { 653 break;
773 mutex_lock(&root->fs_info->trans_mutex);
774 btrfs_record_root_in_trans(root);
775 mutex_unlock(&root->fs_info->trans_mutex);
776 btrfs_set_root_used(&root->root_item,
777 bytes_used - num_bytes);
778 }
779 654
780 ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key); 655 ret = btrfs_update_root(trans, tree_root,
781 if (ret) { 656 &root->root_key,
782 BUG(); 657 &root->root_item);
658 if (ret)
783 break; 659 break;
784 }
785 mutex_unlock(&root->fs_info->drop_mutex);
786
787 spin_lock(&root->list_lock);
788 list_del_init(&dirty->root->dead_list);
789 if (!list_empty(&root->dead_list)) {
790 struct btrfs_root *oldest;
791 oldest = list_entry(root->dead_list.prev,
792 struct btrfs_root, dead_list);
793 max_useless = oldest->root_key.offset - 1;
794 } else {
795 max_useless = root->root_key.offset - 1;
796 }
797 spin_unlock(&root->list_lock);
798 660
799 nr = trans->blocks_used; 661 nr = trans->blocks_used;
800 ret = btrfs_end_transaction(trans, tree_root); 662 ret = btrfs_end_transaction(trans, tree_root);
801 BUG_ON(ret); 663 BUG_ON(ret);
802 664
803 ret = btrfs_remove_leaf_refs(root, max_useless, 0);
804 BUG_ON(ret);
805
806 free_extent_buffer(dirty->root->node);
807 kfree(dirty->root);
808 kfree(dirty);
809
810 btrfs_btree_balance_dirty(tree_root, nr); 665 btrfs_btree_balance_dirty(tree_root, nr);
811 cond_resched(); 666 cond_resched();
812 } 667 }
668 BUG_ON(ret);
669
670 ret = btrfs_del_root(trans, tree_root, &root->root_key);
671 BUG_ON(ret);
672
673 nr = trans->blocks_used;
674 ret = btrfs_end_transaction(trans, tree_root);
675 BUG_ON(ret);
676
677 free_extent_buffer(root->node);
678 free_extent_buffer(root->commit_root);
679 kfree(root);
680
681 btrfs_btree_balance_dirty(tree_root, nr);
813 return ret; 682 return ret;
814} 683}
815 684
@@ -839,24 +708,23 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
839 if (ret) 708 if (ret)
840 goto fail; 709 goto fail;
841 710
842 btrfs_record_root_in_trans(root); 711 record_root_in_trans(trans, root);
843 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 712 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
844 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 713 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
845 714
846 key.objectid = objectid; 715 key.objectid = objectid;
847 key.offset = trans->transid; 716 key.offset = 0;
848 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 717 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
849 718
850 old = btrfs_lock_root_node(root); 719 old = btrfs_lock_root_node(root);
851 btrfs_cow_block(trans, root, old, NULL, 0, &old); 720 btrfs_cow_block(trans, root, old, NULL, 0, &old);
721 btrfs_set_lock_blocking(old);
852 722
853 btrfs_copy_root(trans, root, old, &tmp, objectid); 723 btrfs_copy_root(trans, root, old, &tmp, objectid);
854 btrfs_tree_unlock(old); 724 btrfs_tree_unlock(old);
855 free_extent_buffer(old); 725 free_extent_buffer(old);
856 726
857 btrfs_set_root_bytenr(new_root_item, tmp->start); 727 btrfs_set_root_node(new_root_item, tmp);
858 btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
859 btrfs_set_root_generation(new_root_item, trans->transid);
860 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 728 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
861 new_root_item); 729 new_root_item);
862 btrfs_tree_unlock(tmp); 730 btrfs_tree_unlock(tmp);
@@ -964,6 +832,24 @@ static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
964 return 0; 832 return 0;
965} 833}
966 834
835static void update_super_roots(struct btrfs_root *root)
836{
837 struct btrfs_root_item *root_item;
838 struct btrfs_super_block *super;
839
840 super = &root->fs_info->super_copy;
841
842 root_item = &root->fs_info->chunk_root->root_item;
843 super->chunk_root = root_item->bytenr;
844 super->chunk_root_generation = root_item->generation;
845 super->chunk_root_level = root_item->level;
846
847 root_item = &root->fs_info->tree_root->root_item;
848 super->root = root_item->bytenr;
849 super->generation = root_item->generation;
850 super->root_level = root_item->level;
851}
852
967int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 853int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
968 struct btrfs_root *root) 854 struct btrfs_root *root)
969{ 855{
@@ -971,8 +857,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
971 unsigned long timeout = 1; 857 unsigned long timeout = 1;
972 struct btrfs_transaction *cur_trans; 858 struct btrfs_transaction *cur_trans;
973 struct btrfs_transaction *prev_trans = NULL; 859 struct btrfs_transaction *prev_trans = NULL;
974 struct btrfs_root *chunk_root = root->fs_info->chunk_root;
975 struct list_head dirty_fs_roots;
976 struct extent_io_tree *pinned_copy; 860 struct extent_io_tree *pinned_copy;
977 DEFINE_WAIT(wait); 861 DEFINE_WAIT(wait);
978 int ret; 862 int ret;
@@ -999,7 +883,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
999 BUG_ON(ret); 883 BUG_ON(ret);
1000 884
1001 mutex_lock(&root->fs_info->trans_mutex); 885 mutex_lock(&root->fs_info->trans_mutex);
1002 INIT_LIST_HEAD(&dirty_fs_roots);
1003 if (cur_trans->in_commit) { 886 if (cur_trans->in_commit) {
1004 cur_trans->use_count++; 887 cur_trans->use_count++;
1005 mutex_unlock(&root->fs_info->trans_mutex); 888 mutex_unlock(&root->fs_info->trans_mutex);
@@ -1105,41 +988,36 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1105 * with the tree-log code. 988 * with the tree-log code.
1106 */ 989 */
1107 mutex_lock(&root->fs_info->tree_log_mutex); 990 mutex_lock(&root->fs_info->tree_log_mutex);
1108 /*
1109 * keep tree reloc code from adding new reloc trees
1110 */
1111 mutex_lock(&root->fs_info->tree_reloc_mutex);
1112
1113 991
1114 ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix, 992 ret = commit_fs_roots(trans, root);
1115 &dirty_fs_roots);
1116 BUG_ON(ret); 993 BUG_ON(ret);
1117 994
1118 /* add_dirty_roots gets rid of all the tree log roots, it is now 995 /* commit_fs_roots gets rid of all the tree log roots, it is now
1119 * safe to free the root of tree log roots 996 * safe to free the root of tree log roots
1120 */ 997 */
1121 btrfs_free_log_root_tree(trans, root->fs_info); 998 btrfs_free_log_root_tree(trans, root->fs_info);
1122 999
1123 ret = btrfs_commit_tree_roots(trans, root); 1000 ret = commit_cowonly_roots(trans, root);
1124 BUG_ON(ret); 1001 BUG_ON(ret);
1125 1002
1126 cur_trans = root->fs_info->running_transaction; 1003 cur_trans = root->fs_info->running_transaction;
1127 spin_lock(&root->fs_info->new_trans_lock); 1004 spin_lock(&root->fs_info->new_trans_lock);
1128 root->fs_info->running_transaction = NULL; 1005 root->fs_info->running_transaction = NULL;
1129 spin_unlock(&root->fs_info->new_trans_lock); 1006 spin_unlock(&root->fs_info->new_trans_lock);
1130 btrfs_set_super_generation(&root->fs_info->super_copy, 1007
1131 cur_trans->transid); 1008 btrfs_set_root_node(&root->fs_info->tree_root->root_item,
1132 btrfs_set_super_root(&root->fs_info->super_copy, 1009 root->fs_info->tree_root->node);
1133 root->fs_info->tree_root->node->start); 1010 free_extent_buffer(root->fs_info->tree_root->commit_root);
1134 btrfs_set_super_root_level(&root->fs_info->super_copy, 1011 root->fs_info->tree_root->commit_root =
1135 btrfs_header_level(root->fs_info->tree_root->node)); 1012 btrfs_root_node(root->fs_info->tree_root);
1136 1013
1137 btrfs_set_super_chunk_root(&root->fs_info->super_copy, 1014 btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
1138 chunk_root->node->start); 1015 root->fs_info->chunk_root->node);
1139 btrfs_set_super_chunk_root_level(&root->fs_info->super_copy, 1016 free_extent_buffer(root->fs_info->chunk_root->commit_root);
1140 btrfs_header_level(chunk_root->node)); 1017 root->fs_info->chunk_root->commit_root =
1141 btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy, 1018 btrfs_root_node(root->fs_info->chunk_root);
1142 btrfs_header_generation(chunk_root->node)); 1019
1020 update_super_roots(root);
1143 1021
1144 if (!root->fs_info->log_root_recovering) { 1022 if (!root->fs_info->log_root_recovering) {
1145 btrfs_set_super_log_root(&root->fs_info->super_copy, 0); 1023 btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
@@ -1153,7 +1031,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1153 1031
1154 trans->transaction->blocked = 0; 1032 trans->transaction->blocked = 0;
1155 1033
1156 wake_up(&root->fs_info->transaction_throttle);
1157 wake_up(&root->fs_info->transaction_wait); 1034 wake_up(&root->fs_info->transaction_wait);
1158 1035
1159 mutex_unlock(&root->fs_info->trans_mutex); 1036 mutex_unlock(&root->fs_info->trans_mutex);
@@ -1170,9 +1047,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1170 btrfs_finish_extent_commit(trans, root, pinned_copy); 1047 btrfs_finish_extent_commit(trans, root, pinned_copy);
1171 kfree(pinned_copy); 1048 kfree(pinned_copy);
1172 1049
1173 btrfs_drop_dead_reloc_roots(root);
1174 mutex_unlock(&root->fs_info->tree_reloc_mutex);
1175
1176 /* do the directory inserts of any pending snapshot creations */ 1050 /* do the directory inserts of any pending snapshot creations */
1177 finish_pending_snapshots(trans, root->fs_info); 1051 finish_pending_snapshots(trans, root->fs_info);
1178 1052
@@ -1186,16 +1060,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1186 put_transaction(cur_trans); 1060 put_transaction(cur_trans);
1187 put_transaction(cur_trans); 1061 put_transaction(cur_trans);
1188 1062
1189 list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
1190 if (root->fs_info->closing)
1191 list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
1192
1193 mutex_unlock(&root->fs_info->trans_mutex); 1063 mutex_unlock(&root->fs_info->trans_mutex);
1194 1064
1195 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1065 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1196
1197 if (root->fs_info->closing)
1198 drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
1199 return ret; 1066 return ret;
1200} 1067}
1201 1068
@@ -1204,16 +1071,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1204 */ 1071 */
1205int btrfs_clean_old_snapshots(struct btrfs_root *root) 1072int btrfs_clean_old_snapshots(struct btrfs_root *root)
1206{ 1073{
1207 struct list_head dirty_roots; 1074 LIST_HEAD(list);
1208 INIT_LIST_HEAD(&dirty_roots); 1075 struct btrfs_fs_info *fs_info = root->fs_info;
1209again: 1076
1210 mutex_lock(&root->fs_info->trans_mutex); 1077 mutex_lock(&fs_info->trans_mutex);
1211 list_splice_init(&root->fs_info->dead_roots, &dirty_roots); 1078 list_splice_init(&fs_info->dead_roots, &list);
1212 mutex_unlock(&root->fs_info->trans_mutex); 1079 mutex_unlock(&fs_info->trans_mutex);
1213 1080
1214 if (!list_empty(&dirty_roots)) { 1081 while (!list_empty(&list)) {
1215 drop_dirty_roots(root, &dirty_roots); 1082 root = list_entry(list.next, struct btrfs_root, root_list);
1216 goto again; 1083 list_del_init(&root->root_list);
1084 btrfs_drop_dead_root(root);
1217 } 1085 }
1218 return 0; 1086 return 0;
1219} 1087}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 94f5bde2b58d..961c3ee5a2e1 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -62,12 +62,6 @@ struct btrfs_pending_snapshot {
62 struct list_head list; 62 struct list_head list;
63}; 63};
64 64
65struct btrfs_dirty_root {
66 struct list_head list;
67 struct btrfs_root *root;
68 struct btrfs_root *latest_root;
69};
70
71static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, 65static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
72 struct inode *inode) 66 struct inode *inode)
73{ 67{
@@ -100,7 +94,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
100int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 94int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
101 struct btrfs_root *root); 95 struct btrfs_root *root);
102 96
103int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest); 97int btrfs_add_dead_root(struct btrfs_root *root);
98int btrfs_drop_dead_root(struct btrfs_root *root);
104int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); 99int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
105int btrfs_clean_old_snapshots(struct btrfs_root *root); 100int btrfs_clean_old_snapshots(struct btrfs_root *root);
106int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 101int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
@@ -108,7 +103,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
108int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 103int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
109 struct btrfs_root *root); 104 struct btrfs_root *root);
110void btrfs_throttle(struct btrfs_root *root); 105void btrfs_throttle(struct btrfs_root *root);
111int btrfs_record_root_in_trans(struct btrfs_root *root); 106int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
107 struct btrfs_root *root);
112int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, 108int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
113 struct extent_io_tree *dirty_pages); 109 struct extent_io_tree *dirty_pages);
114#endif 110#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index db5e212e8445..c13922206d1b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -430,18 +430,16 @@ no_copy:
430static noinline struct inode *read_one_inode(struct btrfs_root *root, 430static noinline struct inode *read_one_inode(struct btrfs_root *root,
431 u64 objectid) 431 u64 objectid)
432{ 432{
433 struct btrfs_key key;
433 struct inode *inode; 434 struct inode *inode;
434 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
435 if (inode->i_state & I_NEW) {
436 BTRFS_I(inode)->root = root;
437 BTRFS_I(inode)->location.objectid = objectid;
438 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
439 BTRFS_I(inode)->location.offset = 0;
440 btrfs_read_locked_inode(inode);
441 unlock_new_inode(inode);
442 435
443 } 436 key.objectid = objectid;
444 if (is_bad_inode(inode)) { 437 key.type = BTRFS_INODE_ITEM_KEY;
438 key.offset = 0;
439 inode = btrfs_iget(root->fs_info->sb, &key, root);
440 if (IS_ERR(inode)) {
441 inode = NULL;
442 } else if (is_bad_inode(inode)) {
445 iput(inode); 443 iput(inode);
446 inode = NULL; 444 inode = NULL;
447 } 445 }
@@ -541,6 +539,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
541 539
542 if (found_type == BTRFS_FILE_EXTENT_REG || 540 if (found_type == BTRFS_FILE_EXTENT_REG ||
543 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 541 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
542 u64 offset;
544 unsigned long dest_offset; 543 unsigned long dest_offset;
545 struct btrfs_key ins; 544 struct btrfs_key ins;
546 545
@@ -555,6 +554,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
555 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 554 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
556 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 555 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
557 ins.type = BTRFS_EXTENT_ITEM_KEY; 556 ins.type = BTRFS_EXTENT_ITEM_KEY;
557 offset = key->offset - btrfs_file_extent_offset(eb, item);
558 558
559 if (ins.objectid > 0) { 559 if (ins.objectid > 0) {
560 u64 csum_start; 560 u64 csum_start;
@@ -569,19 +569,16 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
569 if (ret == 0) { 569 if (ret == 0) {
570 ret = btrfs_inc_extent_ref(trans, root, 570 ret = btrfs_inc_extent_ref(trans, root,
571 ins.objectid, ins.offset, 571 ins.objectid, ins.offset,
572 path->nodes[0]->start, 572 0, root->root_key.objectid,
573 root->root_key.objectid, 573 key->objectid, offset);
574 trans->transid, key->objectid);
575 } else { 574 } else {
576 /* 575 /*
577 * insert the extent pointer in the extent 576 * insert the extent pointer in the extent
578 * allocation tree 577 * allocation tree
579 */ 578 */
580 ret = btrfs_alloc_logged_extent(trans, root, 579 ret = btrfs_alloc_logged_file_extent(trans,
581 path->nodes[0]->start, 580 root, root->root_key.objectid,
582 root->root_key.objectid, 581 key->objectid, offset, &ins);
583 trans->transid, key->objectid,
584 &ins);
585 BUG_ON(ret); 582 BUG_ON(ret);
586 } 583 }
587 btrfs_release_path(root, path); 584 btrfs_release_path(root, path);
@@ -1706,9 +1703,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1706 btrfs_wait_tree_block_writeback(next); 1703 btrfs_wait_tree_block_writeback(next);
1707 btrfs_tree_unlock(next); 1704 btrfs_tree_unlock(next);
1708 1705
1709 ret = btrfs_drop_leaf_ref(trans, root, next);
1710 BUG_ON(ret);
1711
1712 WARN_ON(root_owner != 1706 WARN_ON(root_owner !=
1713 BTRFS_TREE_LOG_OBJECTID); 1707 BTRFS_TREE_LOG_OBJECTID);
1714 ret = btrfs_free_reserved_extent(root, 1708 ret = btrfs_free_reserved_extent(root,
@@ -1753,10 +1747,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1753 btrfs_wait_tree_block_writeback(next); 1747 btrfs_wait_tree_block_writeback(next);
1754 btrfs_tree_unlock(next); 1748 btrfs_tree_unlock(next);
1755 1749
1756 if (*level == 0) {
1757 ret = btrfs_drop_leaf_ref(trans, root, next);
1758 BUG_ON(ret);
1759 }
1760 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1750 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1761 ret = btrfs_free_reserved_extent(root, bytenr, blocksize); 1751 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1762 BUG_ON(ret); 1752 BUG_ON(ret);
@@ -1811,12 +1801,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1811 btrfs_wait_tree_block_writeback(next); 1801 btrfs_wait_tree_block_writeback(next);
1812 btrfs_tree_unlock(next); 1802 btrfs_tree_unlock(next);
1813 1803
1814 if (*level == 0) {
1815 ret = btrfs_drop_leaf_ref(trans, root,
1816 next);
1817 BUG_ON(ret);
1818 }
1819
1820 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1804 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1821 ret = btrfs_free_reserved_extent(root, 1805 ret = btrfs_free_reserved_extent(root,
1822 path->nodes[*level]->start, 1806 path->nodes[*level]->start,
@@ -1884,11 +1868,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1884 btrfs_wait_tree_block_writeback(next); 1868 btrfs_wait_tree_block_writeback(next);
1885 btrfs_tree_unlock(next); 1869 btrfs_tree_unlock(next);
1886 1870
1887 if (orig_level == 0) {
1888 ret = btrfs_drop_leaf_ref(trans, log,
1889 next);
1890 BUG_ON(ret);
1891 }
1892 WARN_ON(log->root_key.objectid != 1871 WARN_ON(log->root_key.objectid !=
1893 BTRFS_TREE_LOG_OBJECTID); 1872 BTRFS_TREE_LOG_OBJECTID);
1894 ret = btrfs_free_reserved_extent(log, next->start, 1873 ret = btrfs_free_reserved_extent(log, next->start,
@@ -2027,9 +2006,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2027 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2006 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
2028 BUG_ON(ret); 2007 BUG_ON(ret);
2029 2008
2030 btrfs_set_root_bytenr(&log->root_item, log->node->start); 2009 btrfs_set_root_node(&log->root_item, log->node);
2031 btrfs_set_root_generation(&log->root_item, trans->transid);
2032 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2033 2010
2034 root->log_batch = 0; 2011 root->log_batch = 0;
2035 root->log_transid++; 2012 root->log_transid++;
@@ -2581,7 +2558,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2581 ins_keys, ins_sizes, nr); 2558 ins_keys, ins_sizes, nr);
2582 BUG_ON(ret); 2559 BUG_ON(ret);
2583 2560
2584 for (i = 0; i < nr; i++) { 2561 for (i = 0; i < nr; i++, dst_path->slots[0]++) {
2585 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2562 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
2586 dst_path->slots[0]); 2563 dst_path->slots[0]);
2587 2564
@@ -2617,36 +2594,31 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2617 found_type = btrfs_file_extent_type(src, extent); 2594 found_type = btrfs_file_extent_type(src, extent);
2618 if (found_type == BTRFS_FILE_EXTENT_REG || 2595 if (found_type == BTRFS_FILE_EXTENT_REG ||
2619 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 2596 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
2620 u64 ds = btrfs_file_extent_disk_bytenr(src, 2597 u64 ds, dl, cs, cl;
2621 extent); 2598 ds = btrfs_file_extent_disk_bytenr(src,
2622 u64 dl = btrfs_file_extent_disk_num_bytes(src, 2599 extent);
2623 extent); 2600 /* ds == 0 is a hole */
2624 u64 cs = btrfs_file_extent_offset(src, extent); 2601 if (ds == 0)
2625 u64 cl = btrfs_file_extent_num_bytes(src, 2602 continue;
2626 extent);; 2603
2604 dl = btrfs_file_extent_disk_num_bytes(src,
2605 extent);
2606 cs = btrfs_file_extent_offset(src, extent);
2607 cl = btrfs_file_extent_num_bytes(src,
2608 extent);;
2627 if (btrfs_file_extent_compression(src, 2609 if (btrfs_file_extent_compression(src,
2628 extent)) { 2610 extent)) {
2629 cs = 0; 2611 cs = 0;
2630 cl = dl; 2612 cl = dl;
2631 } 2613 }
2632 /* ds == 0 is a hole */ 2614
2633 if (ds != 0) { 2615 ret = btrfs_lookup_csums_range(
2634 ret = btrfs_inc_extent_ref(trans, log, 2616 log->fs_info->csum_root,
2635 ds, dl, 2617 ds + cs, ds + cs + cl - 1,
2636 dst_path->nodes[0]->start, 2618 &ordered_sums);
2637 BTRFS_TREE_LOG_OBJECTID, 2619 BUG_ON(ret);
2638 trans->transid,
2639 ins_keys[i].objectid);
2640 BUG_ON(ret);
2641 ret = btrfs_lookup_csums_range(
2642 log->fs_info->csum_root,
2643 ds + cs, ds + cs + cl - 1,
2644 &ordered_sums);
2645 BUG_ON(ret);
2646 }
2647 } 2620 }
2648 } 2621 }
2649 dst_path->slots[0]++;
2650 } 2622 }
2651 2623
2652 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 2624 btrfs_mark_buffer_dirty(dst_path->nodes[0]);
@@ -3029,9 +3001,7 @@ again:
3029 BUG_ON(!wc.replay_dest); 3001 BUG_ON(!wc.replay_dest);
3030 3002
3031 wc.replay_dest->log_root = log; 3003 wc.replay_dest->log_root = log;
3032 mutex_lock(&fs_info->trans_mutex); 3004 btrfs_record_root_in_trans(trans, wc.replay_dest);
3033 btrfs_record_root_in_trans(wc.replay_dest);
3034 mutex_unlock(&fs_info->trans_mutex);
3035 ret = walk_log_tree(trans, log, &wc); 3005 ret = walk_log_tree(trans, log, &wc);
3036 BUG_ON(ret); 3006 BUG_ON(ret);
3037 3007
@@ -3049,6 +3019,7 @@ again:
3049 key.offset = found_key.offset - 1; 3019 key.offset = found_key.offset - 1;
3050 wc.replay_dest->log_root = NULL; 3020 wc.replay_dest->log_root = NULL;
3051 free_extent_buffer(log->node); 3021 free_extent_buffer(log->node);
3022 free_extent_buffer(log->commit_root);
3052 kfree(log); 3023 kfree(log);
3053 3024
3054 if (found_key.offset == 0) 3025 if (found_key.offset == 0)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a6d35b0054ca..3ab80e9cd767 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -161,8 +161,10 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
161 int again = 0; 161 int again = 0;
162 unsigned long num_run; 162 unsigned long num_run;
163 unsigned long num_sync_run; 163 unsigned long num_sync_run;
164 unsigned long batch_run = 0;
164 unsigned long limit; 165 unsigned long limit;
165 unsigned long last_waited = 0; 166 unsigned long last_waited = 0;
167 int force_reg = 0;
166 168
167 bdi = blk_get_backing_dev_info(device->bdev); 169 bdi = blk_get_backing_dev_info(device->bdev);
168 fs_info = device->dev_root->fs_info; 170 fs_info = device->dev_root->fs_info;
@@ -176,19 +178,22 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
176 178
177loop: 179loop:
178 spin_lock(&device->io_lock); 180 spin_lock(&device->io_lock);
179 num_run = 0;
180 181
181loop_lock: 182loop_lock:
183 num_run = 0;
182 184
183 /* take all the bios off the list at once and process them 185 /* take all the bios off the list at once and process them
184 * later on (without the lock held). But, remember the 186 * later on (without the lock held). But, remember the
185 * tail and other pointers so the bios can be properly reinserted 187 * tail and other pointers so the bios can be properly reinserted
186 * into the list if we hit congestion 188 * into the list if we hit congestion
187 */ 189 */
188 if (device->pending_sync_bios.head) 190 if (!force_reg && device->pending_sync_bios.head) {
189 pending_bios = &device->pending_sync_bios; 191 pending_bios = &device->pending_sync_bios;
190 else 192 force_reg = 1;
193 } else {
191 pending_bios = &device->pending_bios; 194 pending_bios = &device->pending_bios;
195 force_reg = 0;
196 }
192 197
193 pending = pending_bios->head; 198 pending = pending_bios->head;
194 tail = pending_bios->tail; 199 tail = pending_bios->tail;
@@ -228,10 +233,14 @@ loop_lock:
228 while (pending) { 233 while (pending) {
229 234
230 rmb(); 235 rmb();
231 if (pending_bios != &device->pending_sync_bios && 236 /* we want to work on both lists, but do more bios on the
232 device->pending_sync_bios.head && 237 * sync list than the regular list
233 num_run > 16) { 238 */
234 cond_resched(); 239 if ((num_run > 32 &&
240 pending_bios != &device->pending_sync_bios &&
241 device->pending_sync_bios.head) ||
242 (num_run > 64 && pending_bios == &device->pending_sync_bios &&
243 device->pending_bios.head)) {
235 spin_lock(&device->io_lock); 244 spin_lock(&device->io_lock);
236 requeue_list(pending_bios, pending, tail); 245 requeue_list(pending_bios, pending, tail);
237 goto loop_lock; 246 goto loop_lock;
@@ -249,6 +258,8 @@ loop_lock:
249 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 258 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
250 submit_bio(cur->bi_rw, cur); 259 submit_bio(cur->bi_rw, cur);
251 num_run++; 260 num_run++;
261 batch_run++;
262
252 if (bio_sync(cur)) 263 if (bio_sync(cur))
253 num_sync_run++; 264 num_sync_run++;
254 265
@@ -265,7 +276,7 @@ loop_lock:
265 * is now congested. Back off and let other work structs 276 * is now congested. Back off and let other work structs
266 * run instead 277 * run instead
267 */ 278 */
268 if (pending && bdi_write_congested(bdi) && num_run > 16 && 279 if (pending && bdi_write_congested(bdi) && batch_run > 32 &&
269 fs_info->fs_devices->open_devices > 1) { 280 fs_info->fs_devices->open_devices > 1) {
270 struct io_context *ioc; 281 struct io_context *ioc;
271 282
@@ -366,6 +377,7 @@ static noinline int device_list_add(const char *path,
366 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 377 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
367 fs_devices->latest_devid = devid; 378 fs_devices->latest_devid = devid;
368 fs_devices->latest_trans = found_transid; 379 fs_devices->latest_trans = found_transid;
380 mutex_init(&fs_devices->device_list_mutex);
369 device = NULL; 381 device = NULL;
370 } else { 382 } else {
371 device = __find_device(&fs_devices->devices, devid, 383 device = __find_device(&fs_devices->devices, devid,
@@ -392,7 +404,11 @@ static noinline int device_list_add(const char *path,
392 return -ENOMEM; 404 return -ENOMEM;
393 } 405 }
394 INIT_LIST_HEAD(&device->dev_alloc_list); 406 INIT_LIST_HEAD(&device->dev_alloc_list);
407
408 mutex_lock(&fs_devices->device_list_mutex);
395 list_add(&device->dev_list, &fs_devices->devices); 409 list_add(&device->dev_list, &fs_devices->devices);
410 mutex_unlock(&fs_devices->device_list_mutex);
411
396 device->fs_devices = fs_devices; 412 device->fs_devices = fs_devices;
397 fs_devices->num_devices++; 413 fs_devices->num_devices++;
398 } 414 }
@@ -418,10 +434,12 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
418 INIT_LIST_HEAD(&fs_devices->devices); 434 INIT_LIST_HEAD(&fs_devices->devices);
419 INIT_LIST_HEAD(&fs_devices->alloc_list); 435 INIT_LIST_HEAD(&fs_devices->alloc_list);
420 INIT_LIST_HEAD(&fs_devices->list); 436 INIT_LIST_HEAD(&fs_devices->list);
437 mutex_init(&fs_devices->device_list_mutex);
421 fs_devices->latest_devid = orig->latest_devid; 438 fs_devices->latest_devid = orig->latest_devid;
422 fs_devices->latest_trans = orig->latest_trans; 439 fs_devices->latest_trans = orig->latest_trans;
423 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); 440 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
424 441
442 mutex_lock(&orig->device_list_mutex);
425 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 443 list_for_each_entry(orig_dev, &orig->devices, dev_list) {
426 device = kzalloc(sizeof(*device), GFP_NOFS); 444 device = kzalloc(sizeof(*device), GFP_NOFS);
427 if (!device) 445 if (!device)
@@ -443,8 +461,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
443 device->fs_devices = fs_devices; 461 device->fs_devices = fs_devices;
444 fs_devices->num_devices++; 462 fs_devices->num_devices++;
445 } 463 }
464 mutex_unlock(&orig->device_list_mutex);
446 return fs_devices; 465 return fs_devices;
447error: 466error:
467 mutex_unlock(&orig->device_list_mutex);
448 free_fs_devices(fs_devices); 468 free_fs_devices(fs_devices);
449 return ERR_PTR(-ENOMEM); 469 return ERR_PTR(-ENOMEM);
450} 470}
@@ -455,6 +475,7 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
455 475
456 mutex_lock(&uuid_mutex); 476 mutex_lock(&uuid_mutex);
457again: 477again:
478 mutex_lock(&fs_devices->device_list_mutex);
458 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 479 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
459 if (device->in_fs_metadata) 480 if (device->in_fs_metadata)
460 continue; 481 continue;
@@ -474,6 +495,7 @@ again:
474 kfree(device->name); 495 kfree(device->name);
475 kfree(device); 496 kfree(device);
476 } 497 }
498 mutex_unlock(&fs_devices->device_list_mutex);
477 499
478 if (fs_devices->seed) { 500 if (fs_devices->seed) {
479 fs_devices = fs_devices->seed; 501 fs_devices = fs_devices->seed;
@@ -594,6 +616,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
594 device->in_fs_metadata = 0; 616 device->in_fs_metadata = 0;
595 device->mode = flags; 617 device->mode = flags;
596 618
619 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
620 fs_devices->rotating = 1;
621
597 fs_devices->open_devices++; 622 fs_devices->open_devices++;
598 if (device->writeable) { 623 if (device->writeable) {
599 fs_devices->rw_devices++; 624 fs_devices->rw_devices++;
@@ -1121,12 +1146,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1121 1146
1122 device = NULL; 1147 device = NULL;
1123 devices = &root->fs_info->fs_devices->devices; 1148 devices = &root->fs_info->fs_devices->devices;
1149 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1124 list_for_each_entry(tmp, devices, dev_list) { 1150 list_for_each_entry(tmp, devices, dev_list) {
1125 if (tmp->in_fs_metadata && !tmp->bdev) { 1151 if (tmp->in_fs_metadata && !tmp->bdev) {
1126 device = tmp; 1152 device = tmp;
1127 break; 1153 break;
1128 } 1154 }
1129 } 1155 }
1156 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1130 bdev = NULL; 1157 bdev = NULL;
1131 bh = NULL; 1158 bh = NULL;
1132 disk_super = NULL; 1159 disk_super = NULL;
@@ -1181,7 +1208,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1181 goto error_brelse; 1208 goto error_brelse;
1182 1209
1183 device->in_fs_metadata = 0; 1210 device->in_fs_metadata = 0;
1211
1212 /*
1213 * the device list mutex makes sure that we don't change
1214 * the device list while someone else is writing out all
1215 * the device supers.
1216 */
1217 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1184 list_del_init(&device->dev_list); 1218 list_del_init(&device->dev_list);
1219 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1220
1185 device->fs_devices->num_devices--; 1221 device->fs_devices->num_devices--;
1186 1222
1187 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1223 next_device = list_entry(root->fs_info->fs_devices->devices.next,
@@ -1275,6 +1311,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1275 seed_devices->opened = 1; 1311 seed_devices->opened = 1;
1276 INIT_LIST_HEAD(&seed_devices->devices); 1312 INIT_LIST_HEAD(&seed_devices->devices);
1277 INIT_LIST_HEAD(&seed_devices->alloc_list); 1313 INIT_LIST_HEAD(&seed_devices->alloc_list);
1314 mutex_init(&seed_devices->device_list_mutex);
1278 list_splice_init(&fs_devices->devices, &seed_devices->devices); 1315 list_splice_init(&fs_devices->devices, &seed_devices->devices);
1279 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 1316 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1280 list_for_each_entry(device, &seed_devices->devices, dev_list) { 1317 list_for_each_entry(device, &seed_devices->devices, dev_list) {
@@ -1400,6 +1437,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1400 mutex_lock(&root->fs_info->volume_mutex); 1437 mutex_lock(&root->fs_info->volume_mutex);
1401 1438
1402 devices = &root->fs_info->fs_devices->devices; 1439 devices = &root->fs_info->fs_devices->devices;
1440 /*
1441 * we have the volume lock, so we don't need the extra
1442 * device list mutex while reading the list here.
1443 */
1403 list_for_each_entry(device, devices, dev_list) { 1444 list_for_each_entry(device, devices, dev_list) {
1404 if (device->bdev == bdev) { 1445 if (device->bdev == bdev) {
1405 ret = -EEXIST; 1446 ret = -EEXIST;
@@ -1454,6 +1495,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1454 } 1495 }
1455 1496
1456 device->fs_devices = root->fs_info->fs_devices; 1497 device->fs_devices = root->fs_info->fs_devices;
1498
1499 /*
1500 * we don't want write_supers to jump in here with our device
1501 * half setup
1502 */
1503 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1457 list_add(&device->dev_list, &root->fs_info->fs_devices->devices); 1504 list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
1458 list_add(&device->dev_alloc_list, 1505 list_add(&device->dev_alloc_list,
1459 &root->fs_info->fs_devices->alloc_list); 1506 &root->fs_info->fs_devices->alloc_list);
@@ -1462,6 +1509,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1462 root->fs_info->fs_devices->rw_devices++; 1509 root->fs_info->fs_devices->rw_devices++;
1463 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1510 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1464 1511
1512 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1513 root->fs_info->fs_devices->rotating = 1;
1514
1465 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1515 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
1466 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1516 btrfs_set_super_total_bytes(&root->fs_info->super_copy,
1467 total_bytes + device->total_bytes); 1517 total_bytes + device->total_bytes);
@@ -1469,6 +1519,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1469 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1519 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
1470 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1520 btrfs_set_super_num_devices(&root->fs_info->super_copy,
1471 total_bytes + 1); 1521 total_bytes + 1);
1522 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1472 1523
1473 if (seeding_dev) { 1524 if (seeding_dev) {
1474 ret = init_first_rw_device(trans, root, device); 1525 ret = init_first_rw_device(trans, root, device);
@@ -1671,8 +1722,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1671 int ret; 1722 int ret;
1672 int i; 1723 int i;
1673 1724
1674 printk(KERN_INFO "btrfs relocating chunk %llu\n",
1675 (unsigned long long)chunk_offset);
1676 root = root->fs_info->chunk_root; 1725 root = root->fs_info->chunk_root;
1677 extent_root = root->fs_info->extent_root; 1726 extent_root = root->fs_info->extent_root;
1678 em_tree = &root->fs_info->mapping_tree.map_tree; 1727 em_tree = &root->fs_info->mapping_tree.map_tree;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5c3ff6d02fd7..5139a833f721 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -96,7 +96,12 @@ struct btrfs_fs_devices {
96 u64 rw_devices; 96 u64 rw_devices;
97 u64 total_rw_bytes; 97 u64 total_rw_bytes;
98 struct block_device *latest_bdev; 98 struct block_device *latest_bdev;
99 /* all of the devices in the FS */ 99
100 /* all of the devices in the FS, protected by a mutex
101 * so we can safely walk it to write out the supers without
102 * worrying about add/remove by the multi-device code
103 */
104 struct mutex device_list_mutex;
100 struct list_head devices; 105 struct list_head devices;
101 106
102 /* devices not currently being allocated */ 107 /* devices not currently being allocated */
@@ -107,6 +112,11 @@ struct btrfs_fs_devices {
107 int seeding; 112 int seeding;
108 113
109 int opened; 114 int opened;
115
116 /* set when we find or add a device that doesn't have the
117 * nonrot flag set
118 */
119 int rotating;
110}; 120};
111 121
112struct btrfs_bio_stripe { 122struct btrfs_bio_stripe {
diff --git a/fs/buffer.c b/fs/buffer.c
index 49106127a4aa..a3ef091a45bd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1085,12 +1085,12 @@ static struct buffer_head *
1085__getblk_slow(struct block_device *bdev, sector_t block, int size) 1085__getblk_slow(struct block_device *bdev, sector_t block, int size)
1086{ 1086{
1087 /* Size must be multiple of hard sectorsize */ 1087 /* Size must be multiple of hard sectorsize */
1088 if (unlikely(size & (bdev_hardsect_size(bdev)-1) || 1088 if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1089 (size < 512 || size > PAGE_SIZE))) { 1089 (size < 512 || size > PAGE_SIZE))) {
1090 printk(KERN_ERR "getblk(): invalid block size %d requested\n", 1090 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1091 size); 1091 size);
1092 printk(KERN_ERR "hardsect size: %d\n", 1092 printk(KERN_ERR "logical block size: %d\n",
1093 bdev_hardsect_size(bdev)); 1093 bdev_logical_block_size(bdev));
1094 1094
1095 dump_stack(); 1095 dump_stack();
1096 return NULL; 1096 return NULL;
@@ -2935,6 +2935,8 @@ int submit_bh(int rw, struct buffer_head * bh)
2935 BUG_ON(!buffer_locked(bh)); 2935 BUG_ON(!buffer_locked(bh));
2936 BUG_ON(!buffer_mapped(bh)); 2936 BUG_ON(!buffer_mapped(bh));
2937 BUG_ON(!bh->b_end_io); 2937 BUG_ON(!bh->b_end_io);
2938 BUG_ON(buffer_delay(bh));
2939 BUG_ON(buffer_unwritten(bh));
2938 2940
2939 /* 2941 /*
2940 * Mask in barrier bit for a write (could be either a WRITE or a 2942 * Mask in barrier bit for a write (could be either a WRITE or a
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 1e962348d111..431accd475a7 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -354,7 +354,9 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache)
354 /* make sure all pages pinned by operations on behalf of the netfs are 354 /* make sure all pages pinned by operations on behalf of the netfs are
355 * written to disc */ 355 * written to disc */
356 cachefiles_begin_secure(cache, &saved_cred); 356 cachefiles_begin_secure(cache, &saved_cred);
357 ret = fsync_super(cache->mnt->mnt_sb); 357 down_read(&cache->mnt->mnt_sb->s_umount);
358 ret = sync_filesystem(cache->mnt->mnt_sb);
359 up_read(&cache->mnt->mnt_sb->s_umount);
358 cachefiles_end_secure(cache, saved_cred); 360 cachefiles_end_secure(cache, saved_cred);
359 361
360 if (ret == -EIO) 362 if (ret == -EIO)
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 38f71222a552..b7c9d5187a75 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -375,7 +375,6 @@ static int chrdev_open(struct inode *inode, struct file *filp)
375 p = inode->i_cdev; 375 p = inode->i_cdev;
376 if (!p) { 376 if (!p) {
377 inode->i_cdev = p = new; 377 inode->i_cdev = p = new;
378 inode->i_cindex = idx;
379 list_add(&inode->i_devices, &p->list); 378 list_add(&inode->i_devices, &p->list);
380 new = NULL; 379 new = NULL;
381 } else if (!cdev_get(p)) 380 } else if (!cdev_get(p))
@@ -405,6 +404,18 @@ static int chrdev_open(struct inode *inode, struct file *filp)
405 return ret; 404 return ret;
406} 405}
407 406
407int cdev_index(struct inode *inode)
408{
409 int idx;
410 struct kobject *kobj;
411
412 kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
413 if (!kobj)
414 return -1;
415 kobject_put(kobj);
416 return idx;
417}
418
408void cd_forget(struct inode *inode) 419void cd_forget(struct inode *inode)
409{ 420{
410 spin_lock(&cdev_lock); 421 spin_lock(&cdev_lock);
@@ -557,6 +568,7 @@ EXPORT_SYMBOL(cdev_init);
557EXPORT_SYMBOL(cdev_alloc); 568EXPORT_SYMBOL(cdev_alloc);
558EXPORT_SYMBOL(cdev_del); 569EXPORT_SYMBOL(cdev_del);
559EXPORT_SYMBOL(cdev_add); 570EXPORT_SYMBOL(cdev_add);
571EXPORT_SYMBOL(cdev_index);
560EXPORT_SYMBOL(register_chrdev); 572EXPORT_SYMBOL(register_chrdev);
561EXPORT_SYMBOL(unregister_chrdev); 573EXPORT_SYMBOL(unregister_chrdev);
562EXPORT_SYMBOL(directly_mappable_cdev_bdi); 574EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index f20c4069c220..b48689839428 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,12 @@
1Version 1.59
2------------
3Client uses server inode numbers (which are persistent) rather than
4client generated ones by default (mount option "serverino" turned
5on by default if server supports it). Add forceuid and forcegid
6mount options (so that when negotiating unix extensions specifying
7which uid mounted does not immediately force the server's reported
8uids to be overridden).
9
1Version 1.58 10Version 1.58
2------------ 11------------
3Guard against buffer overruns in various UCS-2 to UTF-8 string conversions 12Guard against buffer overruns in various UCS-2 to UTF-8 string conversions
@@ -10,6 +19,8 @@ we converted from). Fix endianness of the vcnum field used during
10session setup to distinguish multiple mounts to same server from different 19session setup to distinguish multiple mounts to same server from different
11userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental 20userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental
12flag to be set to 2, and mount must enable krb5 to turn on extended security). 21flag to be set to 2, and mount must enable krb5 to turn on extended security).
22Performance of file create to Samba improved (posix create on lookup
23removes 1 of 2 network requests sent on file create)
13 24
14Version 1.57 25Version 1.57
15------------ 26------------
diff --git a/fs/cifs/README b/fs/cifs/README
index db208ddb9899..ad92921dbde4 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -262,7 +262,8 @@ A partial list of the supported mount options follows:
262 mount. 262 mount.
263 domain Set the SMB/CIFS workgroup name prepended to the 263 domain Set the SMB/CIFS workgroup name prepended to the
264 username during CIFS session establishment 264 username during CIFS session establishment
265 uid Set the default uid for inodes. For mounts to servers 265 forceuid Set the default uid for inodes based on the uid
266 passed in. For mounts to servers
266 which do support the CIFS Unix extensions, such as a 267 which do support the CIFS Unix extensions, such as a
267 properly configured Samba server, the server provides 268 properly configured Samba server, the server provides
268 the uid, gid and mode so this parameter should not be 269 the uid, gid and mode so this parameter should not be
@@ -292,6 +293,12 @@ A partial list of the supported mount options follows:
292 the client. Note that the mount.cifs helper must be 293 the client. Note that the mount.cifs helper must be
293 at version 1.10 or higher to support specifying the uid 294 at version 1.10 or higher to support specifying the uid
294 (or gid) in non-numeric form. 295 (or gid) in non-numeric form.
296 forcegid (similar to above but for the groupid instead of uid)
297 uid Set the default uid for inodes, and indicate to the
298 cifs kernel driver which local user mounted . If the server
299 supports the unix extensions the default uid is
300 not used to fill in the owner fields of inodes (files)
301 unless the "forceuid" parameter is specified.
295 gid Set the default gid for inodes (similar to above). 302 gid Set the default gid for inodes (similar to above).
296 file_mode If CIFS Unix extensions are not supported by the server 303 file_mode If CIFS Unix extensions are not supported by the server
297 this overrides the default mode for file inodes. 304 this overrides the default mode for file inodes.
@@ -388,8 +395,13 @@ A partial list of the supported mount options follows:
388 or the CIFS Unix Extensions equivalent and for those 395 or the CIFS Unix Extensions equivalent and for those
389 this mount option will have no effect. Exporting cifs mounts 396 this mount option will have no effect. Exporting cifs mounts
390 under nfsd requires this mount option on the cifs mount. 397 under nfsd requires this mount option on the cifs mount.
398 This is now the default if server supports the
399 required network operation.
391 noserverino Client generates inode numbers (rather than using the actual one 400 noserverino Client generates inode numbers (rather than using the actual one
392 from the server) by default. 401 from the server). These inode numbers will vary after
402 unmount or reboot which can confuse some applications,
403 but not all server filesystems support unique inode
404 numbers.
393 setuids If the CIFS Unix extensions are negotiated with the server 405 setuids If the CIFS Unix extensions are negotiated with the server
394 the client will attempt to set the effective uid and gid of 406 the client will attempt to set the effective uid and gid of
395 the local process on newly created files, directories, and 407 the local process on newly created files, directories, and
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 83d62759c7c7..3bb11be8b6a8 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -275,7 +275,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
275 case -EBUSY: 275 case -EBUSY:
276 /* someone else made a mount here whilst we were busy */ 276 /* someone else made a mount here whilst we were busy */
277 while (d_mountpoint(nd->path.dentry) && 277 while (d_mountpoint(nd->path.dentry) &&
278 follow_down(&nd->path.mnt, &nd->path.dentry)) 278 follow_down(&nd->path))
279 ; 279 ;
280 err = 0; 280 err = 0;
281 default: 281 default:
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 67bf93a40d2e..4a4581cb2b5e 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -23,6 +23,7 @@
23#include <linux/string.h> 23#include <linux/string.h>
24#include <keys/user-type.h> 24#include <keys/user-type.h>
25#include <linux/key-type.h> 25#include <linux/key-type.h>
26#include <linux/inet.h>
26#include "cifsglob.h" 27#include "cifsglob.h"
27#include "cifs_spnego.h" 28#include "cifs_spnego.h"
28#include "cifs_debug.h" 29#include "cifs_debug.h"
@@ -73,9 +74,6 @@ struct key_type cifs_spnego_key_type = {
73 * strlen(";sec=ntlmsspi") */ 74 * strlen(";sec=ntlmsspi") */
74#define MAX_MECH_STR_LEN 13 75#define MAX_MECH_STR_LEN 13
75 76
76/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/128 */
77#define MAX_IPV6_ADDR_LEN 43
78
79/* strlen of "host=" */ 77/* strlen of "host=" */
80#define HOST_KEY_LEN 5 78#define HOST_KEY_LEN 5
81 79
@@ -102,7 +100,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
102 host=hostname sec=mechanism uid=0xFF user=username */ 100 host=hostname sec=mechanism uid=0xFF user=username */
103 desc_len = MAX_VER_STR_LEN + 101 desc_len = MAX_VER_STR_LEN +
104 HOST_KEY_LEN + strlen(hostname) + 102 HOST_KEY_LEN + strlen(hostname) +
105 IP_KEY_LEN + MAX_IPV6_ADDR_LEN + 103 IP_KEY_LEN + INET6_ADDRSTRLEN +
106 MAX_MECH_STR_LEN + 104 MAX_MECH_STR_LEN +
107 UID_KEY_LEN + (sizeof(uid_t) * 2) + 105 UID_KEY_LEN + (sizeof(uid_t) * 2) +
108 USER_KEY_LEN + strlen(sesInfo->userName) + 1; 106 USER_KEY_LEN + strlen(sesInfo->userName) + 1;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 57ecdc83c26f..1403b5d86a73 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -552,130 +552,138 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
552 return rc; 552 return rc;
553} 553}
554 554
555 555static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
556/* Retrieve an ACL from the server */ 556 __u16 fid, u32 *pacllen)
557static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode,
558 const char *path, const __u16 *pfid)
559{ 557{
560 struct cifsFileInfo *open_file = NULL;
561 bool unlock_file = false;
562 int xid;
563 int rc = -EIO;
564 __u16 fid;
565 struct super_block *sb;
566 struct cifs_sb_info *cifs_sb;
567 struct cifs_ntsd *pntsd = NULL; 558 struct cifs_ntsd *pntsd = NULL;
559 int xid, rc;
560
561 xid = GetXid();
562 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
563 FreeXid(xid);
568 564
569 cFYI(1, ("get mode from ACL for %s", path));
570 565
571 if (inode == NULL) 566 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
572 return NULL; 567 return pntsd;
568}
569
570static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
571 const char *path, u32 *pacllen)
572{
573 struct cifs_ntsd *pntsd = NULL;
574 int oplock = 0;
575 int xid, rc;
576 __u16 fid;
573 577
574 xid = GetXid(); 578 xid = GetXid();
575 if (pfid == NULL)
576 open_file = find_readable_file(CIFS_I(inode));
577 else
578 fid = *pfid;
579 579
580 sb = inode->i_sb; 580 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, READ_CONTROL, 0,
581 if (sb == NULL) { 581 &fid, &oplock, NULL, cifs_sb->local_nls,
582 FreeXid(xid); 582 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
583 return NULL; 583 if (rc) {
584 } 584 cERROR(1, ("Unable to open file to get ACL"));
585 cifs_sb = CIFS_SB(sb); 585 goto out;
586
587 if (open_file) {
588 unlock_file = true;
589 fid = open_file->netfid;
590 } else if (pfid == NULL) {
591 int oplock = 0;
592 /* open file */
593 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN,
594 READ_CONTROL, 0, &fid, &oplock, NULL,
595 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
596 CIFS_MOUNT_MAP_SPECIAL_CHR);
597 if (rc != 0) {
598 cERROR(1, ("Unable to open file to get ACL"));
599 FreeXid(xid);
600 return NULL;
601 }
602 } 586 }
603 587
604 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); 588 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
605 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); 589 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
606 if (unlock_file == true) /* find_readable_file increments ref count */
607 atomic_dec(&open_file->wrtPending);
608 else if (pfid == NULL) /* if opened above we have to close the handle */
609 CIFSSMBClose(xid, cifs_sb->tcon, fid);
610 /* else handle was passed in by caller */
611 590
591 CIFSSMBClose(xid, cifs_sb->tcon, fid);
592 out:
612 FreeXid(xid); 593 FreeXid(xid);
613 return pntsd; 594 return pntsd;
614} 595}
615 596
616/* Set an ACL on the server */ 597/* Retrieve an ACL from the server */
617static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, 598static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
618 struct inode *inode, const char *path) 599 struct inode *inode, const char *path,
600 u32 *pacllen)
619{ 601{
620 struct cifsFileInfo *open_file; 602 struct cifs_ntsd *pntsd = NULL;
621 bool unlock_file = false; 603 struct cifsFileInfo *open_file = NULL;
622 int xid;
623 int rc = -EIO;
624 __u16 fid;
625 struct super_block *sb;
626 struct cifs_sb_info *cifs_sb;
627 604
628 cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode)); 605 if (inode)
606 open_file = find_readable_file(CIFS_I(inode));
607 if (!open_file)
608 return get_cifs_acl_by_path(cifs_sb, path, pacllen);
629 609
630 if (!inode) 610 pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen);
631 return rc; 611 atomic_dec(&open_file->wrtPending);
612 return pntsd;
613}
632 614
633 sb = inode->i_sb; 615static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
634 if (sb == NULL) 616 struct cifs_ntsd *pnntsd, u32 acllen)
635 return rc; 617{
618 int xid, rc;
636 619
637 cifs_sb = CIFS_SB(sb);
638 xid = GetXid(); 620 xid = GetXid();
621 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
622 FreeXid(xid);
639 623
640 open_file = find_readable_file(CIFS_I(inode)); 624 cFYI(DBG2, ("SetCIFSACL rc = %d", rc));
641 if (open_file) { 625 return rc;
642 unlock_file = true; 626}
643 fid = open_file->netfid; 627
644 } else { 628static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
645 int oplock = 0; 629 struct cifs_ntsd *pnntsd, u32 acllen)
646 /* open file */ 630{
647 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, 631 int oplock = 0;
648 WRITE_DAC, 0, &fid, &oplock, NULL, 632 int xid, rc;
649 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 633 __u16 fid;
650 CIFS_MOUNT_MAP_SPECIAL_CHR); 634
651 if (rc != 0) { 635 xid = GetXid();
652 cERROR(1, ("Unable to open file to set ACL")); 636
653 FreeXid(xid); 637 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, WRITE_DAC, 0,
654 return rc; 638 &fid, &oplock, NULL, cifs_sb->local_nls,
655 } 639 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
640 if (rc) {
641 cERROR(1, ("Unable to open file to set ACL"));
642 goto out;
656 } 643 }
657 644
658 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 645 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
659 cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); 646 cFYI(DBG2, ("SetCIFSACL rc = %d", rc));
660 if (unlock_file)
661 atomic_dec(&open_file->wrtPending);
662 else
663 CIFSSMBClose(xid, cifs_sb->tcon, fid);
664 647
648 CIFSSMBClose(xid, cifs_sb->tcon, fid);
649 out:
665 FreeXid(xid); 650 FreeXid(xid);
651 return rc;
652}
666 653
654/* Set an ACL on the server */
655static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
656 struct inode *inode, const char *path)
657{
658 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
659 struct cifsFileInfo *open_file;
660 int rc;
661
662 cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode));
663
664 open_file = find_readable_file(CIFS_I(inode));
665 if (!open_file)
666 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
667
668 rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
669 atomic_dec(&open_file->wrtPending);
667 return rc; 670 return rc;
668} 671}
669 672
670/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ 673/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
671void acl_to_uid_mode(struct inode *inode, const char *path, const __u16 *pfid) 674void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode,
675 const char *path, const __u16 *pfid)
672{ 676{
673 struct cifs_ntsd *pntsd = NULL; 677 struct cifs_ntsd *pntsd = NULL;
674 u32 acllen = 0; 678 u32 acllen = 0;
675 int rc = 0; 679 int rc = 0;
676 680
677 cFYI(DBG2, ("converting ACL to mode for %s", path)); 681 cFYI(DBG2, ("converting ACL to mode for %s", path));
678 pntsd = get_cifs_acl(&acllen, inode, path, pfid); 682
683 if (pfid)
684 pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
685 else
686 pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen);
679 687
680 /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ 688 /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
681 if (pntsd) 689 if (pntsd)
@@ -698,7 +706,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
698 cFYI(DBG2, ("set ACL from mode for %s", path)); 706 cFYI(DBG2, ("set ACL from mode for %s", path));
699 707
700 /* Get the security descriptor */ 708 /* Get the security descriptor */
701 pntsd = get_cifs_acl(&secdesclen, inode, path, NULL); 709 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
702 710
703 /* Add three ACEs for owner, group, everyone getting rid of 711 /* Add three ACEs for owner, group, everyone getting rid of
704 other ACEs as chmod disables ACEs and set the security descriptor */ 712 other ACEs as chmod disables ACEs and set the security descriptor */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 5e6d35804d73..0d92114195ab 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -146,7 +146,7 @@ cifs_read_super(struct super_block *sb, void *data,
146#endif 146#endif
147 sb->s_blocksize = CIFS_MAX_MSGSIZE; 147 sb->s_blocksize = CIFS_MAX_MSGSIZE;
148 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ 148 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
149 inode = cifs_iget(sb, ROOT_I); 149 inode = cifs_root_iget(sb, ROOT_I);
150 150
151 if (IS_ERR(inode)) { 151 if (IS_ERR(inode)) {
152 rc = PTR_ERR(inode); 152 rc = PTR_ERR(inode);
@@ -204,6 +204,9 @@ cifs_put_super(struct super_block *sb)
204 cFYI(1, ("Empty cifs superblock info passed to unmount")); 204 cFYI(1, ("Empty cifs superblock info passed to unmount"));
205 return; 205 return;
206 } 206 }
207
208 lock_kernel();
209
207 rc = cifs_umount(sb, cifs_sb); 210 rc = cifs_umount(sb, cifs_sb);
208 if (rc) 211 if (rc)
209 cERROR(1, ("cifs_umount failed with return code %d", rc)); 212 cERROR(1, ("cifs_umount failed with return code %d", rc));
@@ -216,7 +219,8 @@ cifs_put_super(struct super_block *sb)
216 219
217 unload_nls(cifs_sb->local_nls); 220 unload_nls(cifs_sb->local_nls);
218 kfree(cifs_sb); 221 kfree(cifs_sb);
219 return; 222
223 unlock_kernel();
220} 224}
221 225
222static int 226static int
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 051b71cfdea9..9570a0e8023f 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -36,7 +36,7 @@ extern void cifs_read_inode(struct inode *);
36 36
37/* Functions related to inodes */ 37/* Functions related to inodes */
38extern const struct inode_operations cifs_dir_inode_ops; 38extern const struct inode_operations cifs_dir_inode_ops;
39extern struct inode *cifs_iget(struct super_block *, unsigned long); 39extern struct inode *cifs_root_iget(struct super_block *, unsigned long);
40extern int cifs_create(struct inode *, struct dentry *, int, 40extern int cifs_create(struct inode *, struct dentry *, int,
41 struct nameidata *); 41 struct nameidata *);
42extern struct dentry *cifs_lookup(struct inode *, struct dentry *, 42extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
@@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
100extern const struct export_operations cifs_export_ops; 100extern const struct export_operations cifs_export_ops;
101#endif /* EXPERIMENTAL */ 101#endif /* EXPERIMENTAL */
102 102
103#define CIFS_VERSION "1.58" 103#define CIFS_VERSION "1.59"
104#endif /* _CIFSFS_H */ 104#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index fae083930eee..f9452329bcce 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -90,10 +90,10 @@ extern struct oplock_q_entry *AllocOplockQEntry(struct inode *, u16,
90 struct cifsTconInfo *); 90 struct cifsTconInfo *);
91extern void DeleteOplockQEntry(struct oplock_q_entry *); 91extern void DeleteOplockQEntry(struct oplock_q_entry *);
92extern void DeleteTconOplockQEntries(struct cifsTconInfo *); 92extern void DeleteTconOplockQEntries(struct cifsTconInfo *);
93extern struct timespec cifs_NTtimeToUnix(u64 utc_nanoseconds_since_1601); 93extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
94extern u64 cifs_UnixTimeToNT(struct timespec); 94extern u64 cifs_UnixTimeToNT(struct timespec);
95extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time); 95extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
96extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time); 96 int offset);
97 97
98extern int cifs_posix_open(char *full_path, struct inode **pinode, 98extern int cifs_posix_open(char *full_path, struct inode **pinode,
99 struct super_block *sb, int mode, int oflags, 99 struct super_block *sb, int mode, int oflags,
@@ -108,8 +108,8 @@ extern int cifs_get_inode_info(struct inode **pinode,
108extern int cifs_get_inode_info_unix(struct inode **pinode, 108extern int cifs_get_inode_info_unix(struct inode **pinode,
109 const unsigned char *search_path, 109 const unsigned char *search_path,
110 struct super_block *sb, int xid); 110 struct super_block *sb, int xid);
111extern void acl_to_uid_mode(struct inode *inode, const char *path, 111extern void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode,
112 const __u16 *pfid); 112 const char *path, const __u16 *pfid);
113extern int mode_to_acl(struct inode *inode, const char *path, __u64); 113extern int mode_to_acl(struct inode *inode, const char *path, __u64);
114 114
115extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *, 115extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index d06260251c30..b84c61d5bca4 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -524,8 +524,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
524 int val, seconds, remain, result; 524 int val, seconds, remain, result;
525 struct timespec ts, utc; 525 struct timespec ts, utc;
526 utc = CURRENT_TIME; 526 utc = CURRENT_TIME;
527 ts = cnvrtDosUnixTm(le16_to_cpu(rsp->SrvTime.Date), 527 ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
528 le16_to_cpu(rsp->SrvTime.Time)); 528 rsp->SrvTime.Time, 0);
529 cFYI(1, ("SrvTime %d sec since 1970 (utc: %d) diff: %d", 529 cFYI(1, ("SrvTime %d sec since 1970 (utc: %d) diff: %d",
530 (int)ts.tv_sec, (int)utc.tv_sec, 530 (int)ts.tv_sec, (int)utc.tv_sec,
531 (int)(utc.tv_sec - ts.tv_sec))); 531 (int)(utc.tv_sec - ts.tv_sec)));
@@ -2427,8 +2427,7 @@ querySymLinkRetry:
2427 params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ; 2427 params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ;
2428 pSMB->TotalDataCount = 0; 2428 pSMB->TotalDataCount = 0;
2429 pSMB->MaxParameterCount = cpu_to_le16(2); 2429 pSMB->MaxParameterCount = cpu_to_le16(2);
2430 /* BB find exact max data count below from sess structure BB */ 2430 pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
2431 pSMB->MaxDataCount = cpu_to_le16(4000);
2432 pSMB->MaxSetupCount = 0; 2431 pSMB->MaxSetupCount = 0;
2433 pSMB->Reserved = 0; 2432 pSMB->Reserved = 0;
2434 pSMB->Flags = 0; 2433 pSMB->Flags = 0;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4aa81a507b74..97f4311b9a8e 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -35,6 +35,7 @@
35#include <linux/namei.h> 35#include <linux/namei.h>
36#include <asm/uaccess.h> 36#include <asm/uaccess.h>
37#include <asm/processor.h> 37#include <asm/processor.h>
38#include <linux/inet.h>
38#include <net/ipv6.h> 39#include <net/ipv6.h>
39#include "cifspdu.h" 40#include "cifspdu.h"
40#include "cifsglob.h" 41#include "cifsglob.h"
@@ -61,7 +62,6 @@ struct smb_vol {
61 char *domainname; 62 char *domainname;
62 char *UNC; 63 char *UNC;
63 char *UNCip; 64 char *UNCip;
64 char *in6_addr; /* ipv6 address as human readable form of in6_addr */
65 char *iocharset; /* local code page for mapping to and from Unicode */ 65 char *iocharset; /* local code page for mapping to and from Unicode */
66 char source_rfc1001_name[16]; /* netbios name of client */ 66 char source_rfc1001_name[16]; /* netbios name of client */
67 char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */ 67 char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */
@@ -827,14 +827,16 @@ cifs_parse_mount_options(char *options, const char *devname,
827 vol->target_rfc1001_name[0] = 0; 827 vol->target_rfc1001_name[0] = 0;
828 vol->linux_uid = current_uid(); /* use current_euid() instead? */ 828 vol->linux_uid = current_uid(); /* use current_euid() instead? */
829 vol->linux_gid = current_gid(); 829 vol->linux_gid = current_gid();
830 vol->dir_mode = S_IRWXUGO; 830
831 /* 2767 perms indicate mandatory locking support */ 831 /* default to only allowing write access to owner of the mount */
832 vol->file_mode = (S_IRWXUGO | S_ISGID) & (~S_IXGRP); 832 vol->dir_mode = vol->file_mode = S_IRUGO | S_IXUGO | S_IWUSR;
833 833
834 /* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */ 834 /* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */
835 vol->rw = true; 835 vol->rw = true;
836 /* default is always to request posix paths. */ 836 /* default is always to request posix paths. */
837 vol->posix_paths = 1; 837 vol->posix_paths = 1;
838 /* default to using server inode numbers where available */
839 vol->server_ino = 1;
838 840
839 if (!options) 841 if (!options)
840 return 1; 842 return 1;
@@ -955,10 +957,12 @@ cifs_parse_mount_options(char *options, const char *devname,
955 } 957 }
956 strcpy(vol->password, value); 958 strcpy(vol->password, value);
957 } 959 }
958 } else if (strnicmp(data, "ip", 2) == 0) { 960 } else if (!strnicmp(data, "ip", 2) ||
961 !strnicmp(data, "addr", 4)) {
959 if (!value || !*value) { 962 if (!value || !*value) {
960 vol->UNCip = NULL; 963 vol->UNCip = NULL;
961 } else if (strnlen(value, 35) < 35) { 964 } else if (strnlen(value, INET6_ADDRSTRLEN) <
965 INET6_ADDRSTRLEN) {
962 vol->UNCip = value; 966 vol->UNCip = value;
963 } else { 967 } else {
964 printk(KERN_WARNING "CIFS: ip address " 968 printk(KERN_WARNING "CIFS: ip address "
@@ -1092,17 +1096,17 @@ cifs_parse_mount_options(char *options, const char *devname,
1092 return 1; 1096 return 1;
1093 } 1097 }
1094 } else if (strnicmp(data, "uid", 3) == 0) { 1098 } else if (strnicmp(data, "uid", 3) == 0) {
1095 if (value && *value) { 1099 if (value && *value)
1096 vol->linux_uid = 1100 vol->linux_uid =
1097 simple_strtoul(value, &value, 0); 1101 simple_strtoul(value, &value, 0);
1102 } else if (strnicmp(data, "forceuid", 8) == 0) {
1098 vol->override_uid = 1; 1103 vol->override_uid = 1;
1099 }
1100 } else if (strnicmp(data, "gid", 3) == 0) { 1104 } else if (strnicmp(data, "gid", 3) == 0) {
1101 if (value && *value) { 1105 if (value && *value)
1102 vol->linux_gid = 1106 vol->linux_gid =
1103 simple_strtoul(value, &value, 0); 1107 simple_strtoul(value, &value, 0);
1108 } else if (strnicmp(data, "forcegid", 8) == 0) {
1104 vol->override_gid = 1; 1109 vol->override_gid = 1;
1105 }
1106 } else if (strnicmp(data, "file_mode", 4) == 0) { 1110 } else if (strnicmp(data, "file_mode", 4) == 0) {
1107 if (value && *value) { 1111 if (value && *value) {
1108 vol->file_mode = 1112 vol->file_mode =
@@ -1315,16 +1319,6 @@ cifs_parse_mount_options(char *options, const char *devname,
1315 vol->direct_io = 1; 1319 vol->direct_io = 1;
1316 } else if (strnicmp(data, "forcedirectio", 13) == 0) { 1320 } else if (strnicmp(data, "forcedirectio", 13) == 0) {
1317 vol->direct_io = 1; 1321 vol->direct_io = 1;
1318 } else if (strnicmp(data, "in6_addr", 8) == 0) {
1319 if (!value || !*value) {
1320 vol->in6_addr = NULL;
1321 } else if (strnlen(value, 49) == 48) {
1322 vol->in6_addr = value;
1323 } else {
1324 printk(KERN_WARNING "CIFS: ip v6 address not "
1325 "48 characters long\n");
1326 return 1;
1327 }
1328 } else if (strnicmp(data, "noac", 4) == 0) { 1322 } else if (strnicmp(data, "noac", 4) == 0) {
1329 printk(KERN_WARNING "CIFS: Mount option noac not " 1323 printk(KERN_WARNING "CIFS: Mount option noac not "
1330 "supported. Instead set " 1324 "supported. Instead set "
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 302ea15f02e6..06866841b97f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -241,7 +241,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
241 /* BB need same check in cifs_create too? */ 241 /* BB need same check in cifs_create too? */
242 /* if not oplocked, invalidate inode pages if mtime or file 242 /* if not oplocked, invalidate inode pages if mtime or file
243 size changed */ 243 size changed */
244 temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime)); 244 temp = cifs_NTtimeToUnix(buf->LastWriteTime);
245 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) && 245 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
246 (file->f_path.dentry->d_inode->i_size == 246 (file->f_path.dentry->d_inode->i_size ==
247 (loff_t)le64_to_cpu(buf->EndOfFile))) { 247 (loff_t)le64_to_cpu(buf->EndOfFile))) {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 9c869a6dcba1..fad882b075ba 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -85,10 +85,10 @@ static void cifs_unix_info_to_inode(struct inode *inode,
85 __u64 num_of_bytes = le64_to_cpu(info->NumOfBytes); 85 __u64 num_of_bytes = le64_to_cpu(info->NumOfBytes);
86 __u64 end_of_file = le64_to_cpu(info->EndOfFile); 86 __u64 end_of_file = le64_to_cpu(info->EndOfFile);
87 87
88 inode->i_atime = cifs_NTtimeToUnix(le64_to_cpu(info->LastAccessTime)); 88 inode->i_atime = cifs_NTtimeToUnix(info->LastAccessTime);
89 inode->i_mtime = 89 inode->i_mtime =
90 cifs_NTtimeToUnix(le64_to_cpu(info->LastModificationTime)); 90 cifs_NTtimeToUnix(info->LastModificationTime);
91 inode->i_ctime = cifs_NTtimeToUnix(le64_to_cpu(info->LastStatusChange)); 91 inode->i_ctime = cifs_NTtimeToUnix(info->LastStatusChange);
92 inode->i_mode = le64_to_cpu(info->Permissions); 92 inode->i_mode = le64_to_cpu(info->Permissions);
93 93
94 /* 94 /*
@@ -554,14 +554,11 @@ int cifs_get_inode_info(struct inode **pinode,
554 554
555 /* Linux can not store file creation time so ignore it */ 555 /* Linux can not store file creation time so ignore it */
556 if (pfindData->LastAccessTime) 556 if (pfindData->LastAccessTime)
557 inode->i_atime = cifs_NTtimeToUnix 557 inode->i_atime = cifs_NTtimeToUnix(pfindData->LastAccessTime);
558 (le64_to_cpu(pfindData->LastAccessTime));
559 else /* do not need to use current_fs_time - time not stored */ 558 else /* do not need to use current_fs_time - time not stored */
560 inode->i_atime = CURRENT_TIME; 559 inode->i_atime = CURRENT_TIME;
561 inode->i_mtime = 560 inode->i_mtime = cifs_NTtimeToUnix(pfindData->LastWriteTime);
562 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime)); 561 inode->i_ctime = cifs_NTtimeToUnix(pfindData->ChangeTime);
563 inode->i_ctime =
564 cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
565 cFYI(DBG2, ("Attributes came in as 0x%x", attr)); 562 cFYI(DBG2, ("Attributes came in as 0x%x", attr));
566 if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) { 563 if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) {
567 inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj; 564 inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj;
@@ -629,7 +626,7 @@ int cifs_get_inode_info(struct inode **pinode,
629 /* fill in 0777 bits from ACL */ 626 /* fill in 0777 bits from ACL */
630 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { 627 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
631 cFYI(1, ("Getting mode bits from ACL")); 628 cFYI(1, ("Getting mode bits from ACL"));
632 acl_to_uid_mode(inode, full_path, pfid); 629 acl_to_uid_mode(cifs_sb, inode, full_path, pfid);
633 } 630 }
634#endif 631#endif
635 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { 632 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
@@ -699,7 +696,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
699} 696}
700 697
701/* gets root inode */ 698/* gets root inode */
702struct inode *cifs_iget(struct super_block *sb, unsigned long ino) 699struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
703{ 700{
704 int xid; 701 int xid;
705 struct cifs_sb_info *cifs_sb; 702 struct cifs_sb_info *cifs_sb;
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index e2fe998989a3..32d6baa0a54f 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -853,12 +853,12 @@ smbCalcSize_LE(struct smb_hdr *ptr)
853 853
854#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000) 854#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)
855 855
856 /* 856/*
857 * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units) 857 * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
858 * into Unix UTC (based 1970-01-01, in seconds). 858 * into Unix UTC (based 1970-01-01, in seconds).
859 */ 859 */
860struct timespec 860struct timespec
861cifs_NTtimeToUnix(u64 ntutc) 861cifs_NTtimeToUnix(__le64 ntutc)
862{ 862{
863 struct timespec ts; 863 struct timespec ts;
864 /* BB what about the timezone? BB */ 864 /* BB what about the timezone? BB */
@@ -866,7 +866,7 @@ cifs_NTtimeToUnix(u64 ntutc)
866 /* Subtract the NTFS time offset, then convert to 1s intervals. */ 866 /* Subtract the NTFS time offset, then convert to 1s intervals. */
867 u64 t; 867 u64 t;
868 868
869 t = ntutc - NTFS_TIME_OFFSET; 869 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
870 ts.tv_nsec = do_div(t, 10000000) * 100; 870 ts.tv_nsec = do_div(t, 10000000) * 100;
871 ts.tv_sec = t; 871 ts.tv_sec = t;
872 return ts; 872 return ts;
@@ -883,16 +883,12 @@ cifs_UnixTimeToNT(struct timespec t)
883static int total_days_of_prev_months[] = 883static int total_days_of_prev_months[] =
884{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334}; 884{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334};
885 885
886 886struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
887__le64 cnvrtDosCifsTm(__u16 date, __u16 time)
888{
889 return cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm(date, time)));
890}
891
892struct timespec cnvrtDosUnixTm(__u16 date, __u16 time)
893{ 887{
894 struct timespec ts; 888 struct timespec ts;
895 int sec, min, days, month, year; 889 int sec, min, days, month, year;
890 u16 date = le16_to_cpu(le_date);
891 u16 time = le16_to_cpu(le_time);
896 SMB_TIME *st = (SMB_TIME *)&time; 892 SMB_TIME *st = (SMB_TIME *)&time;
897 SMB_DATE *sd = (SMB_DATE *)&date; 893 SMB_DATE *sd = (SMB_DATE *)&date;
898 894
@@ -933,7 +929,7 @@ struct timespec cnvrtDosUnixTm(__u16 date, __u16 time)
933 days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0); 929 days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0);
934 sec += 24 * 60 * 60 * days; 930 sec += 24 * 60 * 60 * days;
935 931
936 ts.tv_sec = sec; 932 ts.tv_sec = sec + offset;
937 933
938 /* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */ 934 /* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */
939 935
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 964e097c8203..86d0055dc529 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -115,17 +115,6 @@ construct_dentry(struct qstr *qstring, struct file *file,
115 return rc; 115 return rc;
116} 116}
117 117
118static void AdjustForTZ(struct cifsTconInfo *tcon, struct inode *inode)
119{
120 if ((tcon) && (tcon->ses) && (tcon->ses->server)) {
121 inode->i_ctime.tv_sec += tcon->ses->server->timeAdj;
122 inode->i_mtime.tv_sec += tcon->ses->server->timeAdj;
123 inode->i_atime.tv_sec += tcon->ses->server->timeAdj;
124 }
125 return;
126}
127
128
129static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, 118static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
130 char *buf, unsigned int *pobject_type, int isNewInode) 119 char *buf, unsigned int *pobject_type, int isNewInode)
131{ 120{
@@ -150,26 +139,25 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
150 allocation_size = le64_to_cpu(pfindData->AllocationSize); 139 allocation_size = le64_to_cpu(pfindData->AllocationSize);
151 end_of_file = le64_to_cpu(pfindData->EndOfFile); 140 end_of_file = le64_to_cpu(pfindData->EndOfFile);
152 tmp_inode->i_atime = 141 tmp_inode->i_atime =
153 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime)); 142 cifs_NTtimeToUnix(pfindData->LastAccessTime);
154 tmp_inode->i_mtime = 143 tmp_inode->i_mtime =
155 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime)); 144 cifs_NTtimeToUnix(pfindData->LastWriteTime);
156 tmp_inode->i_ctime = 145 tmp_inode->i_ctime =
157 cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime)); 146 cifs_NTtimeToUnix(pfindData->ChangeTime);
158 } else { /* legacy, OS2 and DOS style */ 147 } else { /* legacy, OS2 and DOS style */
159/* struct timespec ts;*/ 148 int offset = cifs_sb->tcon->ses->server->timeAdj;
160 FIND_FILE_STANDARD_INFO *pfindData = 149 FIND_FILE_STANDARD_INFO *pfindData =
161 (FIND_FILE_STANDARD_INFO *)buf; 150 (FIND_FILE_STANDARD_INFO *)buf;
162 151
163 tmp_inode->i_mtime = cnvrtDosUnixTm( 152 tmp_inode->i_mtime = cnvrtDosUnixTm(pfindData->LastWriteDate,
164 le16_to_cpu(pfindData->LastWriteDate), 153 pfindData->LastWriteTime,
165 le16_to_cpu(pfindData->LastWriteTime)); 154 offset);
166 tmp_inode->i_atime = cnvrtDosUnixTm( 155 tmp_inode->i_atime = cnvrtDosUnixTm(pfindData->LastAccessDate,
167 le16_to_cpu(pfindData->LastAccessDate), 156 pfindData->LastAccessTime,
168 le16_to_cpu(pfindData->LastAccessTime)); 157 offset);
169 tmp_inode->i_ctime = cnvrtDosUnixTm( 158 tmp_inode->i_ctime = cnvrtDosUnixTm(pfindData->LastWriteDate,
170 le16_to_cpu(pfindData->LastWriteDate), 159 pfindData->LastWriteTime,
171 le16_to_cpu(pfindData->LastWriteTime)); 160 offset);
172 AdjustForTZ(cifs_sb->tcon, tmp_inode);
173 attr = le16_to_cpu(pfindData->Attributes); 161 attr = le16_to_cpu(pfindData->Attributes);
174 allocation_size = le32_to_cpu(pfindData->AllocationSize); 162 allocation_size = le32_to_cpu(pfindData->AllocationSize);
175 end_of_file = le32_to_cpu(pfindData->DataSize); 163 end_of_file = le32_to_cpu(pfindData->DataSize);
@@ -331,11 +319,11 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
331 local_size = tmp_inode->i_size; 319 local_size = tmp_inode->i_size;
332 320
333 tmp_inode->i_atime = 321 tmp_inode->i_atime =
334 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime)); 322 cifs_NTtimeToUnix(pfindData->LastAccessTime);
335 tmp_inode->i_mtime = 323 tmp_inode->i_mtime =
336 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastModificationTime)); 324 cifs_NTtimeToUnix(pfindData->LastModificationTime);
337 tmp_inode->i_ctime = 325 tmp_inode->i_ctime =
338 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastStatusChange)); 326 cifs_NTtimeToUnix(pfindData->LastStatusChange);
339 327
340 tmp_inode->i_mode = le64_to_cpu(pfindData->Permissions); 328 tmp_inode->i_mode = le64_to_cpu(pfindData->Permissions);
341 /* since we set the inode type below we need to mask off type 329 /* since we set the inode type below we need to mask off type
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 6a347fbc998a..ffd42815fda1 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -47,6 +47,8 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos,
47 struct pipe_inode_info *pipe, size_t count, 47 struct pipe_inode_info *pipe, size_t count,
48 unsigned int flags) 48 unsigned int flags)
49{ 49{
50 ssize_t (*splice_read)(struct file *, loff_t *,
51 struct pipe_inode_info *, size_t, unsigned int);
50 struct coda_file_info *cfi; 52 struct coda_file_info *cfi;
51 struct file *host_file; 53 struct file *host_file;
52 54
@@ -54,10 +56,11 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos,
54 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); 56 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
55 host_file = cfi->cfi_container; 57 host_file = cfi->cfi_container;
56 58
57 if (!host_file->f_op || !host_file->f_op->splice_read) 59 splice_read = host_file->f_op->splice_read;
58 return -EINVAL; 60 if (!splice_read)
61 splice_read = default_file_splice_read;
59 62
60 return host_file->f_op->splice_read(host_file, ppos, pipe, count,flags); 63 return splice_read(host_file, ppos, pipe, count, flags);
61} 64}
62 65
63static ssize_t 66static ssize_t
diff --git a/fs/compat.c b/fs/compat.c
index 681ed81e6be0..6aefb776dfeb 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -812,10 +812,8 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
812 } 812 }
813 } 813 }
814 814
815 lock_kernel();
816 retval = do_mount((char*)dev_page, dir_page, (char*)type_page, 815 retval = do_mount((char*)dev_page, dir_page, (char*)type_page,
817 flags, (void*)data_page); 816 flags, (void*)data_page);
818 unlock_kernel();
819 817
820 out4: 818 out4:
821 free_page(data_page); 819 free_page(data_page);
@@ -1488,7 +1486,7 @@ int compat_do_execve(char * filename,
1488 if (!bprm) 1486 if (!bprm)
1489 goto out_files; 1487 goto out_files;
1490 1488
1491 retval = mutex_lock_interruptible(&current->cred_exec_mutex); 1489 retval = mutex_lock_interruptible(&current->cred_guard_mutex);
1492 if (retval < 0) 1490 if (retval < 0)
1493 goto out_free; 1491 goto out_free;
1494 current->in_execve = 1; 1492 current->in_execve = 1;
@@ -1550,7 +1548,7 @@ int compat_do_execve(char * filename,
1550 /* execve succeeded */ 1548 /* execve succeeded */
1551 current->fs->in_exec = 0; 1549 current->fs->in_exec = 0;
1552 current->in_execve = 0; 1550 current->in_execve = 0;
1553 mutex_unlock(&current->cred_exec_mutex); 1551 mutex_unlock(&current->cred_guard_mutex);
1554 acct_update_integrals(current); 1552 acct_update_integrals(current);
1555 free_bprm(bprm); 1553 free_bprm(bprm);
1556 if (displaced) 1554 if (displaced)
@@ -1573,7 +1571,7 @@ out_unmark:
1573 1571
1574out_unlock: 1572out_unlock:
1575 current->in_execve = 0; 1573 current->in_execve = 0;
1576 mutex_unlock(&current->cred_exec_mutex); 1574 mutex_unlock(&current->cred_guard_mutex);
1577 1575
1578out_free: 1576out_free:
1579 free_bprm(bprm); 1577 free_bprm(bprm);
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 762d287123ca..da6061a6df40 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -39,6 +39,9 @@ struct configfs_dirent {
39 umode_t s_mode; 39 umode_t s_mode;
40 struct dentry * s_dentry; 40 struct dentry * s_dentry;
41 struct iattr * s_iattr; 41 struct iattr * s_iattr;
42#ifdef CONFIG_LOCKDEP
43 int s_depth;
44#endif
42}; 45};
43 46
44#define CONFIGFS_ROOT 0x0001 47#define CONFIGFS_ROOT 0x0001
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 05373db21a4e..8e48b52205aa 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -78,11 +78,97 @@ static const struct dentry_operations configfs_dentry_ops = {
78 .d_delete = configfs_d_delete, 78 .d_delete = configfs_d_delete,
79}; 79};
80 80
81#ifdef CONFIG_LOCKDEP
82
83/*
84 * Helpers to make lockdep happy with our recursive locking of default groups'
85 * inodes (see configfs_attach_group() and configfs_detach_group()).
86 * We put default groups i_mutexes in separate classes according to their depth
87 * from the youngest non-default group ancestor.
88 *
89 * For a non-default group A having default groups A/B, A/C, and A/C/D, default
90 * groups A/B and A/C will have their inode's mutex in class
91 * default_group_class[0], and default group A/C/D will be in
92 * default_group_class[1].
93 *
94 * The lock classes are declared and assigned in inode.c, according to the
95 * s_depth value.
96 * The s_depth value is initialized to -1, adjusted to >= 0 when attaching
97 * default groups, and reset to -1 when all default groups are attached. During
98 * attachment, if configfs_create() sees s_depth > 0, the lock class of the new
99 * inode's mutex is set to default_group_class[s_depth - 1].
100 */
101
102static void configfs_init_dirent_depth(struct configfs_dirent *sd)
103{
104 sd->s_depth = -1;
105}
106
107static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd,
108 struct configfs_dirent *sd)
109{
110 int parent_depth = parent_sd->s_depth;
111
112 if (parent_depth >= 0)
113 sd->s_depth = parent_depth + 1;
114}
115
116static void
117configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd)
118{
119 /*
120 * item's i_mutex class is already setup, so s_depth is now only
121 * used to set new sub-directories s_depth, which is always done
122 * with item's i_mutex locked.
123 */
124 /*
125 * sd->s_depth == -1 iff we are a non default group.
126 * else (we are a default group) sd->s_depth > 0 (see
127 * create_dir()).
128 */
129 if (sd->s_depth == -1)
130 /*
131 * We are a non default group and we are going to create
132 * default groups.
133 */
134 sd->s_depth = 0;
135}
136
137static void
138configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd)
139{
140 /* We will not create default groups anymore. */
141 sd->s_depth = -1;
142}
143
144#else /* CONFIG_LOCKDEP */
145
146static void configfs_init_dirent_depth(struct configfs_dirent *sd)
147{
148}
149
150static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd,
151 struct configfs_dirent *sd)
152{
153}
154
155static void
156configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd)
157{
158}
159
160static void
161configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd)
162{
163}
164
165#endif /* CONFIG_LOCKDEP */
166
81/* 167/*
82 * Allocates a new configfs_dirent and links it to the parent configfs_dirent 168 * Allocates a new configfs_dirent and links it to the parent configfs_dirent
83 */ 169 */
84static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * parent_sd, 170static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *parent_sd,
85 void * element) 171 void *element, int type)
86{ 172{
87 struct configfs_dirent * sd; 173 struct configfs_dirent * sd;
88 174
@@ -94,6 +180,8 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare
94 INIT_LIST_HEAD(&sd->s_links); 180 INIT_LIST_HEAD(&sd->s_links);
95 INIT_LIST_HEAD(&sd->s_children); 181 INIT_LIST_HEAD(&sd->s_children);
96 sd->s_element = element; 182 sd->s_element = element;
183 sd->s_type = type;
184 configfs_init_dirent_depth(sd);
97 spin_lock(&configfs_dirent_lock); 185 spin_lock(&configfs_dirent_lock);
98 if (parent_sd->s_type & CONFIGFS_USET_DROPPING) { 186 if (parent_sd->s_type & CONFIGFS_USET_DROPPING) {
99 spin_unlock(&configfs_dirent_lock); 187 spin_unlock(&configfs_dirent_lock);
@@ -138,12 +226,11 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd,
138{ 226{
139 struct configfs_dirent * sd; 227 struct configfs_dirent * sd;
140 228
141 sd = configfs_new_dirent(parent_sd, element); 229 sd = configfs_new_dirent(parent_sd, element, type);
142 if (IS_ERR(sd)) 230 if (IS_ERR(sd))
143 return PTR_ERR(sd); 231 return PTR_ERR(sd);
144 232
145 sd->s_mode = mode; 233 sd->s_mode = mode;
146 sd->s_type = type;
147 sd->s_dentry = dentry; 234 sd->s_dentry = dentry;
148 if (dentry) { 235 if (dentry) {
149 dentry->d_fsdata = configfs_get(sd); 236 dentry->d_fsdata = configfs_get(sd);
@@ -187,6 +274,7 @@ static int create_dir(struct config_item * k, struct dentry * p,
187 error = configfs_make_dirent(p->d_fsdata, d, k, mode, 274 error = configfs_make_dirent(p->d_fsdata, d, k, mode,
188 CONFIGFS_DIR | CONFIGFS_USET_CREATING); 275 CONFIGFS_DIR | CONFIGFS_USET_CREATING);
189 if (!error) { 276 if (!error) {
277 configfs_set_dir_dirent_depth(p->d_fsdata, d->d_fsdata);
190 error = configfs_create(d, mode, init_dir); 278 error = configfs_create(d, mode, init_dir);
191 if (!error) { 279 if (!error) {
192 inc_nlink(p->d_inode); 280 inc_nlink(p->d_inode);
@@ -789,11 +877,13 @@ static int configfs_attach_group(struct config_item *parent_item,
789 * error, as rmdir() would. 877 * error, as rmdir() would.
790 */ 878 */
791 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); 879 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
880 configfs_adjust_dir_dirent_depth_before_populate(sd);
792 ret = populate_groups(to_config_group(item)); 881 ret = populate_groups(to_config_group(item));
793 if (ret) { 882 if (ret) {
794 configfs_detach_item(item); 883 configfs_detach_item(item);
795 dentry->d_inode->i_flags |= S_DEAD; 884 dentry->d_inode->i_flags |= S_DEAD;
796 } 885 }
886 configfs_adjust_dir_dirent_depth_after_populate(sd);
797 mutex_unlock(&dentry->d_inode->i_mutex); 887 mutex_unlock(&dentry->d_inode->i_mutex);
798 if (ret) 888 if (ret)
799 d_delete(dentry); 889 d_delete(dentry);
@@ -916,11 +1006,11 @@ static int configfs_dump(struct configfs_dirent *sd, int level)
916 * Note, btw, that this can be called at *any* time, even when a configfs 1006 * Note, btw, that this can be called at *any* time, even when a configfs
917 * subsystem isn't registered, or when configfs is loading or unloading. 1007 * subsystem isn't registered, or when configfs is loading or unloading.
918 * Just like configfs_register_subsystem(). So we take the same 1008 * Just like configfs_register_subsystem(). So we take the same
919 * precautions. We pin the filesystem. We lock each i_mutex _in_order_ 1009 * precautions. We pin the filesystem. We lock configfs_dirent_lock.
920 * on our way down the tree. If we can find the target item in the 1010 * If we can find the target item in the
921 * configfs tree, it must be part of the subsystem tree as well, so we 1011 * configfs tree, it must be part of the subsystem tree as well, so we
922 * do not need the subsystem semaphore. Holding the i_mutex chain locks 1012 * do not need the subsystem semaphore. Holding configfs_dirent_lock helps
923 * out mkdir() and rmdir(), who might be racing us. 1013 * locking out mkdir() and rmdir(), who might be racing us.
924 */ 1014 */
925 1015
926/* 1016/*
@@ -933,17 +1023,21 @@ static int configfs_dump(struct configfs_dirent *sd, int level)
933 * do that so we can unlock it if we find nothing. 1023 * do that so we can unlock it if we find nothing.
934 * 1024 *
935 * Here we do a depth-first search of the dentry hierarchy looking for 1025 * Here we do a depth-first search of the dentry hierarchy looking for
936 * our object. We take i_mutex on each step of the way down. IT IS 1026 * our object.
937 * ESSENTIAL THAT i_mutex LOCKING IS ORDERED. If we come back up a branch, 1027 * We deliberately ignore items tagged as dropping since they are virtually
938 * we'll drop the i_mutex. 1028 * dead, as well as items in the middle of attachment since they virtually
1029 * do not exist yet. This completes the locking out of racing mkdir() and
1030 * rmdir().
1031 * Note: subdirectories in the middle of attachment start with s_type =
1032 * CONFIGFS_DIR|CONFIGFS_USET_CREATING set by create_dir(). When
1033 * CONFIGFS_USET_CREATING is set, we ignore the item. The actual set of
1034 * s_type is in configfs_new_dirent(), which has configfs_dirent_lock.
939 * 1035 *
940 * If the target is not found, -ENOENT is bubbled up and we have released 1036 * If the target is not found, -ENOENT is bubbled up.
941 * all locks. If the target was found, the locks will be cleared by
942 * configfs_depend_rollback().
943 * 1037 *
944 * This adds a requirement that all config_items be unique! 1038 * This adds a requirement that all config_items be unique!
945 * 1039 *
946 * This is recursive because the locking traversal is tricky. There isn't 1040 * This is recursive. There isn't
947 * much on the stack, though, so folks that need this function - be careful 1041 * much on the stack, though, so folks that need this function - be careful
948 * about your stack! Patches will be accepted to make it iterative. 1042 * about your stack! Patches will be accepted to make it iterative.
949 */ 1043 */
@@ -955,13 +1049,13 @@ static int configfs_depend_prep(struct dentry *origin,
955 1049
956 BUG_ON(!origin || !sd); 1050 BUG_ON(!origin || !sd);
957 1051
958 /* Lock this guy on the way down */
959 mutex_lock(&sd->s_dentry->d_inode->i_mutex);
960 if (sd->s_element == target) /* Boo-yah */ 1052 if (sd->s_element == target) /* Boo-yah */
961 goto out; 1053 goto out;
962 1054
963 list_for_each_entry(child_sd, &sd->s_children, s_sibling) { 1055 list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
964 if (child_sd->s_type & CONFIGFS_DIR) { 1056 if ((child_sd->s_type & CONFIGFS_DIR) &&
1057 !(child_sd->s_type & CONFIGFS_USET_DROPPING) &&
1058 !(child_sd->s_type & CONFIGFS_USET_CREATING)) {
965 ret = configfs_depend_prep(child_sd->s_dentry, 1059 ret = configfs_depend_prep(child_sd->s_dentry,
966 target); 1060 target);
967 if (!ret) 1061 if (!ret)
@@ -970,33 +1064,12 @@ static int configfs_depend_prep(struct dentry *origin,
970 } 1064 }
971 1065
972 /* We looped all our children and didn't find target */ 1066 /* We looped all our children and didn't find target */
973 mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
974 ret = -ENOENT; 1067 ret = -ENOENT;
975 1068
976out: 1069out:
977 return ret; 1070 return ret;
978} 1071}
979 1072
980/*
981 * This is ONLY called if configfs_depend_prep() did its job. So we can
982 * trust the entire path from item back up to origin.
983 *
984 * We walk backwards from item, unlocking each i_mutex. We finish by
985 * unlocking origin.
986 */
987static void configfs_depend_rollback(struct dentry *origin,
988 struct config_item *item)
989{
990 struct dentry *dentry = item->ci_dentry;
991
992 while (dentry != origin) {
993 mutex_unlock(&dentry->d_inode->i_mutex);
994 dentry = dentry->d_parent;
995 }
996
997 mutex_unlock(&origin->d_inode->i_mutex);
998}
999
1000int configfs_depend_item(struct configfs_subsystem *subsys, 1073int configfs_depend_item(struct configfs_subsystem *subsys,
1001 struct config_item *target) 1074 struct config_item *target)
1002{ 1075{
@@ -1037,17 +1110,21 @@ int configfs_depend_item(struct configfs_subsystem *subsys,
1037 1110
1038 /* Ok, now we can trust subsys/s_item */ 1111 /* Ok, now we can trust subsys/s_item */
1039 1112
1040 /* Scan the tree, locking i_mutex recursively, return 0 if found */ 1113 spin_lock(&configfs_dirent_lock);
1114 /* Scan the tree, return 0 if found */
1041 ret = configfs_depend_prep(subsys_sd->s_dentry, target); 1115 ret = configfs_depend_prep(subsys_sd->s_dentry, target);
1042 if (ret) 1116 if (ret)
1043 goto out_unlock_fs; 1117 goto out_unlock_dirent_lock;
1044 1118
1045 /* We hold all i_mutexes from the subsystem down to the target */ 1119 /*
1120 * We are sure that the item is not about to be removed by rmdir(), and
1121 * not in the middle of attachment by mkdir().
1122 */
1046 p = target->ci_dentry->d_fsdata; 1123 p = target->ci_dentry->d_fsdata;
1047 p->s_dependent_count += 1; 1124 p->s_dependent_count += 1;
1048 1125
1049 configfs_depend_rollback(subsys_sd->s_dentry, target); 1126out_unlock_dirent_lock:
1050 1127 spin_unlock(&configfs_dirent_lock);
1051out_unlock_fs: 1128out_unlock_fs:
1052 mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex); 1129 mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
1053 1130
@@ -1072,10 +1149,10 @@ void configfs_undepend_item(struct configfs_subsystem *subsys,
1072 struct configfs_dirent *sd; 1149 struct configfs_dirent *sd;
1073 1150
1074 /* 1151 /*
1075 * Since we can trust everything is pinned, we just need i_mutex 1152 * Since we can trust everything is pinned, we just need
1076 * on the item. 1153 * configfs_dirent_lock.
1077 */ 1154 */
1078 mutex_lock(&target->ci_dentry->d_inode->i_mutex); 1155 spin_lock(&configfs_dirent_lock);
1079 1156
1080 sd = target->ci_dentry->d_fsdata; 1157 sd = target->ci_dentry->d_fsdata;
1081 BUG_ON(sd->s_dependent_count < 1); 1158 BUG_ON(sd->s_dependent_count < 1);
@@ -1086,7 +1163,7 @@ void configfs_undepend_item(struct configfs_subsystem *subsys,
1086 * After this unlock, we cannot trust the item to stay alive! 1163 * After this unlock, we cannot trust the item to stay alive!
1087 * DO NOT REFERENCE item after this unlock. 1164 * DO NOT REFERENCE item after this unlock.
1088 */ 1165 */
1089 mutex_unlock(&target->ci_dentry->d_inode->i_mutex); 1166 spin_unlock(&configfs_dirent_lock);
1090} 1167}
1091EXPORT_SYMBOL(configfs_undepend_item); 1168EXPORT_SYMBOL(configfs_undepend_item);
1092 1169
@@ -1286,13 +1363,6 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
1286 if (sd->s_type & CONFIGFS_USET_DEFAULT) 1363 if (sd->s_type & CONFIGFS_USET_DEFAULT)
1287 return -EPERM; 1364 return -EPERM;
1288 1365
1289 /*
1290 * Here's where we check for dependents. We're protected by
1291 * i_mutex.
1292 */
1293 if (sd->s_dependent_count)
1294 return -EBUSY;
1295
1296 /* Get a working ref until we have the child */ 1366 /* Get a working ref until we have the child */
1297 parent_item = configfs_get_config_item(dentry->d_parent); 1367 parent_item = configfs_get_config_item(dentry->d_parent);
1298 subsys = to_config_group(parent_item)->cg_subsys; 1368 subsys = to_config_group(parent_item)->cg_subsys;
@@ -1316,9 +1386,17 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
1316 1386
1317 mutex_lock(&configfs_symlink_mutex); 1387 mutex_lock(&configfs_symlink_mutex);
1318 spin_lock(&configfs_dirent_lock); 1388 spin_lock(&configfs_dirent_lock);
1319 ret = configfs_detach_prep(dentry, &wait_mutex); 1389 /*
1320 if (ret) 1390 * Here's where we check for dependents. We're protected by
1321 configfs_detach_rollback(dentry); 1391 * configfs_dirent_lock.
1392 * If no dependent, atomically tag the item as dropping.
1393 */
1394 ret = sd->s_dependent_count ? -EBUSY : 0;
1395 if (!ret) {
1396 ret = configfs_detach_prep(dentry, &wait_mutex);
1397 if (ret)
1398 configfs_detach_rollback(dentry);
1399 }
1322 spin_unlock(&configfs_dirent_lock); 1400 spin_unlock(&configfs_dirent_lock);
1323 mutex_unlock(&configfs_symlink_mutex); 1401 mutex_unlock(&configfs_symlink_mutex);
1324 1402
@@ -1429,7 +1507,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
1429 */ 1507 */
1430 err = -ENOENT; 1508 err = -ENOENT;
1431 if (configfs_dirent_is_ready(parent_sd)) { 1509 if (configfs_dirent_is_ready(parent_sd)) {
1432 file->private_data = configfs_new_dirent(parent_sd, NULL); 1510 file->private_data = configfs_new_dirent(parent_sd, NULL, 0);
1433 if (IS_ERR(file->private_data)) 1511 if (IS_ERR(file->private_data))
1434 err = PTR_ERR(file->private_data); 1512 err = PTR_ERR(file->private_data);
1435 else 1513 else
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 5d349d38e056..4921e7426d95 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -33,10 +33,15 @@
33#include <linux/backing-dev.h> 33#include <linux/backing-dev.h>
34#include <linux/capability.h> 34#include <linux/capability.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/lockdep.h>
36 37
37#include <linux/configfs.h> 38#include <linux/configfs.h>
38#include "configfs_internal.h" 39#include "configfs_internal.h"
39 40
41#ifdef CONFIG_LOCKDEP
42static struct lock_class_key default_group_class[MAX_LOCK_DEPTH];
43#endif
44
40extern struct super_block * configfs_sb; 45extern struct super_block * configfs_sb;
41 46
42static const struct address_space_operations configfs_aops = { 47static const struct address_space_operations configfs_aops = {
@@ -150,6 +155,38 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
150 return inode; 155 return inode;
151} 156}
152 157
158#ifdef CONFIG_LOCKDEP
159
160static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
161 struct inode *inode)
162{
163 int depth = sd->s_depth;
164
165 if (depth > 0) {
166 if (depth <= ARRAY_SIZE(default_group_class)) {
167 lockdep_set_class(&inode->i_mutex,
168 &default_group_class[depth - 1]);
169 } else {
170 /*
171 * In practice the maximum level of locking depth is
172 * already reached. Just inform about possible reasons.
173 */
174 printk(KERN_INFO "configfs: Too many levels of inodes"
175 " for the locking correctness validator.\n");
176 printk(KERN_INFO "Spurious warnings may appear.\n");
177 }
178 }
179}
180
181#else /* CONFIG_LOCKDEP */
182
183static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
184 struct inode *inode)
185{
186}
187
188#endif /* CONFIG_LOCKDEP */
189
153int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *)) 190int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *))
154{ 191{
155 int error = 0; 192 int error = 0;
@@ -162,6 +199,7 @@ int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *
162 struct inode *p_inode = dentry->d_parent->d_inode; 199 struct inode *p_inode = dentry->d_parent->d_inode;
163 p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; 200 p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
164 } 201 }
202 configfs_set_inode_lock_class(sd, inode);
165 goto Proceed; 203 goto Proceed;
166 } 204 }
167 else 205 else
diff --git a/fs/dcache.c b/fs/dcache.c
index 75659a6fd1f8..9e5cd3c3a6ba 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1910,7 +1910,7 @@ char *__d_path(const struct path *path, struct path *root,
1910 1910
1911 spin_lock(&vfsmount_lock); 1911 spin_lock(&vfsmount_lock);
1912 prepend(&end, &buflen, "\0", 1); 1912 prepend(&end, &buflen, "\0", 1);
1913 if (!IS_ROOT(dentry) && d_unhashed(dentry) && 1913 if (d_unlinked(dentry) &&
1914 (prepend(&end, &buflen, " (deleted)", 10) != 0)) 1914 (prepend(&end, &buflen, " (deleted)", 10) != 0))
1915 goto Elong; 1915 goto Elong;
1916 1916
@@ -2035,7 +2035,7 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
2035 2035
2036 spin_lock(&dcache_lock); 2036 spin_lock(&dcache_lock);
2037 prepend(&end, &buflen, "\0", 1); 2037 prepend(&end, &buflen, "\0", 1);
2038 if (!IS_ROOT(dentry) && d_unhashed(dentry) && 2038 if (d_unlinked(dentry) &&
2039 (prepend(&end, &buflen, "//deleted", 9) != 0)) 2039 (prepend(&end, &buflen, "//deleted", 9) != 0))
2040 goto Elong; 2040 goto Elong;
2041 if (buflen < 1) 2041 if (buflen < 1)
@@ -2097,9 +2097,8 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2097 read_unlock(&current->fs->lock); 2097 read_unlock(&current->fs->lock);
2098 2098
2099 error = -ENOENT; 2099 error = -ENOENT;
2100 /* Has the current directory has been unlinked? */
2101 spin_lock(&dcache_lock); 2100 spin_lock(&dcache_lock);
2102 if (IS_ROOT(pwd.dentry) || !d_unhashed(pwd.dentry)) { 2101 if (!d_unlinked(pwd.dentry)) {
2103 unsigned long len; 2102 unsigned long len;
2104 struct path tmp = root; 2103 struct path tmp = root;
2105 char * cwd; 2104 char * cwd;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c68edb969441..9b1d285f9fe6 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -557,8 +557,10 @@ static int __init init_devpts_fs(void)
557 int err = register_filesystem(&devpts_fs_type); 557 int err = register_filesystem(&devpts_fs_type);
558 if (!err) { 558 if (!err) {
559 devpts_mnt = kern_mount(&devpts_fs_type); 559 devpts_mnt = kern_mount(&devpts_fs_type);
560 if (IS_ERR(devpts_mnt)) 560 if (IS_ERR(devpts_mnt)) {
561 err = PTR_ERR(devpts_mnt); 561 err = PTR_ERR(devpts_mnt);
562 unregister_filesystem(&devpts_fs_type);
563 }
562 } 564 }
563 return err; 565 return err;
564} 566}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 05763bbc2050..8b10b87dc01a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1127,7 +1127,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1127 rw = WRITE_ODIRECT; 1127 rw = WRITE_ODIRECT;
1128 1128
1129 if (bdev) 1129 if (bdev)
1130 bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); 1130 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
1131 1131
1132 if (offset & blocksize_mask) { 1132 if (offset & blocksize_mask) {
1133 if (bdev) 1133 if (bdev)
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 858fba14aaa6..c4dfa1dcc86f 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -49,7 +49,8 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
49 spin_unlock(&ls->ls_recover_list_lock); 49 spin_unlock(&ls->ls_recover_list_lock);
50 50
51 if (!found) 51 if (!found)
52 de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_KERNEL); 52 de = kzalloc(sizeof(struct dlm_direntry) + len,
53 ls->ls_allocation);
53 return de; 54 return de;
54} 55}
55 56
@@ -211,7 +212,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
211 212
212 dlm_dir_clear(ls); 213 dlm_dir_clear(ls);
213 214
214 last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL); 215 last_name = kmalloc(DLM_RESNAME_MAXLEN, ls->ls_allocation);
215 if (!last_name) 216 if (!last_name)
216 goto out; 217 goto out;
217 218
@@ -322,7 +323,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
322 if (namelen > DLM_RESNAME_MAXLEN) 323 if (namelen > DLM_RESNAME_MAXLEN)
323 return -EINVAL; 324 return -EINVAL;
324 325
325 de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_KERNEL); 326 de = kzalloc(sizeof(struct dlm_direntry) + namelen, ls->ls_allocation);
326 if (!de) 327 if (!de)
327 return -ENOMEM; 328 return -ENOMEM;
328 329
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index cd8e2df3c295..d489fcc86713 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -384,7 +384,7 @@ static void threads_stop(void)
384 dlm_astd_stop(); 384 dlm_astd_stop();
385} 385}
386 386
387static int new_lockspace(char *name, int namelen, void **lockspace, 387static int new_lockspace(const char *name, int namelen, void **lockspace,
388 uint32_t flags, int lvblen) 388 uint32_t flags, int lvblen)
389{ 389{
390 struct dlm_ls *ls; 390 struct dlm_ls *ls;
@@ -419,16 +419,14 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
419 break; 419 break;
420 } 420 }
421 ls->ls_create_count++; 421 ls->ls_create_count++;
422 module_put(THIS_MODULE); 422 *lockspace = ls;
423 error = 1; /* not an error, return 0 */ 423 error = 1;
424 break; 424 break;
425 } 425 }
426 spin_unlock(&lslist_lock); 426 spin_unlock(&lslist_lock);
427 427
428 if (error < 0)
429 goto out;
430 if (error) 428 if (error)
431 goto ret_zero; 429 goto out;
432 430
433 error = -ENOMEM; 431 error = -ENOMEM;
434 432
@@ -583,7 +581,6 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
583 dlm_create_debug_file(ls); 581 dlm_create_debug_file(ls);
584 582
585 log_debug(ls, "join complete"); 583 log_debug(ls, "join complete");
586 ret_zero:
587 *lockspace = ls; 584 *lockspace = ls;
588 return 0; 585 return 0;
589 586
@@ -614,7 +611,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
614 return error; 611 return error;
615} 612}
616 613
617int dlm_new_lockspace(char *name, int namelen, void **lockspace, 614int dlm_new_lockspace(const char *name, int namelen, void **lockspace,
618 uint32_t flags, int lvblen) 615 uint32_t flags, int lvblen)
619{ 616{
620 int error = 0; 617 int error = 0;
@@ -628,7 +625,9 @@ int dlm_new_lockspace(char *name, int namelen, void **lockspace,
628 error = new_lockspace(name, namelen, lockspace, flags, lvblen); 625 error = new_lockspace(name, namelen, lockspace, flags, lvblen);
629 if (!error) 626 if (!error)
630 ls_count++; 627 ls_count++;
631 else if (!ls_count) 628 if (error > 0)
629 error = 0;
630 if (!ls_count)
632 threads_stop(); 631 threads_stop();
633 out: 632 out:
634 mutex_unlock(&ls_lock); 633 mutex_unlock(&ls_lock);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 609108a83267..cdb580a9c7a2 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -309,6 +309,20 @@ static void lowcomms_state_change(struct sock *sk)
309 lowcomms_write_space(sk); 309 lowcomms_write_space(sk);
310} 310}
311 311
312int dlm_lowcomms_connect_node(int nodeid)
313{
314 struct connection *con;
315
316 if (nodeid == dlm_our_nodeid())
317 return 0;
318
319 con = nodeid2con(nodeid, GFP_NOFS);
320 if (!con)
321 return -ENOMEM;
322 lowcomms_connect_sock(con);
323 return 0;
324}
325
312/* Make a socket active */ 326/* Make a socket active */
313static int add_sock(struct socket *sock, struct connection *con) 327static int add_sock(struct socket *sock, struct connection *con)
314{ 328{
@@ -486,7 +500,7 @@ static void process_sctp_notification(struct connection *con,
486 return; 500 return;
487 } 501 }
488 502
489 new_con = nodeid2con(nodeid, GFP_KERNEL); 503 new_con = nodeid2con(nodeid, GFP_NOFS);
490 if (!new_con) 504 if (!new_con)
491 return; 505 return;
492 506
@@ -722,7 +736,7 @@ static int tcp_accept_from_sock(struct connection *con)
722 * the same time and the connections cross on the wire. 736 * the same time and the connections cross on the wire.
723 * In this case we store the incoming one in "othercon" 737 * In this case we store the incoming one in "othercon"
724 */ 738 */
725 newcon = nodeid2con(nodeid, GFP_KERNEL); 739 newcon = nodeid2con(nodeid, GFP_NOFS);
726 if (!newcon) { 740 if (!newcon) {
727 result = -ENOMEM; 741 result = -ENOMEM;
728 goto accept_err; 742 goto accept_err;
@@ -732,7 +746,7 @@ static int tcp_accept_from_sock(struct connection *con)
732 struct connection *othercon = newcon->othercon; 746 struct connection *othercon = newcon->othercon;
733 747
734 if (!othercon) { 748 if (!othercon) {
735 othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL); 749 othercon = kmem_cache_zalloc(con_cache, GFP_NOFS);
736 if (!othercon) { 750 if (!othercon) {
737 log_print("failed to allocate incoming socket"); 751 log_print("failed to allocate incoming socket");
738 mutex_unlock(&newcon->sock_mutex); 752 mutex_unlock(&newcon->sock_mutex);
@@ -1421,7 +1435,7 @@ static int work_start(void)
1421static void stop_conn(struct connection *con) 1435static void stop_conn(struct connection *con)
1422{ 1436{
1423 con->flags |= 0x0F; 1437 con->flags |= 0x0F;
1424 if (con->sock) 1438 if (con->sock && con->sock->sk)
1425 con->sock->sk->sk_user_data = NULL; 1439 con->sock->sk->sk_user_data = NULL;
1426} 1440}
1427 1441
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
index a9a9618c0d3f..1311e6426287 100644
--- a/fs/dlm/lowcomms.h
+++ b/fs/dlm/lowcomms.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -19,6 +19,7 @@ void dlm_lowcomms_stop(void);
19int dlm_lowcomms_close(int nodeid); 19int dlm_lowcomms_close(int nodeid);
20void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc); 20void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc);
21void dlm_lowcomms_commit_buffer(void *mh); 21void dlm_lowcomms_commit_buffer(void *mh);
22int dlm_lowcomms_connect_node(int nodeid);
22 23
23#endif /* __LOWCOMMS_DOT_H__ */ 24#endif /* __LOWCOMMS_DOT_H__ */
24 25
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 26133f05ae3a..b128775913b2 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -17,6 +17,7 @@
17#include "recover.h" 17#include "recover.h"
18#include "rcom.h" 18#include "rcom.h"
19#include "config.h" 19#include "config.h"
20#include "lowcomms.h"
20 21
21static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) 22static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
22{ 23{
@@ -45,9 +46,9 @@ static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
45static int dlm_add_member(struct dlm_ls *ls, int nodeid) 46static int dlm_add_member(struct dlm_ls *ls, int nodeid)
46{ 47{
47 struct dlm_member *memb; 48 struct dlm_member *memb;
48 int w; 49 int w, error;
49 50
50 memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL); 51 memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation);
51 if (!memb) 52 if (!memb)
52 return -ENOMEM; 53 return -ENOMEM;
53 54
@@ -57,6 +58,12 @@ static int dlm_add_member(struct dlm_ls *ls, int nodeid)
57 return w; 58 return w;
58 } 59 }
59 60
61 error = dlm_lowcomms_connect_node(nodeid);
62 if (error < 0) {
63 kfree(memb);
64 return error;
65 }
66
60 memb->nodeid = nodeid; 67 memb->nodeid = nodeid;
61 memb->weight = w; 68 memb->weight = w;
62 add_ordered_member(ls, memb); 69 add_ordered_member(ls, memb);
@@ -136,7 +143,7 @@ static void make_member_array(struct dlm_ls *ls)
136 143
137 ls->ls_total_weight = total; 144 ls->ls_total_weight = total;
138 145
139 array = kmalloc(sizeof(int) * total, GFP_KERNEL); 146 array = kmalloc(sizeof(int) * total, ls->ls_allocation);
140 if (!array) 147 if (!array)
141 return; 148 return;
142 149
@@ -219,7 +226,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
219 continue; 226 continue;
220 log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]); 227 log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]);
221 228
222 memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL); 229 memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation);
223 if (!memb) 230 if (!memb)
224 return -ENOMEM; 231 return -ENOMEM;
225 memb->nodeid = rv->new[i]; 232 memb->nodeid = rv->new[i];
@@ -334,7 +341,7 @@ int dlm_ls_start(struct dlm_ls *ls)
334 int *ids = NULL, *new = NULL; 341 int *ids = NULL, *new = NULL;
335 int error, ids_count = 0, new_count = 0; 342 int error, ids_count = 0, new_count = 0;
336 343
337 rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL); 344 rv = kzalloc(sizeof(struct dlm_recover), ls->ls_allocation);
338 if (!rv) 345 if (!rv)
339 return -ENOMEM; 346 return -ENOMEM;
340 347
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index daa4183fbb84..7a2307c08911 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -35,7 +35,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
35 struct rq_entry *e; 35 struct rq_entry *e;
36 int length = ms->m_header.h_length - sizeof(struct dlm_message); 36 int length = ms->m_header.h_length - sizeof(struct dlm_message);
37 37
38 e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL); 38 e = kmalloc(sizeof(struct rq_entry) + length, ls->ls_allocation);
39 if (!e) { 39 if (!e) {
40 log_print("dlm_add_requestqueue: out of memory len %d", length); 40 log_print("dlm_add_requestqueue: out of memory len %d", length);
41 return; 41 return;
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index fa4c7e7d15d9..12d649602d3a 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -27,6 +27,7 @@
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/key.h> 28#include <linux/key.h>
29#include <linux/seq_file.h> 29#include <linux/seq_file.h>
30#include <linux/smp_lock.h>
30#include <linux/file.h> 31#include <linux/file.h>
31#include <linux/crypto.h> 32#include <linux/crypto.h>
32#include "ecryptfs_kernel.h" 33#include "ecryptfs_kernel.h"
@@ -120,9 +121,13 @@ static void ecryptfs_put_super(struct super_block *sb)
120{ 121{
121 struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb); 122 struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
122 123
124 lock_kernel();
125
123 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); 126 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
124 kmem_cache_free(ecryptfs_sb_info_cache, sb_info); 127 kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
125 ecryptfs_set_superblock_private(sb, NULL); 128 ecryptfs_set_superblock_private(sb, NULL);
129
130 unlock_kernel();
126} 131}
127 132
128/** 133/**
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 2a701d593d35..3f0e1974abdc 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -16,6 +16,7 @@
16#include <linux/anon_inodes.h> 16#include <linux/anon_inodes.h>
17#include <linux/eventfd.h> 17#include <linux/eventfd.h>
18#include <linux/syscalls.h> 18#include <linux/syscalls.h>
19#include <linux/module.h>
19 20
20struct eventfd_ctx { 21struct eventfd_ctx {
21 wait_queue_head_t wqh; 22 wait_queue_head_t wqh;
@@ -56,6 +57,7 @@ int eventfd_signal(struct file *file, int n)
56 57
57 return n; 58 return n;
58} 59}
60EXPORT_SYMBOL_GPL(eventfd_signal);
59 61
60static int eventfd_release(struct inode *inode, struct file *file) 62static int eventfd_release(struct inode *inode, struct file *file)
61{ 63{
@@ -197,6 +199,7 @@ struct file *eventfd_fget(int fd)
197 199
198 return file; 200 return file;
199} 201}
202EXPORT_SYMBOL_GPL(eventfd_fget);
200 203
201SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) 204SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
202{ 205{
diff --git a/fs/exec.c b/fs/exec.c
index 895823d0149d..e639957d7a57 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,6 +33,7 @@
33#include <linux/string.h> 33#include <linux/string.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/perf_counter.h>
36#include <linux/highmem.h> 37#include <linux/highmem.h>
37#include <linux/spinlock.h> 38#include <linux/spinlock.h>
38#include <linux/key.h> 39#include <linux/key.h>
@@ -922,6 +923,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
922 task_lock(tsk); 923 task_lock(tsk);
923 strlcpy(tsk->comm, buf, sizeof(tsk->comm)); 924 strlcpy(tsk->comm, buf, sizeof(tsk->comm));
924 task_unlock(tsk); 925 task_unlock(tsk);
926 perf_counter_comm(tsk);
925} 927}
926 928
927int flush_old_exec(struct linux_binprm * bprm) 929int flush_old_exec(struct linux_binprm * bprm)
@@ -990,6 +992,13 @@ int flush_old_exec(struct linux_binprm * bprm)
990 992
991 current->personality &= ~bprm->per_clear; 993 current->personality &= ~bprm->per_clear;
992 994
995 /*
996 * Flush performance counters when crossing a
997 * security domain:
998 */
999 if (!get_dumpable(current->mm))
1000 perf_counter_exit_task(current);
1001
993 /* An exec changes our domain. We are no longer part of the thread 1002 /* An exec changes our domain. We are no longer part of the thread
994 group */ 1003 group */
995 1004
@@ -1016,7 +1025,7 @@ void install_exec_creds(struct linux_binprm *bprm)
1016 commit_creds(bprm->cred); 1025 commit_creds(bprm->cred);
1017 bprm->cred = NULL; 1026 bprm->cred = NULL;
1018 1027
1019 /* cred_exec_mutex must be held at least to this point to prevent 1028 /* cred_guard_mutex must be held at least to this point to prevent
1020 * ptrace_attach() from altering our determination of the task's 1029 * ptrace_attach() from altering our determination of the task's
1021 * credentials; any time after this it may be unlocked */ 1030 * credentials; any time after this it may be unlocked */
1022 1031
@@ -1026,7 +1035,7 @@ EXPORT_SYMBOL(install_exec_creds);
1026 1035
1027/* 1036/*
1028 * determine how safe it is to execute the proposed program 1037 * determine how safe it is to execute the proposed program
1029 * - the caller must hold current->cred_exec_mutex to protect against 1038 * - the caller must hold current->cred_guard_mutex to protect against
1030 * PTRACE_ATTACH 1039 * PTRACE_ATTACH
1031 */ 1040 */
1032int check_unsafe_exec(struct linux_binprm *bprm) 1041int check_unsafe_exec(struct linux_binprm *bprm)
@@ -1268,7 +1277,7 @@ int do_execve(char * filename,
1268 if (!bprm) 1277 if (!bprm)
1269 goto out_files; 1278 goto out_files;
1270 1279
1271 retval = mutex_lock_interruptible(&current->cred_exec_mutex); 1280 retval = mutex_lock_interruptible(&current->cred_guard_mutex);
1272 if (retval < 0) 1281 if (retval < 0)
1273 goto out_free; 1282 goto out_free;
1274 current->in_execve = 1; 1283 current->in_execve = 1;
@@ -1331,7 +1340,7 @@ int do_execve(char * filename,
1331 /* execve succeeded */ 1340 /* execve succeeded */
1332 current->fs->in_exec = 0; 1341 current->fs->in_exec = 0;
1333 current->in_execve = 0; 1342 current->in_execve = 0;
1334 mutex_unlock(&current->cred_exec_mutex); 1343 mutex_unlock(&current->cred_guard_mutex);
1335 acct_update_integrals(current); 1344 acct_update_integrals(current);
1336 free_bprm(bprm); 1345 free_bprm(bprm);
1337 if (displaced) 1346 if (displaced)
@@ -1354,7 +1363,7 @@ out_unmark:
1354 1363
1355out_unlock: 1364out_unlock:
1356 current->in_execve = 0; 1365 current->in_execve = 0;
1357 mutex_unlock(&current->cred_exec_mutex); 1366 mutex_unlock(&current->cred_guard_mutex);
1358 1367
1359out_free: 1368out_free:
1360 free_bprm(bprm); 1369 free_bprm(bprm);
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index b1512c4bb8c7..24667eedc023 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -175,10 +175,4 @@ int exofs_async_op(struct osd_request *or,
175 175
176int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr); 176int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
177 177
178int osd_req_read_kern(struct osd_request *or,
179 const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
180
181int osd_req_write_kern(struct osd_request *or,
182 const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
183
184#endif /*ifndef __EXOFS_COM_H__*/ 178#endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index ba8d9fab4693..77d0a295eb1c 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -59,10 +59,9 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
59 struct inode *inode) 59 struct inode *inode)
60{ 60{
61 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; 61 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
62 struct request_queue *req_q = sbi->s_dev->scsi_device->request_queue;
63 62
64 pcol->sbi = sbi; 63 pcol->sbi = sbi;
65 pcol->req_q = req_q; 64 pcol->req_q = osd_request_queue(sbi->s_dev);
66 pcol->inode = inode; 65 pcol->inode = inode;
67 pcol->expected_pages = expected_pages; 66 pcol->expected_pages = expected_pages;
68 67
@@ -266,7 +265,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
266 goto err; 265 goto err;
267 } 266 }
268 267
269 osd_req_read(or, &obj, pcol->bio, i_start); 268 osd_req_read(or, &obj, i_start, pcol->bio, pcol->length);
270 269
271 if (is_sync) { 270 if (is_sync) {
272 exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred); 271 exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
@@ -522,7 +521,8 @@ static int write_exec(struct page_collect *pcol)
522 521
523 *pcol_copy = *pcol; 522 *pcol_copy = *pcol;
524 523
525 osd_req_write(or, &obj, pcol_copy->bio, i_start); 524 pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */
525 osd_req_write(or, &obj, i_start, pcol_copy->bio, pcol_copy->length);
526 ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred); 526 ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
527 if (unlikely(ret)) { 527 if (unlikely(ret)) {
528 EXOFS_ERR("write_exec: exofs_async_op() Faild\n"); 528 EXOFS_ERR("write_exec: exofs_async_op() Faild\n");
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
index b249ae97fb15..b3d2ccb87aaa 100644
--- a/fs/exofs/osd.c
+++ b/fs/exofs/osd.c
@@ -50,10 +50,10 @@ int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
50 50
51 /* FIXME: should be include in osd_sense_info */ 51 /* FIXME: should be include in osd_sense_info */
52 if (in_resid) 52 if (in_resid)
53 *in_resid = or->in.req ? or->in.req->data_len : 0; 53 *in_resid = or->in.req ? or->in.req->resid_len : 0;
54 54
55 if (out_resid) 55 if (out_resid)
56 *out_resid = or->out.req ? or->out.req->data_len : 0; 56 *out_resid = or->out.req ? or->out.req->resid_len : 0;
57 57
58 return ret; 58 return ret;
59} 59}
@@ -125,29 +125,3 @@ int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
125 125
126 return -EIO; 126 return -EIO;
127} 127}
128
129int osd_req_read_kern(struct osd_request *or,
130 const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
131{
132 struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
133 struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
134
135 if (!bio)
136 return -ENOMEM;
137
138 osd_req_read(or, obj, bio, offset);
139 return 0;
140}
141
142int osd_req_write_kern(struct osd_request *or,
143 const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
144{
145 struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
146 struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
147
148 if (!bio)
149 return -ENOMEM;
150
151 osd_req_write(or, obj, bio, offset);
152 return 0;
153}
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 9f1985e857e2..8216c5b77b53 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -200,20 +200,21 @@ static const struct export_operations exofs_export_ops;
200/* 200/*
201 * Write the superblock to the OSD 201 * Write the superblock to the OSD
202 */ 202 */
203static void exofs_write_super(struct super_block *sb) 203static int exofs_sync_fs(struct super_block *sb, int wait)
204{ 204{
205 struct exofs_sb_info *sbi; 205 struct exofs_sb_info *sbi;
206 struct exofs_fscb *fscb; 206 struct exofs_fscb *fscb;
207 struct osd_request *or; 207 struct osd_request *or;
208 struct osd_obj_id obj; 208 struct osd_obj_id obj;
209 int ret; 209 int ret = -ENOMEM;
210 210
211 fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL); 211 fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
212 if (!fscb) { 212 if (!fscb) {
213 EXOFS_ERR("exofs_write_super: memory allocation failed.\n"); 213 EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
214 return; 214 return -ENOMEM;
215 } 215 }
216 216
217 lock_super(sb);
217 lock_kernel(); 218 lock_kernel();
218 sbi = sb->s_fs_info; 219 sbi = sb->s_fs_info;
219 fscb->s_nextid = cpu_to_le64(sbi->s_nextid); 220 fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
@@ -246,7 +247,17 @@ out:
246 if (or) 247 if (or)
247 osd_end_request(or); 248 osd_end_request(or);
248 unlock_kernel(); 249 unlock_kernel();
250 unlock_super(sb);
249 kfree(fscb); 251 kfree(fscb);
252 return ret;
253}
254
255static void exofs_write_super(struct super_block *sb)
256{
257 if (!(sb->s_flags & MS_RDONLY))
258 exofs_sync_fs(sb, 1);
259 else
260 sb->s_dirt = 0;
250} 261}
251 262
252/* 263/*
@@ -258,6 +269,11 @@ static void exofs_put_super(struct super_block *sb)
258 int num_pend; 269 int num_pend;
259 struct exofs_sb_info *sbi = sb->s_fs_info; 270 struct exofs_sb_info *sbi = sb->s_fs_info;
260 271
272 lock_kernel();
273
274 if (sb->s_dirt)
275 exofs_write_super(sb);
276
261 /* make sure there are no pending commands */ 277 /* make sure there are no pending commands */
262 for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; 278 for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
263 num_pend = atomic_read(&sbi->s_curr_pending)) { 279 num_pend = atomic_read(&sbi->s_curr_pending)) {
@@ -271,6 +287,8 @@ static void exofs_put_super(struct super_block *sb)
271 osduld_put_device(sbi->s_dev); 287 osduld_put_device(sbi->s_dev);
272 kfree(sb->s_fs_info); 288 kfree(sb->s_fs_info);
273 sb->s_fs_info = NULL; 289 sb->s_fs_info = NULL;
290
291 unlock_kernel();
274} 292}
275 293
276/* 294/*
@@ -484,6 +502,7 @@ static const struct super_operations exofs_sops = {
484 .delete_inode = exofs_delete_inode, 502 .delete_inode = exofs_delete_inode,
485 .put_super = exofs_put_super, 503 .put_super = exofs_put_super,
486 .write_super = exofs_write_super, 504 .write_super = exofs_write_super,
505 .sync_fs = exofs_sync_fs,
487 .statfs = exofs_statfs, 506 .statfs = exofs_statfs,
488}; 507};
489 508
diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
index e0b2b43c1fdb..f42af45cfd88 100644
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-$(CONFIG_EXT2_FS) += ext2.o 5obj-$(CONFIG_EXT2_FS) += ext2.o
6 6
7ext2-y := balloc.o dir.o file.o fsync.o ialloc.o inode.o \ 7ext2-y := balloc.o dir.o file.o ialloc.o inode.o \
8 ioctl.o namei.o super.o symlink.o 8 ioctl.o namei.o super.o symlink.o
9 9
10ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o 10ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 2999d72153b7..003500498c22 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -720,5 +720,5 @@ const struct file_operations ext2_dir_operations = {
720#ifdef CONFIG_COMPAT 720#ifdef CONFIG_COMPAT
721 .compat_ioctl = ext2_compat_ioctl, 721 .compat_ioctl = ext2_compat_ioctl,
722#endif 722#endif
723 .fsync = ext2_sync_file, 723 .fsync = simple_fsync,
724}; 724};
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 3203042b36ef..b2bbf45039e0 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -113,9 +113,6 @@ extern int ext2_empty_dir (struct inode *);
113extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **); 113extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
114extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *); 114extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *);
115 115
116/* fsync.c */
117extern int ext2_sync_file (struct file *, struct dentry *, int);
118
119/* ialloc.c */ 116/* ialloc.c */
120extern struct inode * ext2_new_inode (struct inode *, int); 117extern struct inode * ext2_new_inode (struct inode *, int);
121extern void ext2_free_inode (struct inode *); 118extern void ext2_free_inode (struct inode *);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 45ed07122182..2b9e47dc9222 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -55,7 +55,7 @@ const struct file_operations ext2_file_operations = {
55 .mmap = generic_file_mmap, 55 .mmap = generic_file_mmap,
56 .open = generic_file_open, 56 .open = generic_file_open,
57 .release = ext2_release_file, 57 .release = ext2_release_file,
58 .fsync = ext2_sync_file, 58 .fsync = simple_fsync,
59 .splice_read = generic_file_splice_read, 59 .splice_read = generic_file_splice_read,
60 .splice_write = generic_file_splice_write, 60 .splice_write = generic_file_splice_write,
61}; 61};
@@ -72,7 +72,7 @@ const struct file_operations ext2_xip_file_operations = {
72 .mmap = xip_file_mmap, 72 .mmap = xip_file_mmap,
73 .open = generic_file_open, 73 .open = generic_file_open,
74 .release = ext2_release_file, 74 .release = ext2_release_file,
75 .fsync = ext2_sync_file, 75 .fsync = simple_fsync,
76}; 76};
77#endif 77#endif
78 78
diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c
deleted file mode 100644
index fc66c93fcb5c..000000000000
--- a/fs/ext2/fsync.c
+++ /dev/null
@@ -1,50 +0,0 @@
1/*
2 * linux/fs/ext2/fsync.c
3 *
4 * Copyright (C) 1993 Stephen Tweedie (sct@dcs.ed.ac.uk)
5 * from
6 * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
7 * Laboratoire MASI - Institut Blaise Pascal
8 * Universite Pierre et Marie Curie (Paris VI)
9 * from
10 * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds
11 *
12 * ext2fs fsync primitive
13 *
14 * Big-endian to little-endian byte-swapping/bitmaps by
15 * David S. Miller (davem@caip.rutgers.edu), 1995
16 *
17 * Removed unnecessary code duplication for little endian machines
18 * and excessive __inline__s.
19 * Andi Kleen, 1997
20 *
21 * Major simplications and cleanup - we only need to do the metadata, because
22 * we can depend on generic_block_fdatasync() to sync the data blocks.
23 */
24
25#include "ext2.h"
26#include <linux/buffer_head.h> /* for sync_mapping_buffers() */
27
28
29/*
30 * File may be NULL when we are called. Perhaps we shouldn't
31 * even pass file to fsync ?
32 */
33
34int ext2_sync_file(struct file *file, struct dentry *dentry, int datasync)
35{
36 struct inode *inode = dentry->d_inode;
37 int err;
38 int ret;
39
40 ret = sync_mapping_buffers(inode->i_mapping);
41 if (!(inode->i_state & I_DIRTY))
42 return ret;
43 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
44 return ret;
45
46 err = ext2_sync_inode(inode);
47 if (ret == 0)
48 ret = err;
49 return ret;
50}
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index acf678831103..29ed682061f6 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -41,8 +41,6 @@ MODULE_AUTHOR("Remy Card and others");
41MODULE_DESCRIPTION("Second Extended Filesystem"); 41MODULE_DESCRIPTION("Second Extended Filesystem");
42MODULE_LICENSE("GPL"); 42MODULE_LICENSE("GPL");
43 43
44static int ext2_update_inode(struct inode * inode, int do_sync);
45
46/* 44/*
47 * Test whether an inode is a fast symlink. 45 * Test whether an inode is a fast symlink.
48 */ 46 */
@@ -66,7 +64,7 @@ void ext2_delete_inode (struct inode * inode)
66 goto no_delete; 64 goto no_delete;
67 EXT2_I(inode)->i_dtime = get_seconds(); 65 EXT2_I(inode)->i_dtime = get_seconds();
68 mark_inode_dirty(inode); 66 mark_inode_dirty(inode);
69 ext2_update_inode(inode, inode_needs_sync(inode)); 67 ext2_write_inode(inode, inode_needs_sync(inode));
70 68
71 inode->i_size = 0; 69 inode->i_size = 0;
72 if (inode->i_blocks) 70 if (inode->i_blocks)
@@ -1337,7 +1335,7 @@ bad_inode:
1337 return ERR_PTR(ret); 1335 return ERR_PTR(ret);
1338} 1336}
1339 1337
1340static int ext2_update_inode(struct inode * inode, int do_sync) 1338int ext2_write_inode(struct inode *inode, int do_sync)
1341{ 1339{
1342 struct ext2_inode_info *ei = EXT2_I(inode); 1340 struct ext2_inode_info *ei = EXT2_I(inode);
1343 struct super_block *sb = inode->i_sb; 1341 struct super_block *sb = inode->i_sb;
@@ -1442,11 +1440,6 @@ static int ext2_update_inode(struct inode * inode, int do_sync)
1442 return err; 1440 return err;
1443} 1441}
1444 1442
1445int ext2_write_inode(struct inode *inode, int wait)
1446{
1447 return ext2_update_inode(inode, wait);
1448}
1449
1450int ext2_sync_inode(struct inode *inode) 1443int ext2_sync_inode(struct inode *inode)
1451{ 1444{
1452 struct writeback_control wbc = { 1445 struct writeback_control wbc = {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 5c4afe652245..458999638c3d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -42,6 +42,7 @@ static void ext2_sync_super(struct super_block *sb,
42 struct ext2_super_block *es); 42 struct ext2_super_block *es);
43static int ext2_remount (struct super_block * sb, int * flags, char * data); 43static int ext2_remount (struct super_block * sb, int * flags, char * data);
44static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); 44static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
45static int ext2_sync_fs(struct super_block *sb, int wait);
45 46
46void ext2_error (struct super_block * sb, const char * function, 47void ext2_error (struct super_block * sb, const char * function,
47 const char * fmt, ...) 48 const char * fmt, ...)
@@ -114,6 +115,11 @@ static void ext2_put_super (struct super_block * sb)
114 int i; 115 int i;
115 struct ext2_sb_info *sbi = EXT2_SB(sb); 116 struct ext2_sb_info *sbi = EXT2_SB(sb);
116 117
118 lock_kernel();
119
120 if (sb->s_dirt)
121 ext2_write_super(sb);
122
117 ext2_xattr_put_super(sb); 123 ext2_xattr_put_super(sb);
118 if (!(sb->s_flags & MS_RDONLY)) { 124 if (!(sb->s_flags & MS_RDONLY)) {
119 struct ext2_super_block *es = sbi->s_es; 125 struct ext2_super_block *es = sbi->s_es;
@@ -135,7 +141,7 @@ static void ext2_put_super (struct super_block * sb)
135 kfree(sbi->s_blockgroup_lock); 141 kfree(sbi->s_blockgroup_lock);
136 kfree(sbi); 142 kfree(sbi);
137 143
138 return; 144 unlock_kernel();
139} 145}
140 146
141static struct kmem_cache * ext2_inode_cachep; 147static struct kmem_cache * ext2_inode_cachep;
@@ -304,6 +310,7 @@ static const struct super_operations ext2_sops = {
304 .delete_inode = ext2_delete_inode, 310 .delete_inode = ext2_delete_inode,
305 .put_super = ext2_put_super, 311 .put_super = ext2_put_super,
306 .write_super = ext2_write_super, 312 .write_super = ext2_write_super,
313 .sync_fs = ext2_sync_fs,
307 .statfs = ext2_statfs, 314 .statfs = ext2_statfs,
308 .remount_fs = ext2_remount, 315 .remount_fs = ext2_remount,
309 .clear_inode = ext2_clear_inode, 316 .clear_inode = ext2_clear_inode,
@@ -1093,6 +1100,7 @@ failed_mount:
1093 brelse(bh); 1100 brelse(bh);
1094failed_sbi: 1101failed_sbi:
1095 sb->s_fs_info = NULL; 1102 sb->s_fs_info = NULL;
1103 kfree(sbi->s_blockgroup_lock);
1096 kfree(sbi); 1104 kfree(sbi);
1097 return ret; 1105 return ret;
1098} 1106}
@@ -1126,25 +1134,36 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
1126 * set s_state to EXT2_VALID_FS after some corrections. 1134 * set s_state to EXT2_VALID_FS after some corrections.
1127 */ 1135 */
1128 1136
1129void ext2_write_super (struct super_block * sb) 1137static int ext2_sync_fs(struct super_block *sb, int wait)
1130{ 1138{
1131 struct ext2_super_block * es; 1139 struct ext2_super_block *es = EXT2_SB(sb)->s_es;
1140
1132 lock_kernel(); 1141 lock_kernel();
1133 if (!(sb->s_flags & MS_RDONLY)) { 1142 if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
1134 es = EXT2_SB(sb)->s_es; 1143 ext2_debug("setting valid to 0\n");
1135 1144 es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
1136 if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { 1145 es->s_free_blocks_count =
1137 ext2_debug ("setting valid to 0\n"); 1146 cpu_to_le32(ext2_count_free_blocks(sb));
1138 es->s_state &= cpu_to_le16(~EXT2_VALID_FS); 1147 es->s_free_inodes_count =
1139 es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb)); 1148 cpu_to_le32(ext2_count_free_inodes(sb));
1140 es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb)); 1149 es->s_mtime = cpu_to_le32(get_seconds());
1141 es->s_mtime = cpu_to_le32(get_seconds()); 1150 ext2_sync_super(sb, es);
1142 ext2_sync_super(sb, es); 1151 } else {
1143 } else 1152 ext2_commit_super(sb, es);
1144 ext2_commit_super (sb, es);
1145 } 1153 }
1146 sb->s_dirt = 0; 1154 sb->s_dirt = 0;
1147 unlock_kernel(); 1155 unlock_kernel();
1156
1157 return 0;
1158}
1159
1160
1161void ext2_write_super(struct super_block *sb)
1162{
1163 if (!(sb->s_flags & MS_RDONLY))
1164 ext2_sync_fs(sb, 1);
1165 else
1166 sb->s_dirt = 0;
1148} 1167}
1149 1168
1150static int ext2_remount (struct super_block * sb, int * flags, char * data) 1169static int ext2_remount (struct super_block * sb, int * flags, char * data)
@@ -1156,6 +1175,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1156 unsigned long old_sb_flags; 1175 unsigned long old_sb_flags;
1157 int err; 1176 int err;
1158 1177
1178 lock_kernel();
1179
1159 /* Store the old options */ 1180 /* Store the old options */
1160 old_sb_flags = sb->s_flags; 1181 old_sb_flags = sb->s_flags;
1161 old_opts.s_mount_opt = sbi->s_mount_opt; 1182 old_opts.s_mount_opt = sbi->s_mount_opt;
@@ -1191,12 +1212,16 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1191 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; 1212 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
1192 sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; 1213 sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
1193 } 1214 }
1194 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 1215 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
1216 unlock_kernel();
1195 return 0; 1217 return 0;
1218 }
1196 if (*flags & MS_RDONLY) { 1219 if (*flags & MS_RDONLY) {
1197 if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || 1220 if (le16_to_cpu(es->s_state) & EXT2_VALID_FS ||
1198 !(sbi->s_mount_state & EXT2_VALID_FS)) 1221 !(sbi->s_mount_state & EXT2_VALID_FS)) {
1222 unlock_kernel();
1199 return 0; 1223 return 0;
1224 }
1200 /* 1225 /*
1201 * OK, we are remounting a valid rw partition rdonly, so set 1226 * OK, we are remounting a valid rw partition rdonly, so set
1202 * the rdonly flag and then mark the partition as valid again. 1227 * the rdonly flag and then mark the partition as valid again.
@@ -1223,12 +1248,14 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1223 sb->s_flags &= ~MS_RDONLY; 1248 sb->s_flags &= ~MS_RDONLY;
1224 } 1249 }
1225 ext2_sync_super(sb, es); 1250 ext2_sync_super(sb, es);
1251 unlock_kernel();
1226 return 0; 1252 return 0;
1227restore_opts: 1253restore_opts:
1228 sbi->s_mount_opt = old_opts.s_mount_opt; 1254 sbi->s_mount_opt = old_opts.s_mount_opt;
1229 sbi->s_resuid = old_opts.s_resuid; 1255 sbi->s_resuid = old_opts.s_resuid;
1230 sbi->s_resgid = old_opts.s_resgid; 1256 sbi->s_resgid = old_opts.s_resgid;
1231 sb->s_flags = old_sb_flags; 1257 sb->s_flags = old_sb_flags;
1258 unlock_kernel();
1232 return err; 1259 return err;
1233} 1260}
1234 1261
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 225202db8974..27967f92e820 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -649,7 +649,7 @@ do_more:
649 count = overflow; 649 count = overflow;
650 goto do_more; 650 goto do_more;
651 } 651 }
652 sb->s_dirt = 1; 652
653error_return: 653error_return:
654 brelse(bitmap_bh); 654 brelse(bitmap_bh);
655 ext3_std_error(sb, err); 655 ext3_std_error(sb, err);
@@ -1708,7 +1708,6 @@ allocated:
1708 if (!fatal) 1708 if (!fatal)
1709 fatal = err; 1709 fatal = err;
1710 1710
1711 sb->s_dirt = 1;
1712 if (fatal) 1711 if (fatal)
1713 goto out; 1712 goto out;
1714 1713
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index dd13d60d524b..b39991285136 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -181,7 +181,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
181 err = ext3_journal_dirty_metadata(handle, bitmap_bh); 181 err = ext3_journal_dirty_metadata(handle, bitmap_bh);
182 if (!fatal) 182 if (!fatal)
183 fatal = err; 183 fatal = err;
184 sb->s_dirt = 1; 184
185error_return: 185error_return:
186 brelse(bitmap_bh); 186 brelse(bitmap_bh);
187 ext3_std_error(sb, fatal); 187 ext3_std_error(sb, fatal);
@@ -537,7 +537,6 @@ got:
537 percpu_counter_dec(&sbi->s_freeinodes_counter); 537 percpu_counter_dec(&sbi->s_freeinodes_counter);
538 if (S_ISDIR(mode)) 538 if (S_ISDIR(mode))
539 percpu_counter_inc(&sbi->s_dirs_counter); 539 percpu_counter_inc(&sbi->s_dirs_counter);
540 sb->s_dirt = 1;
541 540
542 inode->i_uid = current_fsuid(); 541 inode->i_uid = current_fsuid();
543 if (test_opt (sb, GRPID)) 542 if (test_opt (sb, GRPID))
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index fcfa24361856..b0248c6d5d4c 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2960,7 +2960,6 @@ static int ext3_do_update_inode(handle_t *handle,
2960 ext3_update_dynamic_rev(sb); 2960 ext3_update_dynamic_rev(sb);
2961 EXT3_SET_RO_COMPAT_FEATURE(sb, 2961 EXT3_SET_RO_COMPAT_FEATURE(sb,
2962 EXT3_FEATURE_RO_COMPAT_LARGE_FILE); 2962 EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
2963 sb->s_dirt = 1;
2964 handle->h_sync = 1; 2963 handle->h_sync = 1;
2965 err = ext3_journal_dirty_metadata(handle, 2964 err = ext3_journal_dirty_metadata(handle,
2966 EXT3_SB(sb)->s_sbh); 2965 EXT3_SB(sb)->s_sbh);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 78fdf3836370..8a0b26340b54 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -934,7 +934,6 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
934 EXT3_INODES_PER_GROUP(sb)); 934 EXT3_INODES_PER_GROUP(sb));
935 935
936 ext3_journal_dirty_metadata(handle, sbi->s_sbh); 936 ext3_journal_dirty_metadata(handle, sbi->s_sbh);
937 sb->s_dirt = 1;
938 937
939exit_journal: 938exit_journal:
940 unlock_super(sb); 939 unlock_super(sb);
@@ -1066,7 +1065,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
1066 } 1065 }
1067 es->s_blocks_count = cpu_to_le32(o_blocks_count + add); 1066 es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
1068 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); 1067 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
1069 sb->s_dirt = 1;
1070 unlock_super(sb); 1068 unlock_super(sb);
1071 ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, 1069 ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count,
1072 o_blocks_count + add); 1070 o_blocks_count + add);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 599dbfe504c3..26aa64dee6aa 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -67,7 +67,6 @@ static const char *ext3_decode_error(struct super_block * sb, int errno,
67static int ext3_remount (struct super_block * sb, int * flags, char * data); 67static int ext3_remount (struct super_block * sb, int * flags, char * data);
68static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); 68static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
69static int ext3_unfreeze(struct super_block *sb); 69static int ext3_unfreeze(struct super_block *sb);
70static void ext3_write_super (struct super_block * sb);
71static int ext3_freeze(struct super_block *sb); 70static int ext3_freeze(struct super_block *sb);
72 71
73/* 72/*
@@ -399,6 +398,8 @@ static void ext3_put_super (struct super_block * sb)
399 struct ext3_super_block *es = sbi->s_es; 398 struct ext3_super_block *es = sbi->s_es;
400 int i, err; 399 int i, err;
401 400
401 lock_kernel();
402
402 ext3_xattr_put_super(sb); 403 ext3_xattr_put_super(sb);
403 err = journal_destroy(sbi->s_journal); 404 err = journal_destroy(sbi->s_journal);
404 sbi->s_journal = NULL; 405 sbi->s_journal = NULL;
@@ -447,7 +448,8 @@ static void ext3_put_super (struct super_block * sb)
447 sb->s_fs_info = NULL; 448 sb->s_fs_info = NULL;
448 kfree(sbi->s_blockgroup_lock); 449 kfree(sbi->s_blockgroup_lock);
449 kfree(sbi); 450 kfree(sbi);
450 return; 451
452 unlock_kernel();
451} 453}
452 454
453static struct kmem_cache *ext3_inode_cachep; 455static struct kmem_cache *ext3_inode_cachep;
@@ -761,7 +763,6 @@ static const struct super_operations ext3_sops = {
761 .dirty_inode = ext3_dirty_inode, 763 .dirty_inode = ext3_dirty_inode,
762 .delete_inode = ext3_delete_inode, 764 .delete_inode = ext3_delete_inode,
763 .put_super = ext3_put_super, 765 .put_super = ext3_put_super,
764 .write_super = ext3_write_super,
765 .sync_fs = ext3_sync_fs, 766 .sync_fs = ext3_sync_fs,
766 .freeze_fs = ext3_freeze, 767 .freeze_fs = ext3_freeze,
767 .unfreeze_fs = ext3_unfreeze, 768 .unfreeze_fs = ext3_unfreeze,
@@ -1696,7 +1697,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1696 goto failed_mount; 1697 goto failed_mount;
1697 } 1698 }
1698 1699
1699 hblock = bdev_hardsect_size(sb->s_bdev); 1700 hblock = bdev_logical_block_size(sb->s_bdev);
1700 if (sb->s_blocksize != blocksize) { 1701 if (sb->s_blocksize != blocksize) {
1701 /* 1702 /*
1702 * Make sure the blocksize for the filesystem is larger 1703 * Make sure the blocksize for the filesystem is larger
@@ -1785,7 +1786,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1785#else 1786#else
1786 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); 1787 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
1787#endif 1788#endif
1788 sb->s_dirt = 1;
1789 } 1789 }
1790 1790
1791 if (sbi->s_blocks_per_group > blocksize * 8) { 1791 if (sbi->s_blocks_per_group > blocksize * 8) {
@@ -2021,6 +2021,7 @@ failed_mount:
2021 brelse(bh); 2021 brelse(bh);
2022out_fail: 2022out_fail:
2023 sb->s_fs_info = NULL; 2023 sb->s_fs_info = NULL;
2024 kfree(sbi->s_blockgroup_lock);
2024 kfree(sbi); 2025 kfree(sbi);
2025 lock_kernel(); 2026 lock_kernel();
2026 return ret; 2027 return ret;
@@ -2119,7 +2120,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2119 } 2120 }
2120 2121
2121 blocksize = sb->s_blocksize; 2122 blocksize = sb->s_blocksize;
2122 hblock = bdev_hardsect_size(bdev); 2123 hblock = bdev_logical_block_size(bdev);
2123 if (blocksize < hblock) { 2124 if (blocksize < hblock) {
2124 printk(KERN_ERR 2125 printk(KERN_ERR
2125 "EXT3-fs: blocksize too small for journal device.\n"); 2126 "EXT3-fs: blocksize too small for journal device.\n");
@@ -2264,7 +2265,6 @@ static int ext3_load_journal(struct super_block *sb,
2264 if (journal_devnum && 2265 if (journal_devnum &&
2265 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 2266 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2266 es->s_journal_dev = cpu_to_le32(journal_devnum); 2267 es->s_journal_dev = cpu_to_le32(journal_devnum);
2267 sb->s_dirt = 1;
2268 2268
2269 /* Make sure we flush the recovery flag to disk. */ 2269 /* Make sure we flush the recovery flag to disk. */
2270 ext3_commit_super(sb, es, 1); 2270 ext3_commit_super(sb, es, 1);
@@ -2307,7 +2307,6 @@ static int ext3_create_journal(struct super_block * sb,
2307 EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL); 2307 EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL);
2308 2308
2309 es->s_journal_inum = cpu_to_le32(journal_inum); 2309 es->s_journal_inum = cpu_to_le32(journal_inum);
2310 sb->s_dirt = 1;
2311 2310
2312 /* Make sure we flush the recovery flag to disk. */ 2311 /* Make sure we flush the recovery flag to disk. */
2313 ext3_commit_super(sb, es, 1); 2312 ext3_commit_super(sb, es, 1);
@@ -2353,7 +2352,6 @@ static void ext3_mark_recovery_complete(struct super_block * sb,
2353 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && 2352 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
2354 sb->s_flags & MS_RDONLY) { 2353 sb->s_flags & MS_RDONLY) {
2355 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); 2354 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2356 sb->s_dirt = 0;
2357 ext3_commit_super(sb, es, 1); 2355 ext3_commit_super(sb, es, 1);
2358 } 2356 }
2359 unlock_super(sb); 2357 unlock_super(sb);
@@ -2412,29 +2410,14 @@ int ext3_force_commit(struct super_block *sb)
2412 return 0; 2410 return 0;
2413 2411
2414 journal = EXT3_SB(sb)->s_journal; 2412 journal = EXT3_SB(sb)->s_journal;
2415 sb->s_dirt = 0;
2416 ret = ext3_journal_force_commit(journal); 2413 ret = ext3_journal_force_commit(journal);
2417 return ret; 2414 return ret;
2418} 2415}
2419 2416
2420/*
2421 * Ext3 always journals updates to the superblock itself, so we don't
2422 * have to propagate any other updates to the superblock on disk at this
2423 * point. (We can probably nuke this function altogether, and remove
2424 * any mention to sb->s_dirt in all of fs/ext3; eventual cleanup...)
2425 */
2426static void ext3_write_super (struct super_block * sb)
2427{
2428 if (mutex_trylock(&sb->s_lock) != 0)
2429 BUG();
2430 sb->s_dirt = 0;
2431}
2432
2433static int ext3_sync_fs(struct super_block *sb, int wait) 2417static int ext3_sync_fs(struct super_block *sb, int wait)
2434{ 2418{
2435 tid_t target; 2419 tid_t target;
2436 2420
2437 sb->s_dirt = 0;
2438 if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { 2421 if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
2439 if (wait) 2422 if (wait)
2440 log_wait_commit(EXT3_SB(sb)->s_journal, target); 2423 log_wait_commit(EXT3_SB(sb)->s_journal, target);
@@ -2450,7 +2433,6 @@ static int ext3_freeze(struct super_block *sb)
2450{ 2433{
2451 int error = 0; 2434 int error = 0;
2452 journal_t *journal; 2435 journal_t *journal;
2453 sb->s_dirt = 0;
2454 2436
2455 if (!(sb->s_flags & MS_RDONLY)) { 2437 if (!(sb->s_flags & MS_RDONLY)) {
2456 journal = EXT3_SB(sb)->s_journal; 2438 journal = EXT3_SB(sb)->s_journal;
@@ -2508,7 +2490,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2508 int i; 2490 int i;
2509#endif 2491#endif
2510 2492
2493 lock_kernel();
2494
2511 /* Store the original options */ 2495 /* Store the original options */
2496 lock_super(sb);
2512 old_sb_flags = sb->s_flags; 2497 old_sb_flags = sb->s_flags;
2513 old_opts.s_mount_opt = sbi->s_mount_opt; 2498 old_opts.s_mount_opt = sbi->s_mount_opt;
2514 old_opts.s_resuid = sbi->s_resuid; 2499 old_opts.s_resuid = sbi->s_resuid;
@@ -2616,6 +2601,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2616 old_opts.s_qf_names[i] != sbi->s_qf_names[i]) 2601 old_opts.s_qf_names[i] != sbi->s_qf_names[i])
2617 kfree(old_opts.s_qf_names[i]); 2602 kfree(old_opts.s_qf_names[i]);
2618#endif 2603#endif
2604 unlock_super(sb);
2605 unlock_kernel();
2619 return 0; 2606 return 0;
2620restore_opts: 2607restore_opts:
2621 sb->s_flags = old_sb_flags; 2608 sb->s_flags = old_sb_flags;
@@ -2632,6 +2619,8 @@ restore_opts:
2632 sbi->s_qf_names[i] = old_opts.s_qf_names[i]; 2619 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
2633 } 2620 }
2634#endif 2621#endif
2622 unlock_super(sb);
2623 unlock_kernel();
2635 return err; 2624 return err;
2636} 2625}
2637 2626
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 83b7be849bd5..545e37c4b91e 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -463,7 +463,6 @@ static void ext3_xattr_update_super_block(handle_t *handle,
463 463
464 if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) { 464 if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) {
465 EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR); 465 EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR);
466 sb->s_dirt = 1;
467 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); 466 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
468 } 467 }
469} 468}
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index a8ff003a00f7..8a34710ecf40 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -5,8 +5,8 @@
5obj-$(CONFIG_EXT4_FS) += ext4.o 5obj-$(CONFIG_EXT4_FS) += ext4.o
6 6
7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ 7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
9 ext4_jbd2.o migrate.o mballoc.o 9 ext4_jbd2.o migrate.o mballoc.o block_validity.o
10 10
11ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o 11ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
12ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o 12ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 53c72ad85877..e2126d70dff5 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -19,7 +19,6 @@
19#include <linux/buffer_head.h> 19#include <linux/buffer_head.h>
20#include "ext4.h" 20#include "ext4.h"
21#include "ext4_jbd2.h" 21#include "ext4_jbd2.h"
22#include "group.h"
23#include "mballoc.h" 22#include "mballoc.h"
24 23
25/* 24/*
@@ -88,6 +87,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
88 ext4_group_t block_group, struct ext4_group_desc *gdp) 87 ext4_group_t block_group, struct ext4_group_desc *gdp)
89{ 88{
90 int bit, bit_max; 89 int bit, bit_max;
90 ext4_group_t ngroups = ext4_get_groups_count(sb);
91 unsigned free_blocks, group_blocks; 91 unsigned free_blocks, group_blocks;
92 struct ext4_sb_info *sbi = EXT4_SB(sb); 92 struct ext4_sb_info *sbi = EXT4_SB(sb);
93 93
@@ -123,7 +123,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
123 bit_max += ext4_bg_num_gdb(sb, block_group); 123 bit_max += ext4_bg_num_gdb(sb, block_group);
124 } 124 }
125 125
126 if (block_group == sbi->s_groups_count - 1) { 126 if (block_group == ngroups - 1) {
127 /* 127 /*
128 * Even though mke2fs always initialize first and last group 128 * Even though mke2fs always initialize first and last group
129 * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need 129 * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
@@ -131,7 +131,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
131 */ 131 */
132 group_blocks = ext4_blocks_count(sbi->s_es) - 132 group_blocks = ext4_blocks_count(sbi->s_es) -
133 le32_to_cpu(sbi->s_es->s_first_data_block) - 133 le32_to_cpu(sbi->s_es->s_first_data_block) -
134 (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1)); 134 (EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
135 } else { 135 } else {
136 group_blocks = EXT4_BLOCKS_PER_GROUP(sb); 136 group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
137 } 137 }
@@ -205,18 +205,18 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
205{ 205{
206 unsigned int group_desc; 206 unsigned int group_desc;
207 unsigned int offset; 207 unsigned int offset;
208 ext4_group_t ngroups = ext4_get_groups_count(sb);
208 struct ext4_group_desc *desc; 209 struct ext4_group_desc *desc;
209 struct ext4_sb_info *sbi = EXT4_SB(sb); 210 struct ext4_sb_info *sbi = EXT4_SB(sb);
210 211
211 if (block_group >= sbi->s_groups_count) { 212 if (block_group >= ngroups) {
212 ext4_error(sb, "ext4_get_group_desc", 213 ext4_error(sb, "ext4_get_group_desc",
213 "block_group >= groups_count - " 214 "block_group >= groups_count - "
214 "block_group = %u, groups_count = %u", 215 "block_group = %u, groups_count = %u",
215 block_group, sbi->s_groups_count); 216 block_group, ngroups);
216 217
217 return NULL; 218 return NULL;
218 } 219 }
219 smp_rmb();
220 220
221 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); 221 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
222 offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); 222 offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
@@ -326,16 +326,16 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
326 unlock_buffer(bh); 326 unlock_buffer(bh);
327 return bh; 327 return bh;
328 } 328 }
329 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 329 ext4_lock_group(sb, block_group);
330 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 330 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
331 ext4_init_block_bitmap(sb, bh, block_group, desc); 331 ext4_init_block_bitmap(sb, bh, block_group, desc);
332 set_bitmap_uptodate(bh); 332 set_bitmap_uptodate(bh);
333 set_buffer_uptodate(bh); 333 set_buffer_uptodate(bh);
334 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 334 ext4_unlock_group(sb, block_group);
335 unlock_buffer(bh); 335 unlock_buffer(bh);
336 return bh; 336 return bh;
337 } 337 }
338 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 338 ext4_unlock_group(sb, block_group);
339 if (buffer_uptodate(bh)) { 339 if (buffer_uptodate(bh)) {
340 /* 340 /*
341 * if not uninit if bh is uptodate, 341 * if not uninit if bh is uptodate,
@@ -451,7 +451,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
451 down_write(&grp->alloc_sem); 451 down_write(&grp->alloc_sem);
452 for (i = 0, blocks_freed = 0; i < count; i++) { 452 for (i = 0, blocks_freed = 0; i < count; i++) {
453 BUFFER_TRACE(bitmap_bh, "clear bit"); 453 BUFFER_TRACE(bitmap_bh, "clear bit");
454 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), 454 if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
455 bit + i, bitmap_bh->b_data)) { 455 bit + i, bitmap_bh->b_data)) {
456 ext4_error(sb, __func__, 456 ext4_error(sb, __func__,
457 "bit already cleared for block %llu", 457 "bit already cleared for block %llu",
@@ -461,11 +461,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
461 blocks_freed++; 461 blocks_freed++;
462 } 462 }
463 } 463 }
464 spin_lock(sb_bgl_lock(sbi, block_group)); 464 ext4_lock_group(sb, block_group);
465 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); 465 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
466 ext4_free_blks_set(sb, desc, blk_free_count); 466 ext4_free_blks_set(sb, desc, blk_free_count);
467 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); 467 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
468 spin_unlock(sb_bgl_lock(sbi, block_group)); 468 ext4_unlock_group(sb, block_group);
469 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); 469 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
470 470
471 if (sbi->s_log_groups_per_flex) { 471 if (sbi->s_log_groups_per_flex) {
@@ -665,7 +665,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
665 ext4_fsblk_t desc_count; 665 ext4_fsblk_t desc_count;
666 struct ext4_group_desc *gdp; 666 struct ext4_group_desc *gdp;
667 ext4_group_t i; 667 ext4_group_t i;
668 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 668 ext4_group_t ngroups = ext4_get_groups_count(sb);
669#ifdef EXT4FS_DEBUG 669#ifdef EXT4FS_DEBUG
670 struct ext4_super_block *es; 670 struct ext4_super_block *es;
671 ext4_fsblk_t bitmap_count; 671 ext4_fsblk_t bitmap_count;
@@ -677,7 +677,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
677 bitmap_count = 0; 677 bitmap_count = 0;
678 gdp = NULL; 678 gdp = NULL;
679 679
680 smp_rmb();
681 for (i = 0; i < ngroups; i++) { 680 for (i = 0; i < ngroups; i++) {
682 gdp = ext4_get_group_desc(sb, i, NULL); 681 gdp = ext4_get_group_desc(sb, i, NULL);
683 if (!gdp) 682 if (!gdp)
@@ -700,7 +699,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
700 return bitmap_count; 699 return bitmap_count;
701#else 700#else
702 desc_count = 0; 701 desc_count = 0;
703 smp_rmb();
704 for (i = 0; i < ngroups; i++) { 702 for (i = 0; i < ngroups; i++) {
705 gdp = ext4_get_group_desc(sb, i, NULL); 703 gdp = ext4_get_group_desc(sb, i, NULL);
706 if (!gdp) 704 if (!gdp)
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
new file mode 100644
index 000000000000..50784ef07563
--- /dev/null
+++ b/fs/ext4/block_validity.c
@@ -0,0 +1,244 @@
1/*
2 * linux/fs/ext4/block_validity.c
3 *
4 * Copyright (C) 2009
5 * Theodore Ts'o (tytso@mit.edu)
6 *
7 * Track which blocks in the filesystem are metadata blocks that
8 * should never be used as data blocks by files or directories.
9 */
10
11#include <linux/time.h>
12#include <linux/fs.h>
13#include <linux/namei.h>
14#include <linux/quotaops.h>
15#include <linux/buffer_head.h>
16#include <linux/module.h>
17#include <linux/swap.h>
18#include <linux/pagemap.h>
19#include <linux/version.h>
20#include <linux/blkdev.h>
21#include <linux/mutex.h>
22#include "ext4.h"
23
24struct ext4_system_zone {
25 struct rb_node node;
26 ext4_fsblk_t start_blk;
27 unsigned int count;
28};
29
30static struct kmem_cache *ext4_system_zone_cachep;
31
32int __init init_ext4_system_zone(void)
33{
34 ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone,
35 SLAB_RECLAIM_ACCOUNT);
36 if (ext4_system_zone_cachep == NULL)
37 return -ENOMEM;
38 return 0;
39}
40
41void exit_ext4_system_zone(void)
42{
43 kmem_cache_destroy(ext4_system_zone_cachep);
44}
45
46static inline int can_merge(struct ext4_system_zone *entry1,
47 struct ext4_system_zone *entry2)
48{
49 if ((entry1->start_blk + entry1->count) == entry2->start_blk)
50 return 1;
51 return 0;
52}
53
54/*
55 * Mark a range of blocks as belonging to the "system zone" --- that
56 * is, filesystem metadata blocks which should never be used by
57 * inodes.
58 */
59static int add_system_zone(struct ext4_sb_info *sbi,
60 ext4_fsblk_t start_blk,
61 unsigned int count)
62{
63 struct ext4_system_zone *new_entry = NULL, *entry;
64 struct rb_node **n = &sbi->system_blks.rb_node, *node;
65 struct rb_node *parent = NULL, *new_node = NULL;
66
67 while (*n) {
68 parent = *n;
69 entry = rb_entry(parent, struct ext4_system_zone, node);
70 if (start_blk < entry->start_blk)
71 n = &(*n)->rb_left;
72 else if (start_blk >= (entry->start_blk + entry->count))
73 n = &(*n)->rb_right;
74 else {
75 if (start_blk + count > (entry->start_blk +
76 entry->count))
77 entry->count = (start_blk + count -
78 entry->start_blk);
79 new_node = *n;
80 new_entry = rb_entry(new_node, struct ext4_system_zone,
81 node);
82 break;
83 }
84 }
85
86 if (!new_entry) {
87 new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
88 GFP_KERNEL);
89 if (!new_entry)
90 return -ENOMEM;
91 new_entry->start_blk = start_blk;
92 new_entry->count = count;
93 new_node = &new_entry->node;
94
95 rb_link_node(new_node, parent, n);
96 rb_insert_color(new_node, &sbi->system_blks);
97 }
98
99 /* Can we merge to the left? */
100 node = rb_prev(new_node);
101 if (node) {
102 entry = rb_entry(node, struct ext4_system_zone, node);
103 if (can_merge(entry, new_entry)) {
104 new_entry->start_blk = entry->start_blk;
105 new_entry->count += entry->count;
106 rb_erase(node, &sbi->system_blks);
107 kmem_cache_free(ext4_system_zone_cachep, entry);
108 }
109 }
110
111 /* Can we merge to the right? */
112 node = rb_next(new_node);
113 if (node) {
114 entry = rb_entry(node, struct ext4_system_zone, node);
115 if (can_merge(new_entry, entry)) {
116 new_entry->count += entry->count;
117 rb_erase(node, &sbi->system_blks);
118 kmem_cache_free(ext4_system_zone_cachep, entry);
119 }
120 }
121 return 0;
122}
123
124static void debug_print_tree(struct ext4_sb_info *sbi)
125{
126 struct rb_node *node;
127 struct ext4_system_zone *entry;
128 int first = 1;
129
130 printk(KERN_INFO "System zones: ");
131 node = rb_first(&sbi->system_blks);
132 while (node) {
133 entry = rb_entry(node, struct ext4_system_zone, node);
134 printk("%s%llu-%llu", first ? "" : ", ",
135 entry->start_blk, entry->start_blk + entry->count - 1);
136 first = 0;
137 node = rb_next(node);
138 }
139 printk("\n");
140}
141
142int ext4_setup_system_zone(struct super_block *sb)
143{
144 ext4_group_t ngroups = ext4_get_groups_count(sb);
145 struct ext4_sb_info *sbi = EXT4_SB(sb);
146 struct ext4_group_desc *gdp;
147 ext4_group_t i;
148 int flex_size = ext4_flex_bg_size(sbi);
149 int ret;
150
151 if (!test_opt(sb, BLOCK_VALIDITY)) {
152 if (EXT4_SB(sb)->system_blks.rb_node)
153 ext4_release_system_zone(sb);
154 return 0;
155 }
156 if (EXT4_SB(sb)->system_blks.rb_node)
157 return 0;
158
159 for (i=0; i < ngroups; i++) {
160 if (ext4_bg_has_super(sb, i) &&
161 ((i < 5) || ((i % flex_size) == 0)))
162 add_system_zone(sbi, ext4_group_first_block_no(sb, i),
163 sbi->s_gdb_count + 1);
164 gdp = ext4_get_group_desc(sb, i, NULL);
165 ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
166 if (ret)
167 return ret;
168 ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1);
169 if (ret)
170 return ret;
171 ret = add_system_zone(sbi, ext4_inode_table(sb, gdp),
172 sbi->s_itb_per_group);
173 if (ret)
174 return ret;
175 }
176
177 if (test_opt(sb, DEBUG))
178 debug_print_tree(EXT4_SB(sb));
179 return 0;
180}
181
182/* Called when the filesystem is unmounted */
183void ext4_release_system_zone(struct super_block *sb)
184{
185 struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node;
186 struct rb_node *parent;
187 struct ext4_system_zone *entry;
188
189 while (n) {
190 /* Do the node's children first */
191 if (n->rb_left) {
192 n = n->rb_left;
193 continue;
194 }
195 if (n->rb_right) {
196 n = n->rb_right;
197 continue;
198 }
199 /*
200 * The node has no children; free it, and then zero
201 * out parent's link to it. Finally go to the
202 * beginning of the loop and try to free the parent
203 * node.
204 */
205 parent = rb_parent(n);
206 entry = rb_entry(n, struct ext4_system_zone, node);
207 kmem_cache_free(ext4_system_zone_cachep, entry);
208 if (!parent)
209 EXT4_SB(sb)->system_blks.rb_node = NULL;
210 else if (parent->rb_left == n)
211 parent->rb_left = NULL;
212 else if (parent->rb_right == n)
213 parent->rb_right = NULL;
214 n = parent;
215 }
216 EXT4_SB(sb)->system_blks.rb_node = NULL;
217}
218
219/*
220 * Returns 1 if the passed-in block region (start_blk,
221 * start_blk+count) is valid; 0 if some part of the block region
222 * overlaps with filesystem metadata blocks.
223 */
224int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
225 unsigned int count)
226{
227 struct ext4_system_zone *entry;
228 struct rb_node *n = sbi->system_blks.rb_node;
229
230 if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
231 (start_blk + count > ext4_blocks_count(sbi->s_es)))
232 return 0;
233 while (n) {
234 entry = rb_entry(n, struct ext4_system_zone, node);
235 if (start_blk + count - 1 < entry->start_blk)
236 n = n->rb_left;
237 else if (start_blk >= (entry->start_blk + entry->count))
238 n = n->rb_right;
239 else
240 return 0;
241 }
242 return 1;
243}
244
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index b64789929a65..9dc93168e262 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -131,8 +131,7 @@ static int ext4_readdir(struct file *filp,
131 struct buffer_head *bh = NULL; 131 struct buffer_head *bh = NULL;
132 132
133 map_bh.b_state = 0; 133 map_bh.b_state = 0;
134 err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 134 err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0);
135 0, 0, 0);
136 if (err > 0) { 135 if (err > 0) {
137 pgoff_t index = map_bh.b_blocknr >> 136 pgoff_t index = map_bh.b_blocknr >>
138 (PAGE_CACHE_SHIFT - inode->i_blkbits); 137 (PAGE_CACHE_SHIFT - inode->i_blkbits);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d0f15ef56de1..cc7d5edc38c9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -21,7 +21,14 @@
21#include <linux/magic.h> 21#include <linux/magic.h>
22#include <linux/jbd2.h> 22#include <linux/jbd2.h>
23#include <linux/quota.h> 23#include <linux/quota.h>
24#include "ext4_i.h" 24#include <linux/rwsem.h>
25#include <linux/rbtree.h>
26#include <linux/seqlock.h>
27#include <linux/mutex.h>
28#include <linux/timer.h>
29#include <linux/wait.h>
30#include <linux/blockgroup_lock.h>
31#include <linux/percpu_counter.h>
25 32
26/* 33/*
27 * The fourth extended filesystem constants/structures 34 * The fourth extended filesystem constants/structures
@@ -46,6 +53,19 @@
46#define ext4_debug(f, a...) do {} while (0) 53#define ext4_debug(f, a...) do {} while (0)
47#endif 54#endif
48 55
56/* data type for block offset of block group */
57typedef int ext4_grpblk_t;
58
59/* data type for filesystem-wide blocks number */
60typedef unsigned long long ext4_fsblk_t;
61
62/* data type for file logical block number */
63typedef __u32 ext4_lblk_t;
64
65/* data type for block group number */
66typedef unsigned int ext4_group_t;
67
68
49/* prefer goal again. length */ 69/* prefer goal again. length */
50#define EXT4_MB_HINT_MERGE 1 70#define EXT4_MB_HINT_MERGE 1
51/* blocks already reserved */ 71/* blocks already reserved */
@@ -179,9 +199,6 @@ struct flex_groups {
179#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ 199#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
180#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ 200#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
181 201
182#ifdef __KERNEL__
183#include "ext4_sb.h"
184#endif
185/* 202/*
186 * Macro-instructions used to manage group descriptors 203 * Macro-instructions used to manage group descriptors
187 */ 204 */
@@ -297,10 +314,23 @@ struct ext4_new_group_data {
297}; 314};
298 315
299/* 316/*
300 * Following is used by preallocation code to tell get_blocks() that we 317 * Flags used by ext4_get_blocks()
301 * want uninitialzed extents.
302 */ 318 */
303#define EXT4_CREATE_UNINITIALIZED_EXT 2 319 /* Allocate any needed blocks and/or convert an unitialized
320 extent to be an initialized ext4 */
321#define EXT4_GET_BLOCKS_CREATE 0x0001
322 /* Request the creation of an unitialized extent */
323#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002
324#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
325 EXT4_GET_BLOCKS_CREATE)
326 /* Caller is from the delayed allocation writeout path,
327 so set the magic i_delalloc_reserve_flag after taking the
328 inode allocation semaphore for */
329#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
330 /* Call ext4_da_update_reserve_space() after successfully
331 allocating the blocks */
332#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
333
304 334
305/* 335/*
306 * ioctl commands 336 * ioctl commands
@@ -516,6 +546,110 @@ do { \
516#endif /* defined(__KERNEL__) || defined(__linux__) */ 546#endif /* defined(__KERNEL__) || defined(__linux__) */
517 547
518/* 548/*
549 * storage for cached extent
550 */
551struct ext4_ext_cache {
552 ext4_fsblk_t ec_start;
553 ext4_lblk_t ec_block;
554 __u32 ec_len; /* must be 32bit to return holes */
555 __u32 ec_type;
556};
557
558/*
559 * fourth extended file system inode data in memory
560 */
561struct ext4_inode_info {
562 __le32 i_data[15]; /* unconverted */
563 __u32 i_flags;
564 ext4_fsblk_t i_file_acl;
565 __u32 i_dtime;
566
567 /*
568 * i_block_group is the number of the block group which contains
569 * this file's inode. Constant across the lifetime of the inode,
570 * it is ued for making block allocation decisions - we try to
571 * place a file's data blocks near its inode block, and new inodes
572 * near to their parent directory's inode.
573 */
574 ext4_group_t i_block_group;
575 __u32 i_state; /* Dynamic state flags for ext4 */
576
577 ext4_lblk_t i_dir_start_lookup;
578#ifdef CONFIG_EXT4_FS_XATTR
579 /*
580 * Extended attributes can be read independently of the main file
581 * data. Taking i_mutex even when reading would cause contention
582 * between readers of EAs and writers of regular file data, so
583 * instead we synchronize on xattr_sem when reading or changing
584 * EAs.
585 */
586 struct rw_semaphore xattr_sem;
587#endif
588#ifdef CONFIG_EXT4_FS_POSIX_ACL
589 struct posix_acl *i_acl;
590 struct posix_acl *i_default_acl;
591#endif
592
593 struct list_head i_orphan; /* unlinked but open inodes */
594
595 /*
596 * i_disksize keeps track of what the inode size is ON DISK, not
597 * in memory. During truncate, i_size is set to the new size by
598 * the VFS prior to calling ext4_truncate(), but the filesystem won't
599 * set i_disksize to 0 until the truncate is actually under way.
600 *
601 * The intent is that i_disksize always represents the blocks which
602 * are used by this file. This allows recovery to restart truncate
603 * on orphans if we crash during truncate. We actually write i_disksize
604 * into the on-disk inode when writing inodes out, instead of i_size.
605 *
606 * The only time when i_disksize and i_size may be different is when
607 * a truncate is in progress. The only things which change i_disksize
608 * are ext4_get_block (growth) and ext4_truncate (shrinkth).
609 */
610 loff_t i_disksize;
611
612 /*
613 * i_data_sem is for serialising ext4_truncate() against
614 * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's
615 * data tree are chopped off during truncate. We can't do that in
616 * ext4 because whenever we perform intermediate commits during
617 * truncate, the inode and all the metadata blocks *must* be in a
618 * consistent state which allows truncation of the orphans to restart
619 * during recovery. Hence we must fix the get_block-vs-truncate race
620 * by other means, so we have i_data_sem.
621 */
622 struct rw_semaphore i_data_sem;
623 struct inode vfs_inode;
624 struct jbd2_inode jinode;
625
626 struct ext4_ext_cache i_cached_extent;
627 /*
628 * File creation time. Its function is same as that of
629 * struct timespec i_{a,c,m}time in the generic inode.
630 */
631 struct timespec i_crtime;
632
633 /* mballoc */
634 struct list_head i_prealloc_list;
635 spinlock_t i_prealloc_lock;
636
637 /* ialloc */
638 ext4_group_t i_last_alloc_group;
639
640 /* allocation reservation info for delalloc */
641 unsigned int i_reserved_data_blocks;
642 unsigned int i_reserved_meta_blocks;
643 unsigned int i_allocated_meta_blocks;
644 unsigned short i_delalloc_reserved_flag;
645
646 /* on-disk additional length */
647 __u16 i_extra_isize;
648
649 spinlock_t i_block_reservation_lock;
650};
651
652/*
519 * File system states 653 * File system states
520 */ 654 */
521#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ 655#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */
@@ -560,6 +694,7 @@ do { \
560#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 694#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
561#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 695#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
562#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ 696#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
697#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
563 698
564/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ 699/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
565#ifndef _LINUX_EXT2_FS_H 700#ifndef _LINUX_EXT2_FS_H
@@ -689,6 +824,137 @@ struct ext4_super_block {
689}; 824};
690 825
691#ifdef __KERNEL__ 826#ifdef __KERNEL__
827/*
828 * fourth extended-fs super-block data in memory
829 */
830struct ext4_sb_info {
831 unsigned long s_desc_size; /* Size of a group descriptor in bytes */
832 unsigned long s_inodes_per_block;/* Number of inodes per block */
833 unsigned long s_blocks_per_group;/* Number of blocks in a group */
834 unsigned long s_inodes_per_group;/* Number of inodes in a group */
835 unsigned long s_itb_per_group; /* Number of inode table blocks per group */
836 unsigned long s_gdb_count; /* Number of group descriptor blocks */
837 unsigned long s_desc_per_block; /* Number of group descriptors per block */
838 ext4_group_t s_groups_count; /* Number of groups in the fs */
839 unsigned long s_overhead_last; /* Last calculated overhead */
840 unsigned long s_blocks_last; /* Last seen block count */
841 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
842 struct buffer_head * s_sbh; /* Buffer containing the super block */
843 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
844 struct buffer_head **s_group_desc;
845 unsigned long s_mount_opt;
846 ext4_fsblk_t s_sb_block;
847 uid_t s_resuid;
848 gid_t s_resgid;
849 unsigned short s_mount_state;
850 unsigned short s_pad;
851 int s_addr_per_block_bits;
852 int s_desc_per_block_bits;
853 int s_inode_size;
854 int s_first_ino;
855 unsigned int s_inode_readahead_blks;
856 spinlock_t s_next_gen_lock;
857 u32 s_next_generation;
858 u32 s_hash_seed[4];
859 int s_def_hash_version;
860 int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
861 struct percpu_counter s_freeblocks_counter;
862 struct percpu_counter s_freeinodes_counter;
863 struct percpu_counter s_dirs_counter;
864 struct percpu_counter s_dirtyblocks_counter;
865 struct blockgroup_lock *s_blockgroup_lock;
866 struct proc_dir_entry *s_proc;
867 struct kobject s_kobj;
868 struct completion s_kobj_unregister;
869
870 /* Journaling */
871 struct inode *s_journal_inode;
872 struct journal_s *s_journal;
873 struct list_head s_orphan;
874 struct mutex s_orphan_lock;
875 struct mutex s_resize_lock;
876 unsigned long s_commit_interval;
877 u32 s_max_batch_time;
878 u32 s_min_batch_time;
879 struct block_device *journal_bdev;
880#ifdef CONFIG_JBD2_DEBUG
881 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
882 wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
883#endif
884#ifdef CONFIG_QUOTA
885 char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
886 int s_jquota_fmt; /* Format of quota to use */
887#endif
888 unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
889 struct rb_root system_blks;
890
891#ifdef EXTENTS_STATS
892 /* ext4 extents stats */
893 unsigned long s_ext_min;
894 unsigned long s_ext_max;
895 unsigned long s_depth_max;
896 spinlock_t s_ext_stats_lock;
897 unsigned long s_ext_blocks;
898 unsigned long s_ext_extents;
899#endif
900
901 /* for buddy allocator */
902 struct ext4_group_info ***s_group_info;
903 struct inode *s_buddy_cache;
904 long s_blocks_reserved;
905 spinlock_t s_reserve_lock;
906 spinlock_t s_md_lock;
907 tid_t s_last_transaction;
908 unsigned short *s_mb_offsets;
909 unsigned int *s_mb_maxs;
910
911 /* tunables */
912 unsigned long s_stripe;
913 unsigned int s_mb_stream_request;
914 unsigned int s_mb_max_to_scan;
915 unsigned int s_mb_min_to_scan;
916 unsigned int s_mb_stats;
917 unsigned int s_mb_order2_reqs;
918 unsigned int s_mb_group_prealloc;
919 /* where last allocation was done - for stream allocation */
920 unsigned long s_mb_last_group;
921 unsigned long s_mb_last_start;
922
923 /* history to debug policy */
924 struct ext4_mb_history *s_mb_history;
925 int s_mb_history_cur;
926 int s_mb_history_max;
927 int s_mb_history_num;
928 spinlock_t s_mb_history_lock;
929 int s_mb_history_filter;
930
931 /* stats for buddy allocator */
932 spinlock_t s_mb_pa_lock;
933 atomic_t s_bal_reqs; /* number of reqs with len > 1 */
934 atomic_t s_bal_success; /* we found long enough chunks */
935 atomic_t s_bal_allocated; /* in blocks */
936 atomic_t s_bal_ex_scanned; /* total extents scanned */
937 atomic_t s_bal_goals; /* goal hits */
938 atomic_t s_bal_breaks; /* too long searches */
939 atomic_t s_bal_2orders; /* 2^order hits */
940 spinlock_t s_bal_lock;
941 unsigned long s_mb_buddies_generated;
942 unsigned long long s_mb_generation_time;
943 atomic_t s_mb_lost_chunks;
944 atomic_t s_mb_preallocated;
945 atomic_t s_mb_discarded;
946
947 /* locality groups */
948 struct ext4_locality_group *s_locality_groups;
949
950 /* for write statistics */
951 unsigned long s_sectors_written_start;
952 u64 s_kbytes_written;
953
954 unsigned int s_log_groups_per_flex;
955 struct flex_groups *s_flex_groups;
956};
957
692static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 958static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
693{ 959{
694 return sb->s_fs_info; 960 return sb->s_fs_info;
@@ -704,7 +970,6 @@ static inline struct timespec ext4_current_time(struct inode *inode)
704 current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; 970 current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
705} 971}
706 972
707
708static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) 973static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
709{ 974{
710 return ino == EXT4_ROOT_INO || 975 return ino == EXT4_ROOT_INO ||
@@ -1014,6 +1279,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
1014 ext4_group_t block_group, 1279 ext4_group_t block_group,
1015 struct buffer_head ** bh); 1280 struct buffer_head ** bh);
1016extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); 1281extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
1282struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
1283 ext4_group_t block_group);
1284extern unsigned ext4_init_block_bitmap(struct super_block *sb,
1285 struct buffer_head *bh,
1286 ext4_group_t group,
1287 struct ext4_group_desc *desc);
1288#define ext4_free_blocks_after_init(sb, group, desc) \
1289 ext4_init_block_bitmap(sb, NULL, group, desc)
1017 1290
1018/* dir.c */ 1291/* dir.c */
1019extern int ext4_check_dir_entry(const char *, struct inode *, 1292extern int ext4_check_dir_entry(const char *, struct inode *,
@@ -1038,6 +1311,11 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
1038extern unsigned long ext4_count_free_inodes(struct super_block *); 1311extern unsigned long ext4_count_free_inodes(struct super_block *);
1039extern unsigned long ext4_count_dirs(struct super_block *); 1312extern unsigned long ext4_count_dirs(struct super_block *);
1040extern void ext4_check_inodes_bitmap(struct super_block *); 1313extern void ext4_check_inodes_bitmap(struct super_block *);
1314extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
1315 struct buffer_head *bh,
1316 ext4_group_t group,
1317 struct ext4_group_desc *desc);
1318extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
1041 1319
1042/* mballoc.c */ 1320/* mballoc.c */
1043extern long ext4_mb_stats; 1321extern long ext4_mb_stats;
@@ -1123,6 +1401,8 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...)
1123 __attribute__ ((format (printf, 3, 4))); 1401 __attribute__ ((format (printf, 3, 4)));
1124extern void ext4_warning(struct super_block *, const char *, const char *, ...) 1402extern void ext4_warning(struct super_block *, const char *, const char *, ...)
1125 __attribute__ ((format (printf, 3, 4))); 1403 __attribute__ ((format (printf, 3, 4)));
1404extern void ext4_msg(struct super_block *, const char *, const char *, ...)
1405 __attribute__ ((format (printf, 3, 4)));
1126extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, 1406extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
1127 const char *, const char *, ...) 1407 const char *, const char *, ...)
1128 __attribute__ ((format (printf, 4, 5))); 1408 __attribute__ ((format (printf, 4, 5)));
@@ -1161,6 +1441,10 @@ extern void ext4_used_dirs_set(struct super_block *sb,
1161 struct ext4_group_desc *bg, __u32 count); 1441 struct ext4_group_desc *bg, __u32 count);
1162extern void ext4_itable_unused_set(struct super_block *sb, 1442extern void ext4_itable_unused_set(struct super_block *sb,
1163 struct ext4_group_desc *bg, __u32 count); 1443 struct ext4_group_desc *bg, __u32 count);
1444extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
1445 struct ext4_group_desc *gdp);
1446extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
1447 struct ext4_group_desc *gdp);
1164 1448
1165static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) 1449static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
1166{ 1450{
@@ -1228,6 +1512,18 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
1228 return grp_info[indexv][indexh]; 1512 return grp_info[indexv][indexh];
1229} 1513}
1230 1514
1515/*
1516 * Reading s_groups_count requires using smp_rmb() afterwards. See
1517 * the locking protocol documented in the comments of ext4_group_add()
1518 * in resize.c
1519 */
1520static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
1521{
1522 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
1523
1524 smp_rmb();
1525 return ngroups;
1526}
1231 1527
1232static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, 1528static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
1233 ext4_group_t block_group) 1529 ext4_group_t block_group)
@@ -1283,33 +1579,25 @@ struct ext4_group_info {
1283}; 1579};
1284 1580
1285#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 1581#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
1286#define EXT4_GROUP_INFO_LOCKED_BIT 1
1287 1582
1288#define EXT4_MB_GRP_NEED_INIT(grp) \ 1583#define EXT4_MB_GRP_NEED_INIT(grp) \
1289 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) 1584 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
1290 1585
1291static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) 1586static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
1587 ext4_group_t group)
1292{ 1588{
1293 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); 1589 return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
1294
1295 bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
1296} 1590}
1297 1591
1298static inline void ext4_unlock_group(struct super_block *sb, 1592static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
1299 ext4_group_t group)
1300{ 1593{
1301 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); 1594 spin_lock(ext4_group_lock_ptr(sb, group));
1302
1303 bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
1304} 1595}
1305 1596
1306static inline int ext4_is_group_locked(struct super_block *sb, 1597static inline void ext4_unlock_group(struct super_block *sb,
1307 ext4_group_t group) 1598 ext4_group_t group)
1308{ 1599{
1309 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); 1600 spin_unlock(ext4_group_lock_ptr(sb, group));
1310
1311 return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
1312 &(grinfo->bb_state));
1313} 1601}
1314 1602
1315/* 1603/*
@@ -1326,11 +1614,21 @@ extern const struct file_operations ext4_file_operations;
1326/* namei.c */ 1614/* namei.c */
1327extern const struct inode_operations ext4_dir_inode_operations; 1615extern const struct inode_operations ext4_dir_inode_operations;
1328extern const struct inode_operations ext4_special_inode_operations; 1616extern const struct inode_operations ext4_special_inode_operations;
1617extern struct dentry *ext4_get_parent(struct dentry *child);
1329 1618
1330/* symlink.c */ 1619/* symlink.c */
1331extern const struct inode_operations ext4_symlink_inode_operations; 1620extern const struct inode_operations ext4_symlink_inode_operations;
1332extern const struct inode_operations ext4_fast_symlink_inode_operations; 1621extern const struct inode_operations ext4_fast_symlink_inode_operations;
1333 1622
1623/* block_validity */
1624extern void ext4_release_system_zone(struct super_block *sb);
1625extern int ext4_setup_system_zone(struct super_block *sb);
1626extern int __init init_ext4_system_zone(void);
1627extern void exit_ext4_system_zone(void);
1628extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
1629 ext4_fsblk_t start_blk,
1630 unsigned int count);
1631
1334/* extents.c */ 1632/* extents.c */
1335extern int ext4_ext_tree_init(handle_t *handle, struct inode *); 1633extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
1336extern int ext4_ext_writepage_trans_blocks(struct inode *, int); 1634extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
@@ -1338,17 +1636,15 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1338 int chunk); 1636 int chunk);
1339extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 1637extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1340 ext4_lblk_t iblock, unsigned int max_blocks, 1638 ext4_lblk_t iblock, unsigned int max_blocks,
1341 struct buffer_head *bh_result, 1639 struct buffer_head *bh_result, int flags);
1342 int create, int extend_disksize);
1343extern void ext4_ext_truncate(struct inode *); 1640extern void ext4_ext_truncate(struct inode *);
1344extern void ext4_ext_init(struct super_block *); 1641extern void ext4_ext_init(struct super_block *);
1345extern void ext4_ext_release(struct super_block *); 1642extern void ext4_ext_release(struct super_block *);
1346extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1643extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1347 loff_t len); 1644 loff_t len);
1348extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, 1645extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1349 sector_t block, unsigned int max_blocks, 1646 sector_t block, unsigned int max_blocks,
1350 struct buffer_head *bh, int create, 1647 struct buffer_head *bh, int flags);
1351 int extend_disksize, int flag);
1352extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 1648extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1353 __u64 start, __u64 len); 1649 __u64 start, __u64 len);
1354 1650
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
deleted file mode 100644
index 4ce2187123aa..000000000000
--- a/fs/ext4/ext4_i.h
+++ /dev/null
@@ -1,140 +0,0 @@
1/*
2 * ext4_i.h
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/include/linux/minix_fs_i.h
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 */
15
16#ifndef _EXT4_I
17#define _EXT4_I
18
19#include <linux/rwsem.h>
20#include <linux/rbtree.h>
21#include <linux/seqlock.h>
22#include <linux/mutex.h>
23
24/* data type for block offset of block group */
25typedef int ext4_grpblk_t;
26
27/* data type for filesystem-wide blocks number */
28typedef unsigned long long ext4_fsblk_t;
29
30/* data type for file logical block number */
31typedef __u32 ext4_lblk_t;
32
33/* data type for block group number */
34typedef unsigned int ext4_group_t;
35
36/*
37 * storage for cached extent
38 */
39struct ext4_ext_cache {
40 ext4_fsblk_t ec_start;
41 ext4_lblk_t ec_block;
42 __u32 ec_len; /* must be 32bit to return holes */
43 __u32 ec_type;
44};
45
46/*
47 * fourth extended file system inode data in memory
48 */
49struct ext4_inode_info {
50 __le32 i_data[15]; /* unconverted */
51 __u32 i_flags;
52 ext4_fsblk_t i_file_acl;
53 __u32 i_dtime;
54
55 /*
56 * i_block_group is the number of the block group which contains
57 * this file's inode. Constant across the lifetime of the inode,
58 * it is ued for making block allocation decisions - we try to
59 * place a file's data blocks near its inode block, and new inodes
60 * near to their parent directory's inode.
61 */
62 ext4_group_t i_block_group;
63 __u32 i_state; /* Dynamic state flags for ext4 */
64
65 ext4_lblk_t i_dir_start_lookup;
66#ifdef CONFIG_EXT4_FS_XATTR
67 /*
68 * Extended attributes can be read independently of the main file
69 * data. Taking i_mutex even when reading would cause contention
70 * between readers of EAs and writers of regular file data, so
71 * instead we synchronize on xattr_sem when reading or changing
72 * EAs.
73 */
74 struct rw_semaphore xattr_sem;
75#endif
76#ifdef CONFIG_EXT4_FS_POSIX_ACL
77 struct posix_acl *i_acl;
78 struct posix_acl *i_default_acl;
79#endif
80
81 struct list_head i_orphan; /* unlinked but open inodes */
82
83 /*
84 * i_disksize keeps track of what the inode size is ON DISK, not
85 * in memory. During truncate, i_size is set to the new size by
86 * the VFS prior to calling ext4_truncate(), but the filesystem won't
87 * set i_disksize to 0 until the truncate is actually under way.
88 *
89 * The intent is that i_disksize always represents the blocks which
90 * are used by this file. This allows recovery to restart truncate
91 * on orphans if we crash during truncate. We actually write i_disksize
92 * into the on-disk inode when writing inodes out, instead of i_size.
93 *
94 * The only time when i_disksize and i_size may be different is when
95 * a truncate is in progress. The only things which change i_disksize
96 * are ext4_get_block (growth) and ext4_truncate (shrinkth).
97 */
98 loff_t i_disksize;
99
100 /*
101 * i_data_sem is for serialising ext4_truncate() against
102 * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's
103 * data tree are chopped off during truncate. We can't do that in
104 * ext4 because whenever we perform intermediate commits during
105 * truncate, the inode and all the metadata blocks *must* be in a
106 * consistent state which allows truncation of the orphans to restart
107 * during recovery. Hence we must fix the get_block-vs-truncate race
108 * by other means, so we have i_data_sem.
109 */
110 struct rw_semaphore i_data_sem;
111 struct inode vfs_inode;
112 struct jbd2_inode jinode;
113
114 struct ext4_ext_cache i_cached_extent;
115 /*
116 * File creation time. Its function is same as that of
117 * struct timespec i_{a,c,m}time in the generic inode.
118 */
119 struct timespec i_crtime;
120
121 /* mballoc */
122 struct list_head i_prealloc_list;
123 spinlock_t i_prealloc_lock;
124
125 /* ialloc */
126 ext4_group_t i_last_alloc_group;
127
128 /* allocation reservation info for delalloc */
129 unsigned int i_reserved_data_blocks;
130 unsigned int i_reserved_meta_blocks;
131 unsigned int i_allocated_meta_blocks;
132 unsigned short i_delalloc_reserved_flag;
133
134 /* on-disk additional length */
135 __u16 i_extra_isize;
136
137 spinlock_t i_block_reservation_lock;
138};
139
140#endif /* _EXT4_I */
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
deleted file mode 100644
index 57b71fefbccf..000000000000
--- a/fs/ext4/ext4_sb.h
+++ /dev/null
@@ -1,161 +0,0 @@
1/*
2 * ext4_sb.h
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/include/linux/minix_fs_sb.h
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 */
15
16#ifndef _EXT4_SB
17#define _EXT4_SB
18
19#ifdef __KERNEL__
20#include <linux/timer.h>
21#include <linux/wait.h>
22#include <linux/blockgroup_lock.h>
23#include <linux/percpu_counter.h>
24#endif
25#include <linux/rbtree.h>
26
27/*
28 * fourth extended-fs super-block data in memory
29 */
30struct ext4_sb_info {
31 unsigned long s_desc_size; /* Size of a group descriptor in bytes */
32 unsigned long s_inodes_per_block;/* Number of inodes per block */
33 unsigned long s_blocks_per_group;/* Number of blocks in a group */
34 unsigned long s_inodes_per_group;/* Number of inodes in a group */
35 unsigned long s_itb_per_group; /* Number of inode table blocks per group */
36 unsigned long s_gdb_count; /* Number of group descriptor blocks */
37 unsigned long s_desc_per_block; /* Number of group descriptors per block */
38 ext4_group_t s_groups_count; /* Number of groups in the fs */
39 unsigned long s_overhead_last; /* Last calculated overhead */
40 unsigned long s_blocks_last; /* Last seen block count */
41 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
42 struct buffer_head * s_sbh; /* Buffer containing the super block */
43 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
44 struct buffer_head **s_group_desc;
45 unsigned long s_mount_opt;
46 ext4_fsblk_t s_sb_block;
47 uid_t s_resuid;
48 gid_t s_resgid;
49 unsigned short s_mount_state;
50 unsigned short s_pad;
51 int s_addr_per_block_bits;
52 int s_desc_per_block_bits;
53 int s_inode_size;
54 int s_first_ino;
55 unsigned int s_inode_readahead_blks;
56 spinlock_t s_next_gen_lock;
57 u32 s_next_generation;
58 u32 s_hash_seed[4];
59 int s_def_hash_version;
60 int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
61 struct percpu_counter s_freeblocks_counter;
62 struct percpu_counter s_freeinodes_counter;
63 struct percpu_counter s_dirs_counter;
64 struct percpu_counter s_dirtyblocks_counter;
65 struct blockgroup_lock *s_blockgroup_lock;
66 struct proc_dir_entry *s_proc;
67 struct kobject s_kobj;
68 struct completion s_kobj_unregister;
69
70 /* Journaling */
71 struct inode *s_journal_inode;
72 struct journal_s *s_journal;
73 struct list_head s_orphan;
74 unsigned long s_commit_interval;
75 u32 s_max_batch_time;
76 u32 s_min_batch_time;
77 struct block_device *journal_bdev;
78#ifdef CONFIG_JBD2_DEBUG
79 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
80 wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
81#endif
82#ifdef CONFIG_QUOTA
83 char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
84 int s_jquota_fmt; /* Format of quota to use */
85#endif
86 unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
87
88#ifdef EXTENTS_STATS
89 /* ext4 extents stats */
90 unsigned long s_ext_min;
91 unsigned long s_ext_max;
92 unsigned long s_depth_max;
93 spinlock_t s_ext_stats_lock;
94 unsigned long s_ext_blocks;
95 unsigned long s_ext_extents;
96#endif
97
98 /* for buddy allocator */
99 struct ext4_group_info ***s_group_info;
100 struct inode *s_buddy_cache;
101 long s_blocks_reserved;
102 spinlock_t s_reserve_lock;
103 spinlock_t s_md_lock;
104 tid_t s_last_transaction;
105 unsigned short *s_mb_offsets;
106 unsigned int *s_mb_maxs;
107
108 /* tunables */
109 unsigned long s_stripe;
110 unsigned int s_mb_stream_request;
111 unsigned int s_mb_max_to_scan;
112 unsigned int s_mb_min_to_scan;
113 unsigned int s_mb_stats;
114 unsigned int s_mb_order2_reqs;
115 unsigned int s_mb_group_prealloc;
116 /* where last allocation was done - for stream allocation */
117 unsigned long s_mb_last_group;
118 unsigned long s_mb_last_start;
119
120 /* history to debug policy */
121 struct ext4_mb_history *s_mb_history;
122 int s_mb_history_cur;
123 int s_mb_history_max;
124 int s_mb_history_num;
125 spinlock_t s_mb_history_lock;
126 int s_mb_history_filter;
127
128 /* stats for buddy allocator */
129 spinlock_t s_mb_pa_lock;
130 atomic_t s_bal_reqs; /* number of reqs with len > 1 */
131 atomic_t s_bal_success; /* we found long enough chunks */
132 atomic_t s_bal_allocated; /* in blocks */
133 atomic_t s_bal_ex_scanned; /* total extents scanned */
134 atomic_t s_bal_goals; /* goal hits */
135 atomic_t s_bal_breaks; /* too long searches */
136 atomic_t s_bal_2orders; /* 2^order hits */
137 spinlock_t s_bal_lock;
138 unsigned long s_mb_buddies_generated;
139 unsigned long long s_mb_generation_time;
140 atomic_t s_mb_lost_chunks;
141 atomic_t s_mb_preallocated;
142 atomic_t s_mb_discarded;
143
144 /* locality groups */
145 struct ext4_locality_group *s_locality_groups;
146
147 /* for write statistics */
148 unsigned long s_sectors_written_start;
149 u64 s_kbytes_written;
150
151 unsigned int s_log_groups_per_flex;
152 struct flex_groups *s_flex_groups;
153};
154
155static inline spinlock_t *
156sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
157{
158 return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
159}
160
161#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e3a55eb8b26a..2593f748c3a4 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -326,32 +326,18 @@ ext4_ext_max_entries(struct inode *inode, int depth)
326 326
327static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) 327static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
328{ 328{
329 ext4_fsblk_t block = ext_pblock(ext), valid_block; 329 ext4_fsblk_t block = ext_pblock(ext);
330 int len = ext4_ext_get_actual_len(ext); 330 int len = ext4_ext_get_actual_len(ext);
331 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
332 331
333 valid_block = le32_to_cpu(es->s_first_data_block) + 332 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
334 EXT4_SB(inode->i_sb)->s_gdb_count;
335 if (unlikely(block <= valid_block ||
336 ((block + len) > ext4_blocks_count(es))))
337 return 0;
338 else
339 return 1;
340} 333}
341 334
342static int ext4_valid_extent_idx(struct inode *inode, 335static int ext4_valid_extent_idx(struct inode *inode,
343 struct ext4_extent_idx *ext_idx) 336 struct ext4_extent_idx *ext_idx)
344{ 337{
345 ext4_fsblk_t block = idx_pblock(ext_idx), valid_block; 338 ext4_fsblk_t block = idx_pblock(ext_idx);
346 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
347 339
348 valid_block = le32_to_cpu(es->s_first_data_block) + 340 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
349 EXT4_SB(inode->i_sb)->s_gdb_count;
350 if (unlikely(block <= valid_block ||
351 (block >= ext4_blocks_count(es))))
352 return 0;
353 else
354 return 1;
355} 341}
356 342
357static int ext4_valid_extent_entries(struct inode *inode, 343static int ext4_valid_extent_entries(struct inode *inode,
@@ -2097,12 +2083,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2097 ex = EXT_LAST_EXTENT(eh); 2083 ex = EXT_LAST_EXTENT(eh);
2098 2084
2099 ex_ee_block = le32_to_cpu(ex->ee_block); 2085 ex_ee_block = le32_to_cpu(ex->ee_block);
2100 if (ext4_ext_is_uninitialized(ex))
2101 uninitialized = 1;
2102 ex_ee_len = ext4_ext_get_actual_len(ex); 2086 ex_ee_len = ext4_ext_get_actual_len(ex);
2103 2087
2104 while (ex >= EXT_FIRST_EXTENT(eh) && 2088 while (ex >= EXT_FIRST_EXTENT(eh) &&
2105 ex_ee_block + ex_ee_len > start) { 2089 ex_ee_block + ex_ee_len > start) {
2090
2091 if (ext4_ext_is_uninitialized(ex))
2092 uninitialized = 1;
2093 else
2094 uninitialized = 0;
2095
2106 ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len); 2096 ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
2107 path[depth].p_ext = ex; 2097 path[depth].p_ext = ex;
2108 2098
@@ -2784,7 +2774,7 @@ fix_extent_len:
2784int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 2774int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2785 ext4_lblk_t iblock, 2775 ext4_lblk_t iblock,
2786 unsigned int max_blocks, struct buffer_head *bh_result, 2776 unsigned int max_blocks, struct buffer_head *bh_result,
2787 int create, int extend_disksize) 2777 int flags)
2788{ 2778{
2789 struct ext4_ext_path *path = NULL; 2779 struct ext4_ext_path *path = NULL;
2790 struct ext4_extent_header *eh; 2780 struct ext4_extent_header *eh;
@@ -2793,7 +2783,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2793 int err = 0, depth, ret, cache_type; 2783 int err = 0, depth, ret, cache_type;
2794 unsigned int allocated = 0; 2784 unsigned int allocated = 0;
2795 struct ext4_allocation_request ar; 2785 struct ext4_allocation_request ar;
2796 loff_t disksize;
2797 2786
2798 __clear_bit(BH_New, &bh_result->b_state); 2787 __clear_bit(BH_New, &bh_result->b_state);
2799 ext_debug("blocks %u/%u requested for inode %u\n", 2788 ext_debug("blocks %u/%u requested for inode %u\n",
@@ -2803,7 +2792,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2803 cache_type = ext4_ext_in_cache(inode, iblock, &newex); 2792 cache_type = ext4_ext_in_cache(inode, iblock, &newex);
2804 if (cache_type) { 2793 if (cache_type) {
2805 if (cache_type == EXT4_EXT_CACHE_GAP) { 2794 if (cache_type == EXT4_EXT_CACHE_GAP) {
2806 if (!create) { 2795 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
2807 /* 2796 /*
2808 * block isn't allocated yet and 2797 * block isn't allocated yet and
2809 * user doesn't want to allocate it 2798 * user doesn't want to allocate it
@@ -2869,9 +2858,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2869 EXT4_EXT_CACHE_EXTENT); 2858 EXT4_EXT_CACHE_EXTENT);
2870 goto out; 2859 goto out;
2871 } 2860 }
2872 if (create == EXT4_CREATE_UNINITIALIZED_EXT) 2861 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
2873 goto out; 2862 goto out;
2874 if (!create) { 2863 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
2864 if (allocated > max_blocks)
2865 allocated = max_blocks;
2875 /* 2866 /*
2876 * We have blocks reserved already. We 2867 * We have blocks reserved already. We
2877 * return allocated blocks so that delalloc 2868 * return allocated blocks so that delalloc
@@ -2879,8 +2870,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2879 * the buffer head will be unmapped so that 2870 * the buffer head will be unmapped so that
2880 * a read from the block returns 0s. 2871 * a read from the block returns 0s.
2881 */ 2872 */
2882 if (allocated > max_blocks)
2883 allocated = max_blocks;
2884 set_buffer_unwritten(bh_result); 2873 set_buffer_unwritten(bh_result);
2885 bh_result->b_bdev = inode->i_sb->s_bdev; 2874 bh_result->b_bdev = inode->i_sb->s_bdev;
2886 bh_result->b_blocknr = newblock; 2875 bh_result->b_blocknr = newblock;
@@ -2903,7 +2892,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2903 * requested block isn't allocated yet; 2892 * requested block isn't allocated yet;
2904 * we couldn't try to create block if create flag is zero 2893 * we couldn't try to create block if create flag is zero
2905 */ 2894 */
2906 if (!create) { 2895 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
2907 /* 2896 /*
2908 * put just found gap into cache to speed up 2897 * put just found gap into cache to speed up
2909 * subsequent requests 2898 * subsequent requests
@@ -2932,10 +2921,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2932 * EXT_UNINIT_MAX_LEN. 2921 * EXT_UNINIT_MAX_LEN.
2933 */ 2922 */
2934 if (max_blocks > EXT_INIT_MAX_LEN && 2923 if (max_blocks > EXT_INIT_MAX_LEN &&
2935 create != EXT4_CREATE_UNINITIALIZED_EXT) 2924 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
2936 max_blocks = EXT_INIT_MAX_LEN; 2925 max_blocks = EXT_INIT_MAX_LEN;
2937 else if (max_blocks > EXT_UNINIT_MAX_LEN && 2926 else if (max_blocks > EXT_UNINIT_MAX_LEN &&
2938 create == EXT4_CREATE_UNINITIALIZED_EXT) 2927 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
2939 max_blocks = EXT_UNINIT_MAX_LEN; 2928 max_blocks = EXT_UNINIT_MAX_LEN;
2940 2929
2941 /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ 2930 /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
@@ -2966,7 +2955,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2966 /* try to insert new extent into found leaf and return */ 2955 /* try to insert new extent into found leaf and return */
2967 ext4_ext_store_pblock(&newex, newblock); 2956 ext4_ext_store_pblock(&newex, newblock);
2968 newex.ee_len = cpu_to_le16(ar.len); 2957 newex.ee_len = cpu_to_le16(ar.len);
2969 if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ 2958 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */
2970 ext4_ext_mark_uninitialized(&newex); 2959 ext4_ext_mark_uninitialized(&newex);
2971 err = ext4_ext_insert_extent(handle, inode, path, &newex); 2960 err = ext4_ext_insert_extent(handle, inode, path, &newex);
2972 if (err) { 2961 if (err) {
@@ -2983,18 +2972,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2983 newblock = ext_pblock(&newex); 2972 newblock = ext_pblock(&newex);
2984 allocated = ext4_ext_get_actual_len(&newex); 2973 allocated = ext4_ext_get_actual_len(&newex);
2985outnew: 2974outnew:
2986 if (extend_disksize) {
2987 disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
2988 if (disksize > i_size_read(inode))
2989 disksize = i_size_read(inode);
2990 if (disksize > EXT4_I(inode)->i_disksize)
2991 EXT4_I(inode)->i_disksize = disksize;
2992 }
2993
2994 set_buffer_new(bh_result); 2975 set_buffer_new(bh_result);
2995 2976
2996 /* Cache only when it is _not_ an uninitialized extent */ 2977 /* Cache only when it is _not_ an uninitialized extent */
2997 if (create != EXT4_CREATE_UNINITIALIZED_EXT) 2978 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
2998 ext4_ext_put_in_cache(inode, iblock, allocated, newblock, 2979 ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
2999 EXT4_EXT_CACHE_EXTENT); 2980 EXT4_EXT_CACHE_EXTENT);
3000out: 2981out:
@@ -3150,9 +3131,10 @@ retry:
3150 ret = PTR_ERR(handle); 3131 ret = PTR_ERR(handle);
3151 break; 3132 break;
3152 } 3133 }
3153 ret = ext4_get_blocks_wrap(handle, inode, block, 3134 map_bh.b_state = 0;
3154 max_blocks, &map_bh, 3135 ret = ext4_get_blocks(handle, inode, block,
3155 EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); 3136 max_blocks, &map_bh,
3137 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
3156 if (ret <= 0) { 3138 if (ret <= 0) {
3157#ifdef EXT4FS_DEBUG 3139#ifdef EXT4FS_DEBUG
3158 WARN_ON(ret <= 0); 3140 WARN_ON(ret <= 0);
@@ -3195,7 +3177,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3195 void *data) 3177 void *data)
3196{ 3178{
3197 struct fiemap_extent_info *fieinfo = data; 3179 struct fiemap_extent_info *fieinfo = data;
3198 unsigned long blksize_bits = inode->i_sb->s_blocksize_bits; 3180 unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
3199 __u64 logical; 3181 __u64 logical;
3200 __u64 physical; 3182 __u64 physical;
3201 __u64 length; 3183 __u64 length;
@@ -3242,9 +3224,16 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3242 * 3224 *
3243 * XXX this might miss a single-block extent at EXT_MAX_BLOCK 3225 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
3244 */ 3226 */
3245 if (logical + length - 1 == EXT_MAX_BLOCK || 3227 if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
3246 ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK) 3228 newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
3229 loff_t size = i_size_read(inode);
3230 loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
3231
3247 flags |= FIEMAP_EXTENT_LAST; 3232 flags |= FIEMAP_EXTENT_LAST;
3233 if ((flags & FIEMAP_EXTENT_DELALLOC) &&
3234 logical+length > size)
3235 length = (size - logical + bs - 1) & ~(bs-1);
3236 }
3248 3237
3249 error = fiemap_fill_next_extent(fieinfo, logical, physical, 3238 error = fiemap_fill_next_extent(fieinfo, logical, physical,
3250 length, flags); 3239 length, flags);
@@ -3318,10 +3307,10 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3318 * Walk the extent tree gathering extent information. 3307 * Walk the extent tree gathering extent information.
3319 * ext4_ext_fiemap_cb will push extents back to user. 3308 * ext4_ext_fiemap_cb will push extents back to user.
3320 */ 3309 */
3321 down_write(&EXT4_I(inode)->i_data_sem); 3310 down_read(&EXT4_I(inode)->i_data_sem);
3322 error = ext4_ext_walk_space(inode, start_blk, len_blks, 3311 error = ext4_ext_walk_space(inode, start_blk, len_blks,
3323 ext4_ext_fiemap_cb, fieinfo); 3312 ext4_ext_fiemap_cb, fieinfo);
3324 up_write(&EXT4_I(inode)->i_data_sem); 3313 up_read(&EXT4_I(inode)->i_data_sem);
3325 } 3314 }
3326 3315
3327 return error; 3316 return error;
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
deleted file mode 100644
index c2c0a8d06d0e..000000000000
--- a/fs/ext4/group.h
+++ /dev/null
@@ -1,29 +0,0 @@
1/*
2 * linux/fs/ext4/group.h
3 *
4 * Copyright (C) 2007 Cluster File Systems, Inc
5 *
6 * Author: Andreas Dilger <adilger@clusterfs.com>
7 */
8
9#ifndef _LINUX_EXT4_GROUP_H
10#define _LINUX_EXT4_GROUP_H
11
12extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
13 struct ext4_group_desc *gdp);
14extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
15 struct ext4_group_desc *gdp);
16struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
17 ext4_group_t block_group);
18extern unsigned ext4_init_block_bitmap(struct super_block *sb,
19 struct buffer_head *bh,
20 ext4_group_t group,
21 struct ext4_group_desc *desc);
22#define ext4_free_blocks_after_init(sb, group, desc) \
23 ext4_init_block_bitmap(sb, NULL, group, desc)
24extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
25 struct buffer_head *bh,
26 ext4_group_t group,
27 struct ext4_group_desc *desc);
28extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
29#endif /* _LINUX_EXT4_GROUP_H */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f18e0a08a6b5..3743bd849bce 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -27,7 +27,6 @@
27#include "ext4_jbd2.h" 27#include "ext4_jbd2.h"
28#include "xattr.h" 28#include "xattr.h"
29#include "acl.h" 29#include "acl.h"
30#include "group.h"
31 30
32/* 31/*
33 * ialloc.c contains the inodes allocation and deallocation routines 32 * ialloc.c contains the inodes allocation and deallocation routines
@@ -123,16 +122,16 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
123 unlock_buffer(bh); 122 unlock_buffer(bh);
124 return bh; 123 return bh;
125 } 124 }
126 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 125 ext4_lock_group(sb, block_group);
127 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { 126 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
128 ext4_init_inode_bitmap(sb, bh, block_group, desc); 127 ext4_init_inode_bitmap(sb, bh, block_group, desc);
129 set_bitmap_uptodate(bh); 128 set_bitmap_uptodate(bh);
130 set_buffer_uptodate(bh); 129 set_buffer_uptodate(bh);
131 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 130 ext4_unlock_group(sb, block_group);
132 unlock_buffer(bh); 131 unlock_buffer(bh);
133 return bh; 132 return bh;
134 } 133 }
135 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 134 ext4_unlock_group(sb, block_group);
136 if (buffer_uptodate(bh)) { 135 if (buffer_uptodate(bh)) {
137 /* 136 /*
138 * if not uninit if bh is uptodate, 137 * if not uninit if bh is uptodate,
@@ -247,9 +246,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
247 goto error_return; 246 goto error_return;
248 247
249 /* Ok, now we can actually update the inode bitmaps.. */ 248 /* Ok, now we can actually update the inode bitmaps.. */
250 spin_lock(sb_bgl_lock(sbi, block_group)); 249 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
251 cleared = ext4_clear_bit(bit, bitmap_bh->b_data); 250 bit, bitmap_bh->b_data);
252 spin_unlock(sb_bgl_lock(sbi, block_group));
253 if (!cleared) 251 if (!cleared)
254 ext4_error(sb, "ext4_free_inode", 252 ext4_error(sb, "ext4_free_inode",
255 "bit already cleared for inode %lu", ino); 253 "bit already cleared for inode %lu", ino);
@@ -261,7 +259,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
261 if (fatal) goto error_return; 259 if (fatal) goto error_return;
262 260
263 if (gdp) { 261 if (gdp) {
264 spin_lock(sb_bgl_lock(sbi, block_group)); 262 ext4_lock_group(sb, block_group);
265 count = ext4_free_inodes_count(sb, gdp) + 1; 263 count = ext4_free_inodes_count(sb, gdp) + 1;
266 ext4_free_inodes_set(sb, gdp, count); 264 ext4_free_inodes_set(sb, gdp, count);
267 if (is_directory) { 265 if (is_directory) {
@@ -277,7 +275,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
277 } 275 }
278 gdp->bg_checksum = ext4_group_desc_csum(sbi, 276 gdp->bg_checksum = ext4_group_desc_csum(sbi,
279 block_group, gdp); 277 block_group, gdp);
280 spin_unlock(sb_bgl_lock(sbi, block_group)); 278 ext4_unlock_group(sb, block_group);
281 percpu_counter_inc(&sbi->s_freeinodes_counter); 279 percpu_counter_inc(&sbi->s_freeinodes_counter);
282 if (is_directory) 280 if (is_directory)
283 percpu_counter_dec(&sbi->s_dirs_counter); 281 percpu_counter_dec(&sbi->s_dirs_counter);
@@ -316,7 +314,7 @@ error_return:
316static int find_group_dir(struct super_block *sb, struct inode *parent, 314static int find_group_dir(struct super_block *sb, struct inode *parent,
317 ext4_group_t *best_group) 315 ext4_group_t *best_group)
318{ 316{
319 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 317 ext4_group_t ngroups = ext4_get_groups_count(sb);
320 unsigned int freei, avefreei; 318 unsigned int freei, avefreei;
321 struct ext4_group_desc *desc, *best_desc = NULL; 319 struct ext4_group_desc *desc, *best_desc = NULL;
322 ext4_group_t group; 320 ext4_group_t group;
@@ -349,11 +347,10 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
349{ 347{
350 struct ext4_sb_info *sbi = EXT4_SB(sb); 348 struct ext4_sb_info *sbi = EXT4_SB(sb);
351 struct ext4_group_desc *desc; 349 struct ext4_group_desc *desc;
352 struct buffer_head *bh;
353 struct flex_groups *flex_group = sbi->s_flex_groups; 350 struct flex_groups *flex_group = sbi->s_flex_groups;
354 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 351 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
355 ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); 352 ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
356 ext4_group_t ngroups = sbi->s_groups_count; 353 ext4_group_t ngroups = ext4_get_groups_count(sb);
357 int flex_size = ext4_flex_bg_size(sbi); 354 int flex_size = ext4_flex_bg_size(sbi);
358 ext4_group_t best_flex = parent_fbg_group; 355 ext4_group_t best_flex = parent_fbg_group;
359 int blocks_per_flex = sbi->s_blocks_per_group * flex_size; 356 int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
@@ -362,7 +359,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
362 ext4_group_t n_fbg_groups; 359 ext4_group_t n_fbg_groups;
363 ext4_group_t i; 360 ext4_group_t i;
364 361
365 n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> 362 n_fbg_groups = (ngroups + flex_size - 1) >>
366 sbi->s_log_groups_per_flex; 363 sbi->s_log_groups_per_flex;
367 364
368find_close_to_parent: 365find_close_to_parent:
@@ -404,7 +401,7 @@ find_close_to_parent:
404found_flexbg: 401found_flexbg:
405 for (i = best_flex * flex_size; i < ngroups && 402 for (i = best_flex * flex_size; i < ngroups &&
406 i < (best_flex + 1) * flex_size; i++) { 403 i < (best_flex + 1) * flex_size; i++) {
407 desc = ext4_get_group_desc(sb, i, &bh); 404 desc = ext4_get_group_desc(sb, i, NULL);
408 if (ext4_free_inodes_count(sb, desc)) { 405 if (ext4_free_inodes_count(sb, desc)) {
409 *best_group = i; 406 *best_group = i;
410 goto out; 407 goto out;
@@ -478,20 +475,21 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
478{ 475{
479 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 476 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
480 struct ext4_sb_info *sbi = EXT4_SB(sb); 477 struct ext4_sb_info *sbi = EXT4_SB(sb);
481 ext4_group_t ngroups = sbi->s_groups_count; 478 ext4_group_t real_ngroups = ext4_get_groups_count(sb);
482 int inodes_per_group = EXT4_INODES_PER_GROUP(sb); 479 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
483 unsigned int freei, avefreei; 480 unsigned int freei, avefreei;
484 ext4_fsblk_t freeb, avefreeb; 481 ext4_fsblk_t freeb, avefreeb;
485 unsigned int ndirs; 482 unsigned int ndirs;
486 int max_dirs, min_inodes; 483 int max_dirs, min_inodes;
487 ext4_grpblk_t min_blocks; 484 ext4_grpblk_t min_blocks;
488 ext4_group_t i, grp, g; 485 ext4_group_t i, grp, g, ngroups;
489 struct ext4_group_desc *desc; 486 struct ext4_group_desc *desc;
490 struct orlov_stats stats; 487 struct orlov_stats stats;
491 int flex_size = ext4_flex_bg_size(sbi); 488 int flex_size = ext4_flex_bg_size(sbi);
492 489
490 ngroups = real_ngroups;
493 if (flex_size > 1) { 491 if (flex_size > 1) {
494 ngroups = (ngroups + flex_size - 1) >> 492 ngroups = (real_ngroups + flex_size - 1) >>
495 sbi->s_log_groups_per_flex; 493 sbi->s_log_groups_per_flex;
496 parent_group >>= sbi->s_log_groups_per_flex; 494 parent_group >>= sbi->s_log_groups_per_flex;
497 } 495 }
@@ -543,7 +541,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
543 */ 541 */
544 grp *= flex_size; 542 grp *= flex_size;
545 for (i = 0; i < flex_size; i++) { 543 for (i = 0; i < flex_size; i++) {
546 if (grp+i >= sbi->s_groups_count) 544 if (grp+i >= real_ngroups)
547 break; 545 break;
548 desc = ext4_get_group_desc(sb, grp+i, NULL); 546 desc = ext4_get_group_desc(sb, grp+i, NULL);
549 if (desc && ext4_free_inodes_count(sb, desc)) { 547 if (desc && ext4_free_inodes_count(sb, desc)) {
@@ -583,7 +581,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
583 } 581 }
584 582
585fallback: 583fallback:
586 ngroups = sbi->s_groups_count; 584 ngroups = real_ngroups;
587 avefreei = freei / ngroups; 585 avefreei = freei / ngroups;
588fallback_retry: 586fallback_retry:
589 parent_group = EXT4_I(parent)->i_block_group; 587 parent_group = EXT4_I(parent)->i_block_group;
@@ -613,9 +611,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
613 ext4_group_t *group, int mode) 611 ext4_group_t *group, int mode)
614{ 612{
615 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 613 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
616 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 614 ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
617 struct ext4_group_desc *desc; 615 struct ext4_group_desc *desc;
618 ext4_group_t i, last;
619 int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); 616 int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
620 617
621 /* 618 /*
@@ -708,10 +705,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
708 705
709/* 706/*
710 * claim the inode from the inode bitmap. If the group 707 * claim the inode from the inode bitmap. If the group
711 * is uninit we need to take the groups's sb_bgl_lock 708 * is uninit we need to take the groups's ext4_group_lock
712 * and clear the uninit flag. The inode bitmap update 709 * and clear the uninit flag. The inode bitmap update
713 * and group desc uninit flag clear should be done 710 * and group desc uninit flag clear should be done
714 * after holding sb_bgl_lock so that ext4_read_inode_bitmap 711 * after holding ext4_group_lock so that ext4_read_inode_bitmap
715 * doesn't race with the ext4_claim_inode 712 * doesn't race with the ext4_claim_inode
716 */ 713 */
717static int ext4_claim_inode(struct super_block *sb, 714static int ext4_claim_inode(struct super_block *sb,
@@ -722,7 +719,7 @@ static int ext4_claim_inode(struct super_block *sb,
722 struct ext4_sb_info *sbi = EXT4_SB(sb); 719 struct ext4_sb_info *sbi = EXT4_SB(sb);
723 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); 720 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
724 721
725 spin_lock(sb_bgl_lock(sbi, group)); 722 ext4_lock_group(sb, group);
726 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { 723 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
727 /* not a free inode */ 724 /* not a free inode */
728 retval = 1; 725 retval = 1;
@@ -731,7 +728,7 @@ static int ext4_claim_inode(struct super_block *sb,
731 ino++; 728 ino++;
732 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || 729 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
733 ino > EXT4_INODES_PER_GROUP(sb)) { 730 ino > EXT4_INODES_PER_GROUP(sb)) {
734 spin_unlock(sb_bgl_lock(sbi, group)); 731 ext4_unlock_group(sb, group);
735 ext4_error(sb, __func__, 732 ext4_error(sb, __func__,
736 "reserved inode or inode > inodes count - " 733 "reserved inode or inode > inodes count - "
737 "block_group = %u, inode=%lu", group, 734 "block_group = %u, inode=%lu", group,
@@ -780,7 +777,7 @@ static int ext4_claim_inode(struct super_block *sb,
780 } 777 }
781 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 778 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
782err_ret: 779err_ret:
783 spin_unlock(sb_bgl_lock(sbi, group)); 780 ext4_unlock_group(sb, group);
784 return retval; 781 return retval;
785} 782}
786 783
@@ -799,11 +796,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
799 struct super_block *sb; 796 struct super_block *sb;
800 struct buffer_head *inode_bitmap_bh = NULL; 797 struct buffer_head *inode_bitmap_bh = NULL;
801 struct buffer_head *group_desc_bh; 798 struct buffer_head *group_desc_bh;
802 ext4_group_t group = 0; 799 ext4_group_t ngroups, group = 0;
803 unsigned long ino = 0; 800 unsigned long ino = 0;
804 struct inode *inode; 801 struct inode *inode;
805 struct ext4_group_desc *gdp = NULL; 802 struct ext4_group_desc *gdp = NULL;
806 struct ext4_super_block *es;
807 struct ext4_inode_info *ei; 803 struct ext4_inode_info *ei;
808 struct ext4_sb_info *sbi; 804 struct ext4_sb_info *sbi;
809 int ret2, err = 0; 805 int ret2, err = 0;
@@ -818,15 +814,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
818 return ERR_PTR(-EPERM); 814 return ERR_PTR(-EPERM);
819 815
820 sb = dir->i_sb; 816 sb = dir->i_sb;
817 ngroups = ext4_get_groups_count(sb);
821 trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, 818 trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id,
822 dir->i_ino, mode); 819 dir->i_ino, mode);
823 inode = new_inode(sb); 820 inode = new_inode(sb);
824 if (!inode) 821 if (!inode)
825 return ERR_PTR(-ENOMEM); 822 return ERR_PTR(-ENOMEM);
826 ei = EXT4_I(inode); 823 ei = EXT4_I(inode);
827
828 sbi = EXT4_SB(sb); 824 sbi = EXT4_SB(sb);
829 es = sbi->s_es;
830 825
831 if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { 826 if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
832 ret2 = find_group_flex(sb, dir, &group); 827 ret2 = find_group_flex(sb, dir, &group);
@@ -856,7 +851,7 @@ got_group:
856 if (ret2 == -1) 851 if (ret2 == -1)
857 goto out; 852 goto out;
858 853
859 for (i = 0; i < sbi->s_groups_count; i++) { 854 for (i = 0; i < ngroups; i++) {
860 err = -EIO; 855 err = -EIO;
861 856
862 gdp = ext4_get_group_desc(sb, group, &group_desc_bh); 857 gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
@@ -917,7 +912,7 @@ repeat_in_this_group:
917 * group descriptor metadata has not yet been updated. 912 * group descriptor metadata has not yet been updated.
918 * So we just go onto the next blockgroup. 913 * So we just go onto the next blockgroup.
919 */ 914 */
920 if (++group == sbi->s_groups_count) 915 if (++group == ngroups)
921 group = 0; 916 group = 0;
922 } 917 }
923 err = -ENOSPC; 918 err = -ENOSPC;
@@ -938,7 +933,7 @@ got:
938 } 933 }
939 934
940 free = 0; 935 free = 0;
941 spin_lock(sb_bgl_lock(sbi, group)); 936 ext4_lock_group(sb, group);
942 /* recheck and clear flag under lock if we still need to */ 937 /* recheck and clear flag under lock if we still need to */
943 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 938 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
944 free = ext4_free_blocks_after_init(sb, group, gdp); 939 free = ext4_free_blocks_after_init(sb, group, gdp);
@@ -947,7 +942,7 @@ got:
947 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, 942 gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
948 gdp); 943 gdp);
949 } 944 }
950 spin_unlock(sb_bgl_lock(sbi, group)); 945 ext4_unlock_group(sb, group);
951 946
952 /* Don't need to dirty bitmap block if we didn't change it */ 947 /* Don't need to dirty bitmap block if we didn't change it */
953 if (free) { 948 if (free) {
@@ -1158,7 +1153,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1158{ 1153{
1159 unsigned long desc_count; 1154 unsigned long desc_count;
1160 struct ext4_group_desc *gdp; 1155 struct ext4_group_desc *gdp;
1161 ext4_group_t i; 1156 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
1162#ifdef EXT4FS_DEBUG 1157#ifdef EXT4FS_DEBUG
1163 struct ext4_super_block *es; 1158 struct ext4_super_block *es;
1164 unsigned long bitmap_count, x; 1159 unsigned long bitmap_count, x;
@@ -1168,7 +1163,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1168 desc_count = 0; 1163 desc_count = 0;
1169 bitmap_count = 0; 1164 bitmap_count = 0;
1170 gdp = NULL; 1165 gdp = NULL;
1171 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 1166 for (i = 0; i < ngroups; i++) {
1172 gdp = ext4_get_group_desc(sb, i, NULL); 1167 gdp = ext4_get_group_desc(sb, i, NULL);
1173 if (!gdp) 1168 if (!gdp)
1174 continue; 1169 continue;
@@ -1190,7 +1185,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1190 return desc_count; 1185 return desc_count;
1191#else 1186#else
1192 desc_count = 0; 1187 desc_count = 0;
1193 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 1188 for (i = 0; i < ngroups; i++) {
1194 gdp = ext4_get_group_desc(sb, i, NULL); 1189 gdp = ext4_get_group_desc(sb, i, NULL);
1195 if (!gdp) 1190 if (!gdp)
1196 continue; 1191 continue;
@@ -1205,9 +1200,9 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1205unsigned long ext4_count_dirs(struct super_block * sb) 1200unsigned long ext4_count_dirs(struct super_block * sb)
1206{ 1201{
1207 unsigned long count = 0; 1202 unsigned long count = 0;
1208 ext4_group_t i; 1203 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
1209 1204
1210 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 1205 for (i = 0; i < ngroups; i++) {
1211 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); 1206 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
1212 if (!gdp) 1207 if (!gdp)
1213 continue; 1208 continue;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2a9ffd528dd1..875db944b22f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -372,20 +372,21 @@ static int ext4_block_to_path(struct inode *inode,
372} 372}
373 373
374static int __ext4_check_blockref(const char *function, struct inode *inode, 374static int __ext4_check_blockref(const char *function, struct inode *inode,
375 __le32 *p, unsigned int max) { 375 __le32 *p, unsigned int max)
376 376{
377 unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
378 __le32 *bref = p; 377 __le32 *bref = p;
378 unsigned int blk;
379
379 while (bref < p+max) { 380 while (bref < p+max) {
380 if (unlikely(le32_to_cpu(*bref) >= maxblocks)) { 381 blk = le32_to_cpu(*bref++);
382 if (blk &&
383 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
384 blk, 1))) {
381 ext4_error(inode->i_sb, function, 385 ext4_error(inode->i_sb, function,
382 "block reference %u >= max (%u) " 386 "invalid block reference %u "
383 "in inode #%lu, offset=%d", 387 "in inode #%lu", blk, inode->i_ino);
384 le32_to_cpu(*bref), maxblocks,
385 inode->i_ino, (int)(bref-p));
386 return -EIO; 388 return -EIO;
387 } 389 }
388 bref++;
389 } 390 }
390 return 0; 391 return 0;
391} 392}
@@ -892,6 +893,10 @@ err_out:
892} 893}
893 894
894/* 895/*
896 * The ext4_ind_get_blocks() function handles non-extents inodes
897 * (i.e., using the traditional indirect/double-indirect i_blocks
898 * scheme) for ext4_get_blocks().
899 *
895 * Allocation strategy is simple: if we have to allocate something, we will 900 * Allocation strategy is simple: if we have to allocate something, we will
896 * have to go the whole way to leaf. So let's do it before attaching anything 901 * have to go the whole way to leaf. So let's do it before attaching anything
897 * to tree, set linkage between the newborn blocks, write them if sync is 902 * to tree, set linkage between the newborn blocks, write them if sync is
@@ -909,15 +914,16 @@ err_out:
909 * return = 0, if plain lookup failed. 914 * return = 0, if plain lookup failed.
910 * return < 0, error case. 915 * return < 0, error case.
911 * 916 *
912 * 917 * The ext4_ind_get_blocks() function should be called with
913 * Need to be called with 918 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
914 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block 919 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
915 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) 920 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
921 * blocks.
916 */ 922 */
917static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, 923static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
918 ext4_lblk_t iblock, unsigned int maxblocks, 924 ext4_lblk_t iblock, unsigned int maxblocks,
919 struct buffer_head *bh_result, 925 struct buffer_head *bh_result,
920 int create, int extend_disksize) 926 int flags)
921{ 927{
922 int err = -EIO; 928 int err = -EIO;
923 ext4_lblk_t offsets[4]; 929 ext4_lblk_t offsets[4];
@@ -927,14 +933,11 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
927 int indirect_blks; 933 int indirect_blks;
928 int blocks_to_boundary = 0; 934 int blocks_to_boundary = 0;
929 int depth; 935 int depth;
930 struct ext4_inode_info *ei = EXT4_I(inode);
931 int count = 0; 936 int count = 0;
932 ext4_fsblk_t first_block = 0; 937 ext4_fsblk_t first_block = 0;
933 loff_t disksize;
934
935 938
936 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 939 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
937 J_ASSERT(handle != NULL || create == 0); 940 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
938 depth = ext4_block_to_path(inode, iblock, offsets, 941 depth = ext4_block_to_path(inode, iblock, offsets,
939 &blocks_to_boundary); 942 &blocks_to_boundary);
940 943
@@ -963,7 +966,7 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
963 } 966 }
964 967
965 /* Next simple case - plain lookup or failed read of indirect block */ 968 /* Next simple case - plain lookup or failed read of indirect block */
966 if (!create || err == -EIO) 969 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
967 goto cleanup; 970 goto cleanup;
968 971
969 /* 972 /*
@@ -997,19 +1000,7 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
997 if (!err) 1000 if (!err)
998 err = ext4_splice_branch(handle, inode, iblock, 1001 err = ext4_splice_branch(handle, inode, iblock,
999 partial, indirect_blks, count); 1002 partial, indirect_blks, count);
1000 /* 1003 else
1001 * i_disksize growing is protected by i_data_sem. Don't forget to
1002 * protect it if you're about to implement concurrent
1003 * ext4_get_block() -bzzz
1004 */
1005 if (!err && extend_disksize) {
1006 disksize = ((loff_t) iblock + count) << inode->i_blkbits;
1007 if (disksize > i_size_read(inode))
1008 disksize = i_size_read(inode);
1009 if (disksize > ei->i_disksize)
1010 ei->i_disksize = disksize;
1011 }
1012 if (err)
1013 goto cleanup; 1004 goto cleanup;
1014 1005
1015 set_buffer_new(bh_result); 1006 set_buffer_new(bh_result);
@@ -1120,8 +1111,23 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1120 ext4_discard_preallocations(inode); 1111 ext4_discard_preallocations(inode);
1121} 1112}
1122 1113
1114static int check_block_validity(struct inode *inode, sector_t logical,
1115 sector_t phys, int len)
1116{
1117 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
1118 ext4_error(inode->i_sb, "check_block_validity",
1119 "inode #%lu logical block %llu mapped to %llu "
1120 "(size %d)", inode->i_ino,
1121 (unsigned long long) logical,
1122 (unsigned long long) phys, len);
1123 WARN_ON(1);
1124 return -EIO;
1125 }
1126 return 0;
1127}
1128
1123/* 1129/*
1124 * The ext4_get_blocks_wrap() function try to look up the requested blocks, 1130 * The ext4_get_blocks() function tries to look up the requested blocks,
1125 * and returns if the blocks are already mapped. 1131 * and returns if the blocks are already mapped.
1126 * 1132 *
1127 * Otherwise it takes the write lock of the i_data_sem and allocate blocks 1133 * Otherwise it takes the write lock of the i_data_sem and allocate blocks
@@ -1129,7 +1135,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1129 * mapped. 1135 * mapped.
1130 * 1136 *
1131 * If file type is extents based, it will call ext4_ext_get_blocks(), 1137 * If file type is extents based, it will call ext4_ext_get_blocks(),
1132 * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping 1138 * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping
1133 * based files 1139 * based files
1134 * 1140 *
1135 * On success, it returns the number of blocks being mapped or allocate. 1141 * On success, it returns the number of blocks being mapped or allocate.
@@ -1142,9 +1148,9 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1142 * 1148 *
1143 * It returns the error in case of allocation failure. 1149 * It returns the error in case of allocation failure.
1144 */ 1150 */
1145int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, 1151int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1146 unsigned int max_blocks, struct buffer_head *bh, 1152 unsigned int max_blocks, struct buffer_head *bh,
1147 int create, int extend_disksize, int flag) 1153 int flags)
1148{ 1154{
1149 int retval; 1155 int retval;
1150 1156
@@ -1152,21 +1158,28 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1152 clear_buffer_unwritten(bh); 1158 clear_buffer_unwritten(bh);
1153 1159
1154 /* 1160 /*
1155 * Try to see if we can get the block without requesting 1161 * Try to see if we can get the block without requesting a new
1156 * for new file system block. 1162 * file system block.
1157 */ 1163 */
1158 down_read((&EXT4_I(inode)->i_data_sem)); 1164 down_read((&EXT4_I(inode)->i_data_sem));
1159 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1165 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
1160 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1166 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
1161 bh, 0, 0); 1167 bh, 0);
1162 } else { 1168 } else {
1163 retval = ext4_get_blocks_handle(handle, 1169 retval = ext4_ind_get_blocks(handle, inode, block, max_blocks,
1164 inode, block, max_blocks, bh, 0, 0); 1170 bh, 0);
1165 } 1171 }
1166 up_read((&EXT4_I(inode)->i_data_sem)); 1172 up_read((&EXT4_I(inode)->i_data_sem));
1167 1173
1174 if (retval > 0 && buffer_mapped(bh)) {
1175 int ret = check_block_validity(inode, block,
1176 bh->b_blocknr, retval);
1177 if (ret != 0)
1178 return ret;
1179 }
1180
1168 /* If it is only a block(s) look up */ 1181 /* If it is only a block(s) look up */
1169 if (!create) 1182 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
1170 return retval; 1183 return retval;
1171 1184
1172 /* 1185 /*
@@ -1205,7 +1218,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1205 * let the underlying get_block() function know to 1218 * let the underlying get_block() function know to
1206 * avoid double accounting 1219 * avoid double accounting
1207 */ 1220 */
1208 if (flag) 1221 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1209 EXT4_I(inode)->i_delalloc_reserved_flag = 1; 1222 EXT4_I(inode)->i_delalloc_reserved_flag = 1;
1210 /* 1223 /*
1211 * We need to check for EXT4 here because migrate 1224 * We need to check for EXT4 here because migrate
@@ -1213,10 +1226,10 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1213 */ 1226 */
1214 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1227 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
1215 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1228 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
1216 bh, create, extend_disksize); 1229 bh, flags);
1217 } else { 1230 } else {
1218 retval = ext4_get_blocks_handle(handle, inode, block, 1231 retval = ext4_ind_get_blocks(handle, inode, block,
1219 max_blocks, bh, create, extend_disksize); 1232 max_blocks, bh, flags);
1220 1233
1221 if (retval > 0 && buffer_new(bh)) { 1234 if (retval > 0 && buffer_new(bh)) {
1222 /* 1235 /*
@@ -1229,18 +1242,23 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1229 } 1242 }
1230 } 1243 }
1231 1244
1232 if (flag) { 1245 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1233 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1246 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1234 /* 1247
1235 * Update reserved blocks/metadata blocks 1248 /*
1236 * after successful block allocation 1249 * Update reserved blocks/metadata blocks after successful
1237 * which were deferred till now 1250 * block allocation which had been deferred till now.
1238 */ 1251 */
1239 if ((retval > 0) && buffer_delay(bh)) 1252 if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
1240 ext4_da_update_reserve_space(inode, retval); 1253 ext4_da_update_reserve_space(inode, retval);
1241 }
1242 1254
1243 up_write((&EXT4_I(inode)->i_data_sem)); 1255 up_write((&EXT4_I(inode)->i_data_sem));
1256 if (retval > 0 && buffer_mapped(bh)) {
1257 int ret = check_block_validity(inode, block,
1258 bh->b_blocknr, retval);
1259 if (ret != 0)
1260 return ret;
1261 }
1244 return retval; 1262 return retval;
1245} 1263}
1246 1264
@@ -1268,8 +1286,8 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
1268 started = 1; 1286 started = 1;
1269 } 1287 }
1270 1288
1271 ret = ext4_get_blocks_wrap(handle, inode, iblock, 1289 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
1272 max_blocks, bh_result, create, 0, 0); 1290 create ? EXT4_GET_BLOCKS_CREATE : 0);
1273 if (ret > 0) { 1291 if (ret > 0) {
1274 bh_result->b_size = (ret << inode->i_blkbits); 1292 bh_result->b_size = (ret << inode->i_blkbits);
1275 ret = 0; 1293 ret = 0;
@@ -1288,17 +1306,19 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1288{ 1306{
1289 struct buffer_head dummy; 1307 struct buffer_head dummy;
1290 int fatal = 0, err; 1308 int fatal = 0, err;
1309 int flags = 0;
1291 1310
1292 J_ASSERT(handle != NULL || create == 0); 1311 J_ASSERT(handle != NULL || create == 0);
1293 1312
1294 dummy.b_state = 0; 1313 dummy.b_state = 0;
1295 dummy.b_blocknr = -1000; 1314 dummy.b_blocknr = -1000;
1296 buffer_trace_init(&dummy.b_history); 1315 buffer_trace_init(&dummy.b_history);
1297 err = ext4_get_blocks_wrap(handle, inode, block, 1, 1316 if (create)
1298 &dummy, create, 1, 0); 1317 flags |= EXT4_GET_BLOCKS_CREATE;
1318 err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags);
1299 /* 1319 /*
1300 * ext4_get_blocks_handle() returns number of blocks 1320 * ext4_get_blocks() returns number of blocks mapped. 0 in
1301 * mapped. 0 in case of a HOLE. 1321 * case of a HOLE.
1302 */ 1322 */
1303 if (err > 0) { 1323 if (err > 0) {
1304 if (err > 1) 1324 if (err > 1)
@@ -1439,7 +1459,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
1439 struct page **pagep, void **fsdata) 1459 struct page **pagep, void **fsdata)
1440{ 1460{
1441 struct inode *inode = mapping->host; 1461 struct inode *inode = mapping->host;
1442 int ret, needed_blocks = ext4_writepage_trans_blocks(inode); 1462 int ret, needed_blocks;
1443 handle_t *handle; 1463 handle_t *handle;
1444 int retries = 0; 1464 int retries = 0;
1445 struct page *page; 1465 struct page *page;
@@ -1450,6 +1470,11 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
1450 "dev %s ino %lu pos %llu len %u flags %u", 1470 "dev %s ino %lu pos %llu len %u flags %u",
1451 inode->i_sb->s_id, inode->i_ino, 1471 inode->i_sb->s_id, inode->i_ino,
1452 (unsigned long long) pos, len, flags); 1472 (unsigned long long) pos, len, flags);
1473 /*
1474 * Reserve one block more for addition to orphan list in case
1475 * we allocate blocks but write fails for some reason
1476 */
1477 needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
1453 index = pos >> PAGE_CACHE_SHIFT; 1478 index = pos >> PAGE_CACHE_SHIFT;
1454 from = pos & (PAGE_CACHE_SIZE - 1); 1479 from = pos & (PAGE_CACHE_SIZE - 1);
1455 to = from + len; 1480 to = from + len;
@@ -1483,15 +1508,30 @@ retry:
1483 1508
1484 if (ret) { 1509 if (ret) {
1485 unlock_page(page); 1510 unlock_page(page);
1486 ext4_journal_stop(handle);
1487 page_cache_release(page); 1511 page_cache_release(page);
1488 /* 1512 /*
1489 * block_write_begin may have instantiated a few blocks 1513 * block_write_begin may have instantiated a few blocks
1490 * outside i_size. Trim these off again. Don't need 1514 * outside i_size. Trim these off again. Don't need
1491 * i_size_read because we hold i_mutex. 1515 * i_size_read because we hold i_mutex.
1516 *
1517 * Add inode to orphan list in case we crash before
1518 * truncate finishes
1492 */ 1519 */
1493 if (pos + len > inode->i_size) 1520 if (pos + len > inode->i_size)
1521 ext4_orphan_add(handle, inode);
1522
1523 ext4_journal_stop(handle);
1524 if (pos + len > inode->i_size) {
1494 vmtruncate(inode, inode->i_size); 1525 vmtruncate(inode, inode->i_size);
1526 /*
1527 * If vmtruncate failed early the inode might
1528 * still be on the orphan list; we need to
1529 * make sure the inode is removed from the
1530 * orphan list in that case.
1531 */
1532 if (inode->i_nlink)
1533 ext4_orphan_del(NULL, inode);
1534 }
1495 } 1535 }
1496 1536
1497 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 1537 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -1509,6 +1549,52 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1509 return ext4_handle_dirty_metadata(handle, NULL, bh); 1549 return ext4_handle_dirty_metadata(handle, NULL, bh);
1510} 1550}
1511 1551
1552static int ext4_generic_write_end(struct file *file,
1553 struct address_space *mapping,
1554 loff_t pos, unsigned len, unsigned copied,
1555 struct page *page, void *fsdata)
1556{
1557 int i_size_changed = 0;
1558 struct inode *inode = mapping->host;
1559 handle_t *handle = ext4_journal_current_handle();
1560
1561 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1562
1563 /*
1564 * No need to use i_size_read() here, the i_size
1565 * cannot change under us because we hold i_mutex.
1566 *
1567 * But it's important to update i_size while still holding page lock:
1568 * page writeout could otherwise come in and zero beyond i_size.
1569 */
1570 if (pos + copied > inode->i_size) {
1571 i_size_write(inode, pos + copied);
1572 i_size_changed = 1;
1573 }
1574
1575 if (pos + copied > EXT4_I(inode)->i_disksize) {
1576 /* We need to mark inode dirty even if
1577 * new_i_size is less that inode->i_size
1578 * bu greater than i_disksize.(hint delalloc)
1579 */
1580 ext4_update_i_disksize(inode, (pos + copied));
1581 i_size_changed = 1;
1582 }
1583 unlock_page(page);
1584 page_cache_release(page);
1585
1586 /*
1587 * Don't mark the inode dirty under page lock. First, it unnecessarily
1588 * makes the holding time of page lock longer. Second, it forces lock
1589 * ordering of page lock and transaction start for journaling
1590 * filesystems.
1591 */
1592 if (i_size_changed)
1593 ext4_mark_inode_dirty(handle, inode);
1594
1595 return copied;
1596}
1597
1512/* 1598/*
1513 * We need to pick up the new inode size which generic_commit_write gave us 1599 * We need to pick up the new inode size which generic_commit_write gave us
1514 * `file' can be NULL - eg, when called from page_symlink(). 1600 * `file' can be NULL - eg, when called from page_symlink().
@@ -1532,21 +1618,15 @@ static int ext4_ordered_write_end(struct file *file,
1532 ret = ext4_jbd2_file_inode(handle, inode); 1618 ret = ext4_jbd2_file_inode(handle, inode);
1533 1619
1534 if (ret == 0) { 1620 if (ret == 0) {
1535 loff_t new_i_size; 1621 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1536
1537 new_i_size = pos + copied;
1538 if (new_i_size > EXT4_I(inode)->i_disksize) {
1539 ext4_update_i_disksize(inode, new_i_size);
1540 /* We need to mark inode dirty even if
1541 * new_i_size is less that inode->i_size
1542 * bu greater than i_disksize.(hint delalloc)
1543 */
1544 ext4_mark_inode_dirty(handle, inode);
1545 }
1546
1547 ret2 = generic_write_end(file, mapping, pos, len, copied,
1548 page, fsdata); 1622 page, fsdata);
1549 copied = ret2; 1623 copied = ret2;
1624 if (pos + len > inode->i_size)
1625 /* if we have allocated more blocks and copied
1626 * less. We will have blocks allocated outside
1627 * inode->i_size. So truncate them
1628 */
1629 ext4_orphan_add(handle, inode);
1550 if (ret2 < 0) 1630 if (ret2 < 0)
1551 ret = ret2; 1631 ret = ret2;
1552 } 1632 }
@@ -1554,6 +1634,18 @@ static int ext4_ordered_write_end(struct file *file,
1554 if (!ret) 1634 if (!ret)
1555 ret = ret2; 1635 ret = ret2;
1556 1636
1637 if (pos + len > inode->i_size) {
1638 vmtruncate(inode, inode->i_size);
1639 /*
1640 * If vmtruncate failed early the inode might still be
1641 * on the orphan list; we need to make sure the inode
1642 * is removed from the orphan list in that case.
1643 */
1644 if (inode->i_nlink)
1645 ext4_orphan_del(NULL, inode);
1646 }
1647
1648
1557 return ret ? ret : copied; 1649 return ret ? ret : copied;
1558} 1650}
1559 1651
@@ -1565,25 +1657,21 @@ static int ext4_writeback_write_end(struct file *file,
1565 handle_t *handle = ext4_journal_current_handle(); 1657 handle_t *handle = ext4_journal_current_handle();
1566 struct inode *inode = mapping->host; 1658 struct inode *inode = mapping->host;
1567 int ret = 0, ret2; 1659 int ret = 0, ret2;
1568 loff_t new_i_size;
1569 1660
1570 trace_mark(ext4_writeback_write_end, 1661 trace_mark(ext4_writeback_write_end,
1571 "dev %s ino %lu pos %llu len %u copied %u", 1662 "dev %s ino %lu pos %llu len %u copied %u",
1572 inode->i_sb->s_id, inode->i_ino, 1663 inode->i_sb->s_id, inode->i_ino,
1573 (unsigned long long) pos, len, copied); 1664 (unsigned long long) pos, len, copied);
1574 new_i_size = pos + copied; 1665 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1575 if (new_i_size > EXT4_I(inode)->i_disksize) {
1576 ext4_update_i_disksize(inode, new_i_size);
1577 /* We need to mark inode dirty even if
1578 * new_i_size is less that inode->i_size
1579 * bu greater than i_disksize.(hint delalloc)
1580 */
1581 ext4_mark_inode_dirty(handle, inode);
1582 }
1583
1584 ret2 = generic_write_end(file, mapping, pos, len, copied,
1585 page, fsdata); 1666 page, fsdata);
1586 copied = ret2; 1667 copied = ret2;
1668 if (pos + len > inode->i_size)
1669 /* if we have allocated more blocks and copied
1670 * less. We will have blocks allocated outside
1671 * inode->i_size. So truncate them
1672 */
1673 ext4_orphan_add(handle, inode);
1674
1587 if (ret2 < 0) 1675 if (ret2 < 0)
1588 ret = ret2; 1676 ret = ret2;
1589 1677
@@ -1591,6 +1679,17 @@ static int ext4_writeback_write_end(struct file *file,
1591 if (!ret) 1679 if (!ret)
1592 ret = ret2; 1680 ret = ret2;
1593 1681
1682 if (pos + len > inode->i_size) {
1683 vmtruncate(inode, inode->i_size);
1684 /*
1685 * If vmtruncate failed early the inode might still be
1686 * on the orphan list; we need to make sure the inode
1687 * is removed from the orphan list in that case.
1688 */
1689 if (inode->i_nlink)
1690 ext4_orphan_del(NULL, inode);
1691 }
1692
1594 return ret ? ret : copied; 1693 return ret ? ret : copied;
1595} 1694}
1596 1695
@@ -1635,10 +1734,27 @@ static int ext4_journalled_write_end(struct file *file,
1635 } 1734 }
1636 1735
1637 unlock_page(page); 1736 unlock_page(page);
1737 page_cache_release(page);
1738 if (pos + len > inode->i_size)
1739 /* if we have allocated more blocks and copied
1740 * less. We will have blocks allocated outside
1741 * inode->i_size. So truncate them
1742 */
1743 ext4_orphan_add(handle, inode);
1744
1638 ret2 = ext4_journal_stop(handle); 1745 ret2 = ext4_journal_stop(handle);
1639 if (!ret) 1746 if (!ret)
1640 ret = ret2; 1747 ret = ret2;
1641 page_cache_release(page); 1748 if (pos + len > inode->i_size) {
1749 vmtruncate(inode, inode->i_size);
1750 /*
1751 * If vmtruncate failed early the inode might still be
1752 * on the orphan list; we need to make sure the inode
1753 * is removed from the orphan list in that case.
1754 */
1755 if (inode->i_nlink)
1756 ext4_orphan_del(NULL, inode);
1757 }
1642 1758
1643 return ret ? ret : copied; 1759 return ret ? ret : copied;
1644} 1760}
@@ -1852,7 +1968,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
1852 * @logical - first logical block to start assignment with 1968 * @logical - first logical block to start assignment with
1853 * 1969 *
1854 * the function goes through all passed space and put actual disk 1970 * the function goes through all passed space and put actual disk
1855 * block numbers into buffer heads, dropping BH_Delay 1971 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
1856 */ 1972 */
1857static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, 1973static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1858 struct buffer_head *exbh) 1974 struct buffer_head *exbh)
@@ -1902,16 +2018,24 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1902 do { 2018 do {
1903 if (cur_logical >= logical + blocks) 2019 if (cur_logical >= logical + blocks)
1904 break; 2020 break;
1905 if (buffer_delay(bh)) { 2021
1906 bh->b_blocknr = pblock; 2022 if (buffer_delay(bh) ||
1907 clear_buffer_delay(bh); 2023 buffer_unwritten(bh)) {
1908 bh->b_bdev = inode->i_sb->s_bdev; 2024
1909 } else if (buffer_unwritten(bh)) { 2025 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
1910 bh->b_blocknr = pblock; 2026
1911 clear_buffer_unwritten(bh); 2027 if (buffer_delay(bh)) {
1912 set_buffer_mapped(bh); 2028 clear_buffer_delay(bh);
1913 set_buffer_new(bh); 2029 bh->b_blocknr = pblock;
1914 bh->b_bdev = inode->i_sb->s_bdev; 2030 } else {
2031 /*
2032 * unwritten already should have
2033 * blocknr assigned. Verify that
2034 */
2035 clear_buffer_unwritten(bh);
2036 BUG_ON(bh->b_blocknr != pblock);
2037 }
2038
1915 } else if (buffer_mapped(bh)) 2039 } else if (buffer_mapped(bh))
1916 BUG_ON(bh->b_blocknr != pblock); 2040 BUG_ON(bh->b_blocknr != pblock);
1917 2041
@@ -1990,51 +2114,6 @@ static void ext4_print_free_blocks(struct inode *inode)
1990 return; 2114 return;
1991} 2115}
1992 2116
1993#define EXT4_DELALLOC_RSVED 1
1994static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
1995 struct buffer_head *bh_result, int create)
1996{
1997 int ret;
1998 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1999 loff_t disksize = EXT4_I(inode)->i_disksize;
2000 handle_t *handle = NULL;
2001
2002 handle = ext4_journal_current_handle();
2003 BUG_ON(!handle);
2004 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2005 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2006 if (ret <= 0)
2007 return ret;
2008
2009 bh_result->b_size = (ret << inode->i_blkbits);
2010
2011 if (ext4_should_order_data(inode)) {
2012 int retval;
2013 retval = ext4_jbd2_file_inode(handle, inode);
2014 if (retval)
2015 /*
2016 * Failed to add inode for ordered mode. Don't
2017 * update file size
2018 */
2019 return retval;
2020 }
2021
2022 /*
2023 * Update on-disk size along with block allocation we don't
2024 * use 'extend_disksize' as size may change within already
2025 * allocated block -bzzz
2026 */
2027 disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
2028 if (disksize > i_size_read(inode))
2029 disksize = i_size_read(inode);
2030 if (disksize > EXT4_I(inode)->i_disksize) {
2031 ext4_update_i_disksize(inode, disksize);
2032 ret = ext4_mark_inode_dirty(handle, inode);
2033 return ret;
2034 }
2035 return 0;
2036}
2037
2038/* 2117/*
2039 * mpage_da_map_blocks - go through given space 2118 * mpage_da_map_blocks - go through given space
2040 * 2119 *
@@ -2045,29 +2124,57 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
2045 */ 2124 */
2046static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2125static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2047{ 2126{
2048 int err = 0; 2127 int err, blks, get_blocks_flags;
2049 struct buffer_head new; 2128 struct buffer_head new;
2050 sector_t next; 2129 sector_t next = mpd->b_blocknr;
2130 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2131 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
2132 handle_t *handle = NULL;
2051 2133
2052 /* 2134 /*
2053 * We consider only non-mapped and non-allocated blocks 2135 * We consider only non-mapped and non-allocated blocks
2054 */ 2136 */
2055 if ((mpd->b_state & (1 << BH_Mapped)) && 2137 if ((mpd->b_state & (1 << BH_Mapped)) &&
2056 !(mpd->b_state & (1 << BH_Delay))) 2138 !(mpd->b_state & (1 << BH_Delay)) &&
2139 !(mpd->b_state & (1 << BH_Unwritten)))
2057 return 0; 2140 return 0;
2058 new.b_state = mpd->b_state; 2141
2059 new.b_blocknr = 0;
2060 new.b_size = mpd->b_size;
2061 next = mpd->b_blocknr;
2062 /* 2142 /*
2063 * If we didn't accumulate anything 2143 * If we didn't accumulate anything to write simply return
2064 * to write simply return
2065 */ 2144 */
2066 if (!new.b_size) 2145 if (!mpd->b_size)
2067 return 0; 2146 return 0;
2068 2147
2069 err = ext4_da_get_block_write(mpd->inode, next, &new, 1); 2148 handle = ext4_journal_current_handle();
2070 if (err) { 2149 BUG_ON(!handle);
2150
2151 /*
2152 * Call ext4_get_blocks() to allocate any delayed allocation
2153 * blocks, or to convert an uninitialized extent to be
2154 * initialized (in the case where we have written into
2155 * one or more preallocated blocks).
2156 *
2157 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
2158 * indicate that we are on the delayed allocation path. This
2159 * affects functions in many different parts of the allocation
2160 * call path. This flag exists primarily because we don't
2161 * want to change *many* call functions, so ext4_get_blocks()
2162 * will set the magic i_delalloc_reserved_flag once the
2163 * inode's allocation semaphore is taken.
2164 *
2165 * If the blocks in questions were delalloc blocks, set
2166 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
2167 * variables are updated after the blocks have been allocated.
2168 */
2169 new.b_state = 0;
2170 get_blocks_flags = (EXT4_GET_BLOCKS_CREATE |
2171 EXT4_GET_BLOCKS_DELALLOC_RESERVE);
2172 if (mpd->b_state & (1 << BH_Delay))
2173 get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;
2174 blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
2175 &new, get_blocks_flags);
2176 if (blks < 0) {
2177 err = blks;
2071 /* 2178 /*
2072 * If get block returns with error we simply 2179 * If get block returns with error we simply
2073 * return. Later writepage will redirty the page and 2180 * return. Later writepage will redirty the page and
@@ -2100,12 +2207,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2100 if (err == -ENOSPC) { 2207 if (err == -ENOSPC) {
2101 ext4_print_free_blocks(mpd->inode); 2208 ext4_print_free_blocks(mpd->inode);
2102 } 2209 }
2103 /* invlaidate all the pages */ 2210 /* invalidate all the pages */
2104 ext4_da_block_invalidatepages(mpd, next, 2211 ext4_da_block_invalidatepages(mpd, next,
2105 mpd->b_size >> mpd->inode->i_blkbits); 2212 mpd->b_size >> mpd->inode->i_blkbits);
2106 return err; 2213 return err;
2107 } 2214 }
2108 BUG_ON(new.b_size == 0); 2215 BUG_ON(blks == 0);
2216
2217 new.b_size = (blks << mpd->inode->i_blkbits);
2109 2218
2110 if (buffer_new(&new)) 2219 if (buffer_new(&new))
2111 __unmap_underlying_blocks(mpd->inode, &new); 2220 __unmap_underlying_blocks(mpd->inode, &new);
@@ -2118,6 +2227,23 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2118 (mpd->b_state & (1 << BH_Unwritten))) 2227 (mpd->b_state & (1 << BH_Unwritten)))
2119 mpage_put_bnr_to_bhs(mpd, next, &new); 2228 mpage_put_bnr_to_bhs(mpd, next, &new);
2120 2229
2230 if (ext4_should_order_data(mpd->inode)) {
2231 err = ext4_jbd2_file_inode(handle, mpd->inode);
2232 if (err)
2233 return err;
2234 }
2235
2236 /*
2237 * Update on-disk size along with block allocation.
2238 */
2239 disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
2240 if (disksize > i_size_read(mpd->inode))
2241 disksize = i_size_read(mpd->inode);
2242 if (disksize > EXT4_I(mpd->inode)->i_disksize) {
2243 ext4_update_i_disksize(mpd->inode, disksize);
2244 return ext4_mark_inode_dirty(handle, mpd->inode);
2245 }
2246
2121 return 0; 2247 return 0;
2122} 2248}
2123 2249
@@ -2192,6 +2318,17 @@ flush_it:
2192 return; 2318 return;
2193} 2319}
2194 2320
2321static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
2322{
2323 /*
2324 * unmapped buffer is possible for holes.
2325 * delay buffer is possible with delayed allocation.
2326 * We also need to consider unwritten buffer as unmapped.
2327 */
2328 return (!buffer_mapped(bh) || buffer_delay(bh) ||
2329 buffer_unwritten(bh)) && buffer_dirty(bh);
2330}
2331
2195/* 2332/*
2196 * __mpage_da_writepage - finds extent of pages and blocks 2333 * __mpage_da_writepage - finds extent of pages and blocks
2197 * 2334 *
@@ -2276,8 +2413,7 @@ static int __mpage_da_writepage(struct page *page,
2276 * Otherwise we won't make progress 2413 * Otherwise we won't make progress
2277 * with the page in ext4_da_writepage 2414 * with the page in ext4_da_writepage
2278 */ 2415 */
2279 if (buffer_dirty(bh) && 2416 if (ext4_bh_unmapped_or_delay(NULL, bh)) {
2280 (!buffer_mapped(bh) || buffer_delay(bh))) {
2281 mpage_add_bh_to_extent(mpd, logical, 2417 mpage_add_bh_to_extent(mpd, logical,
2282 bh->b_size, 2418 bh->b_size,
2283 bh->b_state); 2419 bh->b_state);
@@ -2303,8 +2439,16 @@ static int __mpage_da_writepage(struct page *page,
2303} 2439}
2304 2440
2305/* 2441/*
2306 * this is a special callback for ->write_begin() only 2442 * This is a special get_blocks_t callback which is used by
2307 * it's intention is to return mapped block or reserve space 2443 * ext4_da_write_begin(). It will either return mapped block or
2444 * reserve space for a single block.
2445 *
2446 * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
2447 * We also have b_blocknr = -1 and b_bdev initialized properly
2448 *
2449 * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
2450 * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
2451 * initialized properly.
2308 */ 2452 */
2309static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2453static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2310 struct buffer_head *bh_result, int create) 2454 struct buffer_head *bh_result, int create)
@@ -2323,7 +2467,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2323 * preallocated blocks are unmapped but should treated 2467 * preallocated blocks are unmapped but should treated
2324 * the same as allocated blocks. 2468 * the same as allocated blocks.
2325 */ 2469 */
2326 ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); 2470 ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0);
2327 if ((ret == 0) && !buffer_delay(bh_result)) { 2471 if ((ret == 0) && !buffer_delay(bh_result)) {
2328 /* the block isn't (pre)allocated yet, let's reserve space */ 2472 /* the block isn't (pre)allocated yet, let's reserve space */
2329 /* 2473 /*
@@ -2340,40 +2484,53 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2340 set_buffer_delay(bh_result); 2484 set_buffer_delay(bh_result);
2341 } else if (ret > 0) { 2485 } else if (ret > 0) {
2342 bh_result->b_size = (ret << inode->i_blkbits); 2486 bh_result->b_size = (ret << inode->i_blkbits);
2343 /* 2487 if (buffer_unwritten(bh_result)) {
2344 * With sub-block writes into unwritten extents 2488 /* A delayed write to unwritten bh should
2345 * we also need to mark the buffer as new so that 2489 * be marked new and mapped. Mapped ensures
2346 * the unwritten parts of the buffer gets correctly zeroed. 2490 * that we don't do get_block multiple times
2347 */ 2491 * when we write to the same offset and new
2348 if (buffer_unwritten(bh_result)) 2492 * ensures that we do proper zero out for
2493 * partial write.
2494 */
2349 set_buffer_new(bh_result); 2495 set_buffer_new(bh_result);
2496 set_buffer_mapped(bh_result);
2497 }
2350 ret = 0; 2498 ret = 0;
2351 } 2499 }
2352 2500
2353 return ret; 2501 return ret;
2354} 2502}
2355 2503
2356static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) 2504/*
2357{ 2505 * This function is used as a standard get_block_t calback function
2358 /* 2506 * when there is no desire to allocate any blocks. It is used as a
2359 * unmapped buffer is possible for holes. 2507 * callback function for block_prepare_write(), nobh_writepage(), and
2360 * delay buffer is possible with delayed allocation 2508 * block_write_full_page(). These functions should only try to map a
2361 */ 2509 * single block at a time.
2362 return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); 2510 *
2363} 2511 * Since this function doesn't do block allocations even if the caller
2364 2512 * requests it by passing in create=1, it is critically important that
2365static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, 2513 * any caller checks to make sure that any buffer heads are returned
2514 * by this function are either all already mapped or marked for
2515 * delayed allocation before calling nobh_writepage() or
2516 * block_write_full_page(). Otherwise, b_blocknr could be left
2517 * unitialized, and the page write functions will be taken by
2518 * surprise.
2519 */
2520static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2366 struct buffer_head *bh_result, int create) 2521 struct buffer_head *bh_result, int create)
2367{ 2522{
2368 int ret = 0; 2523 int ret = 0;
2369 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 2524 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2370 2525
2526 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2527
2371 /* 2528 /*
2372 * we don't want to do block allocation in writepage 2529 * we don't want to do block allocation in writepage
2373 * so call get_block_wrap with create = 0 2530 * so call get_block_wrap with create = 0
2374 */ 2531 */
2375 ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, 2532 ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
2376 bh_result, 0, 0, 0); 2533 BUG_ON(create && ret == 0);
2377 if (ret > 0) { 2534 if (ret > 0) {
2378 bh_result->b_size = (ret << inode->i_blkbits); 2535 bh_result->b_size = (ret << inode->i_blkbits);
2379 ret = 0; 2536 ret = 0;
@@ -2382,10 +2539,11 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
2382} 2539}
2383 2540
2384/* 2541/*
2385 * get called vi ext4_da_writepages after taking page lock (have journal handle) 2542 * This function can get called via...
2386 * get called via journal_submit_inode_data_buffers (no journal handle) 2543 * - ext4_da_writepages after taking page lock (have journal handle)
2387 * get called via shrink_page_list via pdflush (no journal handle) 2544 * - journal_submit_inode_data_buffers (no journal handle)
2388 * or grab_page_cache when doing write_begin (have journal handle) 2545 * - shrink_page_list via pdflush (no journal handle)
2546 * - grab_page_cache when doing write_begin (have journal handle)
2389 */ 2547 */
2390static int ext4_da_writepage(struct page *page, 2548static int ext4_da_writepage(struct page *page,
2391 struct writeback_control *wbc) 2549 struct writeback_control *wbc)
@@ -2436,7 +2594,7 @@ static int ext4_da_writepage(struct page *page,
2436 * do block allocation here. 2594 * do block allocation here.
2437 */ 2595 */
2438 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 2596 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
2439 ext4_normal_get_block_write); 2597 noalloc_get_block_write);
2440 if (!ret) { 2598 if (!ret) {
2441 page_bufs = page_buffers(page); 2599 page_bufs = page_buffers(page);
2442 /* check whether all are mapped and non delay */ 2600 /* check whether all are mapped and non delay */
@@ -2461,11 +2619,10 @@ static int ext4_da_writepage(struct page *page,
2461 } 2619 }
2462 2620
2463 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2621 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
2464 ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); 2622 ret = nobh_writepage(page, noalloc_get_block_write, wbc);
2465 else 2623 else
2466 ret = block_write_full_page(page, 2624 ret = block_write_full_page(page, noalloc_get_block_write,
2467 ext4_normal_get_block_write, 2625 wbc);
2468 wbc);
2469 2626
2470 return ret; 2627 return ret;
2471} 2628}
@@ -2777,7 +2934,7 @@ retry:
2777 *pagep = page; 2934 *pagep = page;
2778 2935
2779 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 2936 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
2780 ext4_da_get_block_prep); 2937 ext4_da_get_block_prep);
2781 if (ret < 0) { 2938 if (ret < 0) {
2782 unlock_page(page); 2939 unlock_page(page);
2783 ext4_journal_stop(handle); 2940 ext4_journal_stop(handle);
@@ -2815,7 +2972,7 @@ static int ext4_da_should_update_i_disksize(struct page *page,
2815 for (i = 0; i < idx; i++) 2972 for (i = 0; i < idx; i++)
2816 bh = bh->b_this_page; 2973 bh = bh->b_this_page;
2817 2974
2818 if (!buffer_mapped(bh) || (buffer_delay(bh))) 2975 if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
2819 return 0; 2976 return 0;
2820 return 1; 2977 return 1;
2821} 2978}
@@ -3085,12 +3242,10 @@ static int __ext4_normal_writepage(struct page *page,
3085 struct inode *inode = page->mapping->host; 3242 struct inode *inode = page->mapping->host;
3086 3243
3087 if (test_opt(inode->i_sb, NOBH)) 3244 if (test_opt(inode->i_sb, NOBH))
3088 return nobh_writepage(page, 3245 return nobh_writepage(page, noalloc_get_block_write, wbc);
3089 ext4_normal_get_block_write, wbc);
3090 else 3246 else
3091 return block_write_full_page(page, 3247 return block_write_full_page(page, noalloc_get_block_write,
3092 ext4_normal_get_block_write, 3248 wbc);
3093 wbc);
3094} 3249}
3095 3250
3096static int ext4_normal_writepage(struct page *page, 3251static int ext4_normal_writepage(struct page *page,
@@ -3142,7 +3297,7 @@ static int __ext4_journalled_writepage(struct page *page,
3142 int err; 3297 int err;
3143 3298
3144 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 3299 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
3145 ext4_normal_get_block_write); 3300 noalloc_get_block_write);
3146 if (ret != 0) 3301 if (ret != 0)
3147 goto out_unlock; 3302 goto out_unlock;
3148 3303
@@ -3227,9 +3382,8 @@ static int ext4_journalled_writepage(struct page *page,
3227 * really know unless we go poke around in the buffer_heads. 3382 * really know unless we go poke around in the buffer_heads.
3228 * But block_write_full_page will do the right thing. 3383 * But block_write_full_page will do the right thing.
3229 */ 3384 */
3230 return block_write_full_page(page, 3385 return block_write_full_page(page, noalloc_get_block_write,
3231 ext4_normal_get_block_write, 3386 wbc);
3232 wbc);
3233 } 3387 }
3234no_write: 3388no_write:
3235 redirty_page_for_writepage(wbc, page); 3389 redirty_page_for_writepage(wbc, page);
@@ -3973,7 +4127,8 @@ void ext4_truncate(struct inode *inode)
3973 if (!ext4_can_truncate(inode)) 4127 if (!ext4_can_truncate(inode))
3974 return; 4128 return;
3975 4129
3976 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4130 if (ei->i_disksize && inode->i_size == 0 &&
4131 !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3977 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; 4132 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
3978 4133
3979 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 4134 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@ -4715,25 +4870,6 @@ int ext4_write_inode(struct inode *inode, int wait)
4715 return ext4_force_commit(inode->i_sb); 4870 return ext4_force_commit(inode->i_sb);
4716} 4871}
4717 4872
4718int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh)
4719{
4720 int err = 0;
4721
4722 mark_buffer_dirty(bh);
4723 if (inode && inode_needs_sync(inode)) {
4724 sync_dirty_buffer(bh);
4725 if (buffer_req(bh) && !buffer_uptodate(bh)) {
4726 ext4_error(inode->i_sb, __func__,
4727 "IO error syncing inode, "
4728 "inode=%lu, block=%llu",
4729 inode->i_ino,
4730 (unsigned long long)bh->b_blocknr);
4731 err = -EIO;
4732 }
4733 }
4734 return err;
4735}
4736
4737/* 4873/*
4738 * ext4_setattr() 4874 * ext4_setattr()
4739 * 4875 *
@@ -4930,7 +5066,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4930 */ 5066 */
4931int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5067int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4932{ 5068{
4933 int groups, gdpblocks; 5069 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
5070 int gdpblocks;
4934 int idxblocks; 5071 int idxblocks;
4935 int ret = 0; 5072 int ret = 0;
4936 5073
@@ -4957,8 +5094,8 @@ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4957 groups += nrblocks; 5094 groups += nrblocks;
4958 5095
4959 gdpblocks = groups; 5096 gdpblocks = groups;
4960 if (groups > EXT4_SB(inode->i_sb)->s_groups_count) 5097 if (groups > ngroups)
4961 groups = EXT4_SB(inode->i_sb)->s_groups_count; 5098 groups = ngroups;
4962 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) 5099 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
4963 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; 5100 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
4964 5101
@@ -4998,7 +5135,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
4998 * Calculate the journal credits for a chunk of data modification. 5135 * Calculate the journal credits for a chunk of data modification.
4999 * 5136 *
5000 * This is called from DIO, fallocate or whoever calling 5137 * This is called from DIO, fallocate or whoever calling
5001 * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks. 5138 * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks.
5002 * 5139 *
5003 * journal buffers for data blocks are not included here, as DIO 5140 * journal buffers for data blocks are not included here, as DIO
5004 * and fallocate do no need to journal data buffers. 5141 * and fallocate do no need to journal data buffers.
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f871677a7984..ed8482e22c0e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -372,24 +372,12 @@ static inline void mb_set_bit(int bit, void *addr)
372 ext4_set_bit(bit, addr); 372 ext4_set_bit(bit, addr);
373} 373}
374 374
375static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
376{
377 addr = mb_correct_addr_and_bit(&bit, addr);
378 ext4_set_bit_atomic(lock, bit, addr);
379}
380
381static inline void mb_clear_bit(int bit, void *addr) 375static inline void mb_clear_bit(int bit, void *addr)
382{ 376{
383 addr = mb_correct_addr_and_bit(&bit, addr); 377 addr = mb_correct_addr_and_bit(&bit, addr);
384 ext4_clear_bit(bit, addr); 378 ext4_clear_bit(bit, addr);
385} 379}
386 380
387static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
388{
389 addr = mb_correct_addr_and_bit(&bit, addr);
390 ext4_clear_bit_atomic(lock, bit, addr);
391}
392
393static inline int mb_find_next_zero_bit(void *addr, int max, int start) 381static inline int mb_find_next_zero_bit(void *addr, int max, int start)
394{ 382{
395 int fix = 0, ret, tmpmax; 383 int fix = 0, ret, tmpmax;
@@ -448,7 +436,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
448 436
449 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 437 if (unlikely(e4b->bd_info->bb_bitmap == NULL))
450 return; 438 return;
451 BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); 439 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
452 for (i = 0; i < count; i++) { 440 for (i = 0; i < count; i++) {
453 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { 441 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
454 ext4_fsblk_t blocknr; 442 ext4_fsblk_t blocknr;
@@ -472,7 +460,7 @@ static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
472 460
473 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 461 if (unlikely(e4b->bd_info->bb_bitmap == NULL))
474 return; 462 return;
475 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); 463 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
476 for (i = 0; i < count; i++) { 464 for (i = 0; i < count; i++) {
477 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); 465 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
478 mb_set_bit(first + i, e4b->bd_info->bb_bitmap); 466 mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
@@ -739,6 +727,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
739 727
740static int ext4_mb_init_cache(struct page *page, char *incore) 728static int ext4_mb_init_cache(struct page *page, char *incore)
741{ 729{
730 ext4_group_t ngroups;
742 int blocksize; 731 int blocksize;
743 int blocks_per_page; 732 int blocks_per_page;
744 int groups_per_page; 733 int groups_per_page;
@@ -757,6 +746,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
757 746
758 inode = page->mapping->host; 747 inode = page->mapping->host;
759 sb = inode->i_sb; 748 sb = inode->i_sb;
749 ngroups = ext4_get_groups_count(sb);
760 blocksize = 1 << inode->i_blkbits; 750 blocksize = 1 << inode->i_blkbits;
761 blocks_per_page = PAGE_CACHE_SIZE / blocksize; 751 blocks_per_page = PAGE_CACHE_SIZE / blocksize;
762 752
@@ -780,7 +770,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
780 for (i = 0; i < groups_per_page; i++) { 770 for (i = 0; i < groups_per_page; i++) {
781 struct ext4_group_desc *desc; 771 struct ext4_group_desc *desc;
782 772
783 if (first_group + i >= EXT4_SB(sb)->s_groups_count) 773 if (first_group + i >= ngroups)
784 break; 774 break;
785 775
786 err = -EIO; 776 err = -EIO;
@@ -801,17 +791,17 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
801 unlock_buffer(bh[i]); 791 unlock_buffer(bh[i]);
802 continue; 792 continue;
803 } 793 }
804 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 794 ext4_lock_group(sb, first_group + i);
805 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 795 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
806 ext4_init_block_bitmap(sb, bh[i], 796 ext4_init_block_bitmap(sb, bh[i],
807 first_group + i, desc); 797 first_group + i, desc);
808 set_bitmap_uptodate(bh[i]); 798 set_bitmap_uptodate(bh[i]);
809 set_buffer_uptodate(bh[i]); 799 set_buffer_uptodate(bh[i]);
810 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 800 ext4_unlock_group(sb, first_group + i);
811 unlock_buffer(bh[i]); 801 unlock_buffer(bh[i]);
812 continue; 802 continue;
813 } 803 }
814 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 804 ext4_unlock_group(sb, first_group + i);
815 if (buffer_uptodate(bh[i])) { 805 if (buffer_uptodate(bh[i])) {
816 /* 806 /*
817 * if not uninit if bh is uptodate, 807 * if not uninit if bh is uptodate,
@@ -852,7 +842,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
852 struct ext4_group_info *grinfo; 842 struct ext4_group_info *grinfo;
853 843
854 group = (first_block + i) >> 1; 844 group = (first_block + i) >> 1;
855 if (group >= EXT4_SB(sb)->s_groups_count) 845 if (group >= ngroups)
856 break; 846 break;
857 847
858 /* 848 /*
@@ -1078,7 +1068,7 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1078 return 0; 1068 return 0;
1079} 1069}
1080 1070
1081static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) 1071static void mb_clear_bits(void *bm, int cur, int len)
1082{ 1072{
1083 __u32 *addr; 1073 __u32 *addr;
1084 1074
@@ -1091,15 +1081,12 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
1091 cur += 32; 1081 cur += 32;
1092 continue; 1082 continue;
1093 } 1083 }
1094 if (lock) 1084 mb_clear_bit(cur, bm);
1095 mb_clear_bit_atomic(lock, cur, bm);
1096 else
1097 mb_clear_bit(cur, bm);
1098 cur++; 1085 cur++;
1099 } 1086 }
1100} 1087}
1101 1088
1102static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) 1089static void mb_set_bits(void *bm, int cur, int len)
1103{ 1090{
1104 __u32 *addr; 1091 __u32 *addr;
1105 1092
@@ -1112,10 +1099,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
1112 cur += 32; 1099 cur += 32;
1113 continue; 1100 continue;
1114 } 1101 }
1115 if (lock) 1102 mb_set_bit(cur, bm);
1116 mb_set_bit_atomic(lock, cur, bm);
1117 else
1118 mb_set_bit(cur, bm);
1119 cur++; 1103 cur++;
1120 } 1104 }
1121} 1105}
@@ -1131,7 +1115,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1131 struct super_block *sb = e4b->bd_sb; 1115 struct super_block *sb = e4b->bd_sb;
1132 1116
1133 BUG_ON(first + count > (sb->s_blocksize << 3)); 1117 BUG_ON(first + count > (sb->s_blocksize << 3));
1134 BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); 1118 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1135 mb_check_buddy(e4b); 1119 mb_check_buddy(e4b);
1136 mb_free_blocks_double(inode, e4b, first, count); 1120 mb_free_blocks_double(inode, e4b, first, count);
1137 1121
@@ -1212,7 +1196,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
1212 int ord; 1196 int ord;
1213 void *buddy; 1197 void *buddy;
1214 1198
1215 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); 1199 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1216 BUG_ON(ex == NULL); 1200 BUG_ON(ex == NULL);
1217 1201
1218 buddy = mb_find_buddy(e4b, order, &max); 1202 buddy = mb_find_buddy(e4b, order, &max);
@@ -1276,7 +1260,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1276 1260
1277 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); 1261 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
1278 BUG_ON(e4b->bd_group != ex->fe_group); 1262 BUG_ON(e4b->bd_group != ex->fe_group);
1279 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); 1263 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1280 mb_check_buddy(e4b); 1264 mb_check_buddy(e4b);
1281 mb_mark_used_double(e4b, start, len); 1265 mb_mark_used_double(e4b, start, len);
1282 1266
@@ -1330,8 +1314,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1330 e4b->bd_info->bb_counters[ord]++; 1314 e4b->bd_info->bb_counters[ord]++;
1331 } 1315 }
1332 1316
1333 mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group), 1317 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1334 EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1335 mb_check_buddy(e4b); 1318 mb_check_buddy(e4b);
1336 1319
1337 return ret; 1320 return ret;
@@ -1726,7 +1709,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1726 unsigned free, fragments; 1709 unsigned free, fragments;
1727 unsigned i, bits; 1710 unsigned i, bits;
1728 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); 1711 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
1729 struct ext4_group_desc *desc;
1730 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1712 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1731 1713
1732 BUG_ON(cr < 0 || cr >= 4); 1714 BUG_ON(cr < 0 || cr >= 4);
@@ -1742,10 +1724,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1742 switch (cr) { 1724 switch (cr) {
1743 case 0: 1725 case 0:
1744 BUG_ON(ac->ac_2order == 0); 1726 BUG_ON(ac->ac_2order == 0);
1745 /* If this group is uninitialized, skip it initially */
1746 desc = ext4_get_group_desc(ac->ac_sb, group, NULL);
1747 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1748 return 0;
1749 1727
1750 /* Avoid using the first bg of a flexgroup for data files */ 1728 /* Avoid using the first bg of a flexgroup for data files */
1751 if ((ac->ac_flags & EXT4_MB_HINT_DATA) && 1729 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
@@ -1788,6 +1766,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1788 int block, pnum; 1766 int block, pnum;
1789 int blocks_per_page; 1767 int blocks_per_page;
1790 int groups_per_page; 1768 int groups_per_page;
1769 ext4_group_t ngroups = ext4_get_groups_count(sb);
1791 ext4_group_t first_group; 1770 ext4_group_t first_group;
1792 struct ext4_group_info *grp; 1771 struct ext4_group_info *grp;
1793 1772
@@ -1807,7 +1786,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1807 /* read all groups the page covers into the cache */ 1786 /* read all groups the page covers into the cache */
1808 for (i = 0; i < groups_per_page; i++) { 1787 for (i = 0; i < groups_per_page; i++) {
1809 1788
1810 if ((first_group + i) >= EXT4_SB(sb)->s_groups_count) 1789 if ((first_group + i) >= ngroups)
1811 break; 1790 break;
1812 grp = ext4_get_group_info(sb, first_group + i); 1791 grp = ext4_get_group_info(sb, first_group + i);
1813 /* take all groups write allocation 1792 /* take all groups write allocation
@@ -1945,8 +1924,7 @@ err:
1945static noinline_for_stack int 1924static noinline_for_stack int
1946ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 1925ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1947{ 1926{
1948 ext4_group_t group; 1927 ext4_group_t ngroups, group, i;
1949 ext4_group_t i;
1950 int cr; 1928 int cr;
1951 int err = 0; 1929 int err = 0;
1952 int bsbits; 1930 int bsbits;
@@ -1957,6 +1935,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1957 1935
1958 sb = ac->ac_sb; 1936 sb = ac->ac_sb;
1959 sbi = EXT4_SB(sb); 1937 sbi = EXT4_SB(sb);
1938 ngroups = ext4_get_groups_count(sb);
1960 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 1939 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1961 1940
1962 /* first, try the goal */ 1941 /* first, try the goal */
@@ -2017,11 +1996,11 @@ repeat:
2017 */ 1996 */
2018 group = ac->ac_g_ex.fe_group; 1997 group = ac->ac_g_ex.fe_group;
2019 1998
2020 for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { 1999 for (i = 0; i < ngroups; group++, i++) {
2021 struct ext4_group_info *grp; 2000 struct ext4_group_info *grp;
2022 struct ext4_group_desc *desc; 2001 struct ext4_group_desc *desc;
2023 2002
2024 if (group == EXT4_SB(sb)->s_groups_count) 2003 if (group == ngroups)
2025 group = 0; 2004 group = 0;
2026 2005
2027 /* quick check to skip empty groups */ 2006 /* quick check to skip empty groups */
@@ -2064,9 +2043,7 @@ repeat:
2064 2043
2065 ac->ac_groups_scanned++; 2044 ac->ac_groups_scanned++;
2066 desc = ext4_get_group_desc(sb, group, NULL); 2045 desc = ext4_get_group_desc(sb, group, NULL);
2067 if (cr == 0 || (desc->bg_flags & 2046 if (cr == 0)
2068 cpu_to_le16(EXT4_BG_BLOCK_UNINIT) &&
2069 ac->ac_2order != 0))
2070 ext4_mb_simple_scan_group(ac, &e4b); 2047 ext4_mb_simple_scan_group(ac, &e4b);
2071 else if (cr == 1 && 2048 else if (cr == 1 &&
2072 ac->ac_g_ex.fe_len == sbi->s_stripe) 2049 ac->ac_g_ex.fe_len == sbi->s_stripe)
@@ -2315,12 +2292,10 @@ static struct file_operations ext4_mb_seq_history_fops = {
2315static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) 2292static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2316{ 2293{
2317 struct super_block *sb = seq->private; 2294 struct super_block *sb = seq->private;
2318 struct ext4_sb_info *sbi = EXT4_SB(sb);
2319 ext4_group_t group; 2295 ext4_group_t group;
2320 2296
2321 if (*pos < 0 || *pos >= sbi->s_groups_count) 2297 if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2322 return NULL; 2298 return NULL;
2323
2324 group = *pos + 1; 2299 group = *pos + 1;
2325 return (void *) ((unsigned long) group); 2300 return (void *) ((unsigned long) group);
2326} 2301}
@@ -2328,11 +2303,10 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2328static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) 2303static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2329{ 2304{
2330 struct super_block *sb = seq->private; 2305 struct super_block *sb = seq->private;
2331 struct ext4_sb_info *sbi = EXT4_SB(sb);
2332 ext4_group_t group; 2306 ext4_group_t group;
2333 2307
2334 ++*pos; 2308 ++*pos;
2335 if (*pos < 0 || *pos >= sbi->s_groups_count) 2309 if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2336 return NULL; 2310 return NULL;
2337 group = *pos + 1; 2311 group = *pos + 1;
2338 return (void *) ((unsigned long) group); 2312 return (void *) ((unsigned long) group);
@@ -2420,7 +2394,8 @@ static void ext4_mb_history_release(struct super_block *sb)
2420 2394
2421 if (sbi->s_proc != NULL) { 2395 if (sbi->s_proc != NULL) {
2422 remove_proc_entry("mb_groups", sbi->s_proc); 2396 remove_proc_entry("mb_groups", sbi->s_proc);
2423 remove_proc_entry("mb_history", sbi->s_proc); 2397 if (sbi->s_mb_history_max)
2398 remove_proc_entry("mb_history", sbi->s_proc);
2424 } 2399 }
2425 kfree(sbi->s_mb_history); 2400 kfree(sbi->s_mb_history);
2426} 2401}
@@ -2431,17 +2406,17 @@ static void ext4_mb_history_init(struct super_block *sb)
2431 int i; 2406 int i;
2432 2407
2433 if (sbi->s_proc != NULL) { 2408 if (sbi->s_proc != NULL) {
2434 proc_create_data("mb_history", S_IRUGO, sbi->s_proc, 2409 if (sbi->s_mb_history_max)
2435 &ext4_mb_seq_history_fops, sb); 2410 proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
2411 &ext4_mb_seq_history_fops, sb);
2436 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, 2412 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2437 &ext4_mb_seq_groups_fops, sb); 2413 &ext4_mb_seq_groups_fops, sb);
2438 } 2414 }
2439 2415
2440 sbi->s_mb_history_max = 1000;
2441 sbi->s_mb_history_cur = 0; 2416 sbi->s_mb_history_cur = 0;
2442 spin_lock_init(&sbi->s_mb_history_lock); 2417 spin_lock_init(&sbi->s_mb_history_lock);
2443 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); 2418 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
2444 sbi->s_mb_history = kzalloc(i, GFP_KERNEL); 2419 sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
2445 /* if we can't allocate history, then we simple won't use it */ 2420 /* if we can't allocate history, then we simple won't use it */
2446} 2421}
2447 2422
@@ -2451,7 +2426,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
2451 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 2426 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2452 struct ext4_mb_history h; 2427 struct ext4_mb_history h;
2453 2428
2454 if (unlikely(sbi->s_mb_history == NULL)) 2429 if (sbi->s_mb_history == NULL)
2455 return; 2430 return;
2456 2431
2457 if (!(ac->ac_op & sbi->s_mb_history_filter)) 2432 if (!(ac->ac_op & sbi->s_mb_history_filter))
@@ -2587,6 +2562,7 @@ void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
2587 2562
2588static int ext4_mb_init_backend(struct super_block *sb) 2563static int ext4_mb_init_backend(struct super_block *sb)
2589{ 2564{
2565 ext4_group_t ngroups = ext4_get_groups_count(sb);
2590 ext4_group_t i; 2566 ext4_group_t i;
2591 int metalen; 2567 int metalen;
2592 struct ext4_sb_info *sbi = EXT4_SB(sb); 2568 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2598,7 +2574,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2598 struct ext4_group_desc *desc; 2574 struct ext4_group_desc *desc;
2599 2575
2600 /* This is the number of blocks used by GDT */ 2576 /* This is the number of blocks used by GDT */
2601 num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 2577 num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
2602 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); 2578 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
2603 2579
2604 /* 2580 /*
@@ -2644,7 +2620,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2644 for (i = 0; i < num_meta_group_infos; i++) { 2620 for (i = 0; i < num_meta_group_infos; i++) {
2645 if ((i + 1) == num_meta_group_infos) 2621 if ((i + 1) == num_meta_group_infos)
2646 metalen = sizeof(*meta_group_info) * 2622 metalen = sizeof(*meta_group_info) *
2647 (sbi->s_groups_count - 2623 (ngroups -
2648 (i << EXT4_DESC_PER_BLOCK_BITS(sb))); 2624 (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
2649 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2625 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2650 if (meta_group_info == NULL) { 2626 if (meta_group_info == NULL) {
@@ -2655,7 +2631,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2655 sbi->s_group_info[i] = meta_group_info; 2631 sbi->s_group_info[i] = meta_group_info;
2656 } 2632 }
2657 2633
2658 for (i = 0; i < sbi->s_groups_count; i++) { 2634 for (i = 0; i < ngroups; i++) {
2659 desc = ext4_get_group_desc(sb, i, NULL); 2635 desc = ext4_get_group_desc(sb, i, NULL);
2660 if (desc == NULL) { 2636 if (desc == NULL) {
2661 printk(KERN_ERR 2637 printk(KERN_ERR
@@ -2761,7 +2737,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2761 return 0; 2737 return 0;
2762} 2738}
2763 2739
2764/* need to called with ext4 group lock (ext4_lock_group) */ 2740/* need to called with the ext4 group lock held */
2765static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) 2741static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2766{ 2742{
2767 struct ext4_prealloc_space *pa; 2743 struct ext4_prealloc_space *pa;
@@ -2781,13 +2757,14 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2781 2757
2782int ext4_mb_release(struct super_block *sb) 2758int ext4_mb_release(struct super_block *sb)
2783{ 2759{
2760 ext4_group_t ngroups = ext4_get_groups_count(sb);
2784 ext4_group_t i; 2761 ext4_group_t i;
2785 int num_meta_group_infos; 2762 int num_meta_group_infos;
2786 struct ext4_group_info *grinfo; 2763 struct ext4_group_info *grinfo;
2787 struct ext4_sb_info *sbi = EXT4_SB(sb); 2764 struct ext4_sb_info *sbi = EXT4_SB(sb);
2788 2765
2789 if (sbi->s_group_info) { 2766 if (sbi->s_group_info) {
2790 for (i = 0; i < sbi->s_groups_count; i++) { 2767 for (i = 0; i < ngroups; i++) {
2791 grinfo = ext4_get_group_info(sb, i); 2768 grinfo = ext4_get_group_info(sb, i);
2792#ifdef DOUBLE_CHECK 2769#ifdef DOUBLE_CHECK
2793 kfree(grinfo->bb_bitmap); 2770 kfree(grinfo->bb_bitmap);
@@ -2797,7 +2774,7 @@ int ext4_mb_release(struct super_block *sb)
2797 ext4_unlock_group(sb, i); 2774 ext4_unlock_group(sb, i);
2798 kfree(grinfo); 2775 kfree(grinfo);
2799 } 2776 }
2800 num_meta_group_infos = (sbi->s_groups_count + 2777 num_meta_group_infos = (ngroups +
2801 EXT4_DESC_PER_BLOCK(sb) - 1) >> 2778 EXT4_DESC_PER_BLOCK(sb) - 1) >>
2802 EXT4_DESC_PER_BLOCK_BITS(sb); 2779 EXT4_DESC_PER_BLOCK_BITS(sb);
2803 for (i = 0; i < num_meta_group_infos; i++) 2780 for (i = 0; i < num_meta_group_infos; i++)
@@ -2984,27 +2961,25 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2984 + le32_to_cpu(es->s_first_data_block); 2961 + le32_to_cpu(es->s_first_data_block);
2985 2962
2986 len = ac->ac_b_ex.fe_len; 2963 len = ac->ac_b_ex.fe_len;
2987 if (in_range(ext4_block_bitmap(sb, gdp), block, len) || 2964 if (!ext4_data_block_valid(sbi, block, len)) {
2988 in_range(ext4_inode_bitmap(sb, gdp), block, len) ||
2989 in_range(block, ext4_inode_table(sb, gdp),
2990 EXT4_SB(sb)->s_itb_per_group) ||
2991 in_range(block + len - 1, ext4_inode_table(sb, gdp),
2992 EXT4_SB(sb)->s_itb_per_group)) {
2993 ext4_error(sb, __func__, 2965 ext4_error(sb, __func__,
2994 "Allocating block %llu in system zone of %d group\n", 2966 "Allocating blocks %llu-%llu which overlap "
2995 block, ac->ac_b_ex.fe_group); 2967 "fs metadata\n", block, block+len);
2996 /* File system mounted not to panic on error 2968 /* File system mounted not to panic on error
2997 * Fix the bitmap and repeat the block allocation 2969 * Fix the bitmap and repeat the block allocation
2998 * We leak some of the blocks here. 2970 * We leak some of the blocks here.
2999 */ 2971 */
3000 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), 2972 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3001 bitmap_bh->b_data, ac->ac_b_ex.fe_start, 2973 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
3002 ac->ac_b_ex.fe_len); 2974 ac->ac_b_ex.fe_len);
2975 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3003 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 2976 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
3004 if (!err) 2977 if (!err)
3005 err = -EAGAIN; 2978 err = -EAGAIN;
3006 goto out_err; 2979 goto out_err;
3007 } 2980 }
2981
2982 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3008#ifdef AGGRESSIVE_CHECK 2983#ifdef AGGRESSIVE_CHECK
3009 { 2984 {
3010 int i; 2985 int i;
@@ -3014,9 +2989,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
3014 } 2989 }
3015 } 2990 }
3016#endif 2991#endif
3017 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 2992 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len);
3018 mb_set_bits(NULL, bitmap_bh->b_data,
3019 ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
3020 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 2993 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
3021 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 2994 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
3022 ext4_free_blks_set(sb, gdp, 2995 ext4_free_blks_set(sb, gdp,
@@ -3026,7 +2999,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
3026 len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; 2999 len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
3027 ext4_free_blks_set(sb, gdp, len); 3000 ext4_free_blks_set(sb, gdp, len);
3028 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 3001 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
3029 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 3002
3003 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3030 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); 3004 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
3031 /* 3005 /*
3032 * Now reduce the dirty block count also. Should not go negative 3006 * Now reduce the dirty block count also. Should not go negative
@@ -3459,7 +3433,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3459 * the function goes through all block freed in the group 3433 * the function goes through all block freed in the group
3460 * but not yet committed and marks them used in in-core bitmap. 3434 * but not yet committed and marks them used in in-core bitmap.
3461 * buddy must be generated from this bitmap 3435 * buddy must be generated from this bitmap
3462 * Need to be called with ext4 group lock (ext4_lock_group) 3436 * Need to be called with the ext4 group lock held
3463 */ 3437 */
3464static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 3438static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3465 ext4_group_t group) 3439 ext4_group_t group)
@@ -3473,9 +3447,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3473 3447
3474 while (n) { 3448 while (n) {
3475 entry = rb_entry(n, struct ext4_free_data, node); 3449 entry = rb_entry(n, struct ext4_free_data, node);
3476 mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), 3450 mb_set_bits(bitmap, entry->start_blk, entry->count);
3477 bitmap, entry->start_blk,
3478 entry->count);
3479 n = rb_next(n); 3451 n = rb_next(n);
3480 } 3452 }
3481 return; 3453 return;
@@ -3484,7 +3456,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3484/* 3456/*
3485 * the function goes through all preallocation in this group and marks them 3457 * the function goes through all preallocation in this group and marks them
3486 * used in in-core bitmap. buddy must be generated from this bitmap 3458 * used in in-core bitmap. buddy must be generated from this bitmap
3487 * Need to be called with ext4 group lock (ext4_lock_group) 3459 * Need to be called with ext4 group lock held
3488 */ 3460 */
3489static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 3461static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3490 ext4_group_t group) 3462 ext4_group_t group)
@@ -3516,8 +3488,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3516 if (unlikely(len == 0)) 3488 if (unlikely(len == 0))
3517 continue; 3489 continue;
3518 BUG_ON(groupnr != group); 3490 BUG_ON(groupnr != group);
3519 mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), 3491 mb_set_bits(bitmap, start, len);
3520 bitmap, start, len);
3521 preallocated += len; 3492 preallocated += len;
3522 count++; 3493 count++;
3523 } 3494 }
@@ -4121,7 +4092,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
4121static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 4092static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4122{ 4093{
4123 struct super_block *sb = ac->ac_sb; 4094 struct super_block *sb = ac->ac_sb;
4124 ext4_group_t i; 4095 ext4_group_t ngroups, i;
4125 4096
4126 printk(KERN_ERR "EXT4-fs: Can't allocate:" 4097 printk(KERN_ERR "EXT4-fs: Can't allocate:"
4127 " Allocation context details:\n"); 4098 " Allocation context details:\n");
@@ -4145,7 +4116,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4145 printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, 4116 printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
4146 ac->ac_found); 4117 ac->ac_found);
4147 printk(KERN_ERR "EXT4-fs: groups: \n"); 4118 printk(KERN_ERR "EXT4-fs: groups: \n");
4148 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 4119 ngroups = ext4_get_groups_count(sb);
4120 for (i = 0; i < ngroups; i++) {
4149 struct ext4_group_info *grp = ext4_get_group_info(sb, i); 4121 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
4150 struct ext4_prealloc_space *pa; 4122 struct ext4_prealloc_space *pa;
4151 ext4_grpblk_t start; 4123 ext4_grpblk_t start;
@@ -4469,13 +4441,13 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4469 4441
4470static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) 4442static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4471{ 4443{
4472 ext4_group_t i; 4444 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
4473 int ret; 4445 int ret;
4474 int freed = 0; 4446 int freed = 0;
4475 4447
4476 trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d", 4448 trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d",
4477 sb->s_id, needed); 4449 sb->s_id, needed);
4478 for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { 4450 for (i = 0; i < ngroups && needed > 0; i++) {
4479 ret = ext4_mb_discard_group_preallocations(sb, i, needed); 4451 ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4480 freed += ret; 4452 freed += ret;
4481 needed -= ret; 4453 needed -= ret;
@@ -4859,29 +4831,25 @@ do_more:
4859 new_entry->group = block_group; 4831 new_entry->group = block_group;
4860 new_entry->count = count; 4832 new_entry->count = count;
4861 new_entry->t_tid = handle->h_transaction->t_tid; 4833 new_entry->t_tid = handle->h_transaction->t_tid;
4834
4862 ext4_lock_group(sb, block_group); 4835 ext4_lock_group(sb, block_group);
4863 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, 4836 mb_clear_bits(bitmap_bh->b_data, bit, count);
4864 bit, count);
4865 ext4_mb_free_metadata(handle, &e4b, new_entry); 4837 ext4_mb_free_metadata(handle, &e4b, new_entry);
4866 ext4_unlock_group(sb, block_group);
4867 } else { 4838 } else {
4868 ext4_lock_group(sb, block_group);
4869 /* need to update group_info->bb_free and bitmap 4839 /* need to update group_info->bb_free and bitmap
4870 * with group lock held. generate_buddy look at 4840 * with group lock held. generate_buddy look at
4871 * them with group lock_held 4841 * them with group lock_held
4872 */ 4842 */
4873 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, 4843 ext4_lock_group(sb, block_group);
4874 bit, count); 4844 mb_clear_bits(bitmap_bh->b_data, bit, count);
4875 mb_free_blocks(inode, &e4b, bit, count); 4845 mb_free_blocks(inode, &e4b, bit, count);
4876 ext4_mb_return_to_preallocation(inode, &e4b, block, count); 4846 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4877 ext4_unlock_group(sb, block_group);
4878 } 4847 }
4879 4848
4880 spin_lock(sb_bgl_lock(sbi, block_group));
4881 ret = ext4_free_blks_count(sb, gdp) + count; 4849 ret = ext4_free_blks_count(sb, gdp) + count;
4882 ext4_free_blks_set(sb, gdp, ret); 4850 ext4_free_blks_set(sb, gdp, ret);
4883 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); 4851 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
4884 spin_unlock(sb_bgl_lock(sbi, block_group)); 4852 ext4_unlock_group(sb, block_group);
4885 percpu_counter_add(&sbi->s_freeblocks_counter, count); 4853 percpu_counter_add(&sbi->s_freeblocks_counter, count);
4886 4854
4887 if (sbi->s_log_groups_per_flex) { 4855 if (sbi->s_log_groups_per_flex) {
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index dd9e6cd5f6cf..75e34f69215b 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -23,7 +23,6 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include "ext4_jbd2.h" 24#include "ext4_jbd2.h"
25#include "ext4.h" 25#include "ext4.h"
26#include "group.h"
27 26
28/* 27/*
29 * with AGGRESSIVE_CHECK allocator runs consistency checks over 28 * with AGGRESSIVE_CHECK allocator runs consistency checks over
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 22098e1cd085..07eb6649e4fa 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -37,7 +37,6 @@
37#include "ext4.h" 37#include "ext4.h"
38#include "ext4_jbd2.h" 38#include "ext4_jbd2.h"
39 39
40#include "namei.h"
41#include "xattr.h" 40#include "xattr.h"
42#include "acl.h" 41#include "acl.h"
43 42
@@ -750,7 +749,7 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
750 ext4fs_dirhash(de->name, de->name_len, &h); 749 ext4fs_dirhash(de->name, de->name_len, &h);
751 map_tail--; 750 map_tail--;
752 map_tail->hash = h.hash; 751 map_tail->hash = h.hash;
753 map_tail->offs = (u16) ((char *) de - base); 752 map_tail->offs = ((char *) de - base)>>2;
754 map_tail->size = le16_to_cpu(de->rec_len); 753 map_tail->size = le16_to_cpu(de->rec_len);
755 count++; 754 count++;
756 cond_resched(); 755 cond_resched();
@@ -1148,7 +1147,8 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
1148 unsigned rec_len = 0; 1147 unsigned rec_len = 0;
1149 1148
1150 while (count--) { 1149 while (count--) {
1151 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + map->offs); 1150 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
1151 (from + (map->offs<<2));
1152 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1152 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1153 memcpy (to, de, rec_len); 1153 memcpy (to, de, rec_len);
1154 ((struct ext4_dir_entry_2 *) to)->rec_len = 1154 ((struct ext4_dir_entry_2 *) to)->rec_len =
@@ -1997,7 +1997,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
1997 if (!ext4_handle_valid(handle)) 1997 if (!ext4_handle_valid(handle))
1998 return 0; 1998 return 0;
1999 1999
2000 lock_super(sb); 2000 mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
2001 if (!list_empty(&EXT4_I(inode)->i_orphan)) 2001 if (!list_empty(&EXT4_I(inode)->i_orphan))
2002 goto out_unlock; 2002 goto out_unlock;
2003 2003
@@ -2006,9 +2006,13 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
2006 2006
2007 /* @@@ FIXME: Observation from aviro: 2007 /* @@@ FIXME: Observation from aviro:
2008 * I think I can trigger J_ASSERT in ext4_orphan_add(). We block 2008 * I think I can trigger J_ASSERT in ext4_orphan_add(). We block
2009 * here (on lock_super()), so race with ext4_link() which might bump 2009 * here (on s_orphan_lock), so race with ext4_link() which might bump
2010 * ->i_nlink. For, say it, character device. Not a regular file, 2010 * ->i_nlink. For, say it, character device. Not a regular file,
2011 * not a directory, not a symlink and ->i_nlink > 0. 2011 * not a directory, not a symlink and ->i_nlink > 0.
2012 *
2013 * tytso, 4/25/2009: I'm not sure how that could happen;
2014 * shouldn't the fs core protect us from these sort of
2015 * unlink()/link() races?
2012 */ 2016 */
2013 J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 2017 J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2014 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); 2018 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
@@ -2045,7 +2049,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
2045 jbd_debug(4, "orphan inode %lu will point to %d\n", 2049 jbd_debug(4, "orphan inode %lu will point to %d\n",
2046 inode->i_ino, NEXT_ORPHAN(inode)); 2050 inode->i_ino, NEXT_ORPHAN(inode));
2047out_unlock: 2051out_unlock:
2048 unlock_super(sb); 2052 mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
2049 ext4_std_error(inode->i_sb, err); 2053 ext4_std_error(inode->i_sb, err);
2050 return err; 2054 return err;
2051} 2055}
@@ -2066,11 +2070,9 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2066 if (!ext4_handle_valid(handle)) 2070 if (!ext4_handle_valid(handle))
2067 return 0; 2071 return 0;
2068 2072
2069 lock_super(inode->i_sb); 2073 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
2070 if (list_empty(&ei->i_orphan)) { 2074 if (list_empty(&ei->i_orphan))
2071 unlock_super(inode->i_sb); 2075 goto out;
2072 return 0;
2073 }
2074 2076
2075 ino_next = NEXT_ORPHAN(inode); 2077 ino_next = NEXT_ORPHAN(inode);
2076 prev = ei->i_orphan.prev; 2078 prev = ei->i_orphan.prev;
@@ -2120,7 +2122,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2120out_err: 2122out_err:
2121 ext4_std_error(inode->i_sb, err); 2123 ext4_std_error(inode->i_sb, err);
2122out: 2124out:
2123 unlock_super(inode->i_sb); 2125 mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
2124 return err; 2126 return err;
2125 2127
2126out_brelse: 2128out_brelse:
@@ -2533,6 +2535,7 @@ const struct inode_operations ext4_dir_inode_operations = {
2533 .removexattr = generic_removexattr, 2535 .removexattr = generic_removexattr,
2534#endif 2536#endif
2535 .permission = ext4_permission, 2537 .permission = ext4_permission,
2538 .fiemap = ext4_fiemap,
2536}; 2539};
2537 2540
2538const struct inode_operations ext4_special_inode_operations = { 2541const struct inode_operations ext4_special_inode_operations = {
diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h
deleted file mode 100644
index 5e4dfff36a00..000000000000
--- a/fs/ext4/namei.h
+++ /dev/null
@@ -1,8 +0,0 @@
1/* linux/fs/ext4/namei.h
2 *
3 * Copyright (C) 2005 Simtec Electronics
4 * Ben Dooks <ben@simtec.co.uk>
5 *
6*/
7
8extern struct dentry *ext4_get_parent(struct dentry *child);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 546c7dd869e1..27eb289eea37 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -15,7 +15,6 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17#include "ext4_jbd2.h" 17#include "ext4_jbd2.h"
18#include "group.h"
19 18
20#define outside(b, first, last) ((b) < (first) || (b) >= (last)) 19#define outside(b, first, last) ((b) < (first) || (b) >= (last))
21#define inside(b, first, last) ((b) >= (first) && (b) < (last)) 20#define inside(b, first, last) ((b) >= (first) && (b) < (last))
@@ -193,7 +192,7 @@ static int setup_new_group_blocks(struct super_block *sb,
193 if (IS_ERR(handle)) 192 if (IS_ERR(handle))
194 return PTR_ERR(handle); 193 return PTR_ERR(handle);
195 194
196 lock_super(sb); 195 mutex_lock(&sbi->s_resize_lock);
197 if (input->group != sbi->s_groups_count) { 196 if (input->group != sbi->s_groups_count) {
198 err = -EBUSY; 197 err = -EBUSY;
199 goto exit_journal; 198 goto exit_journal;
@@ -302,7 +301,7 @@ exit_bh:
302 brelse(bh); 301 brelse(bh);
303 302
304exit_journal: 303exit_journal:
305 unlock_super(sb); 304 mutex_unlock(&sbi->s_resize_lock);
306 if ((err2 = ext4_journal_stop(handle)) && !err) 305 if ((err2 = ext4_journal_stop(handle)) && !err)
307 err = err2; 306 err = err2;
308 307
@@ -643,11 +642,12 @@ exit_free:
643 * important part is that the new block and inode counts are in the backup 642 * important part is that the new block and inode counts are in the backup
644 * superblocks, and the location of the new group metadata in the GDT backups. 643 * superblocks, and the location of the new group metadata in the GDT backups.
645 * 644 *
646 * We do not need lock_super() for this, because these blocks are not 645 * We do not need take the s_resize_lock for this, because these
647 * otherwise touched by the filesystem code when it is mounted. We don't 646 * blocks are not otherwise touched by the filesystem code when it is
648 * need to worry about last changing from sbi->s_groups_count, because the 647 * mounted. We don't need to worry about last changing from
649 * worst that can happen is that we do not copy the full number of backups 648 * sbi->s_groups_count, because the worst that can happen is that we
650 * at this time. The resize which changed s_groups_count will backup again. 649 * do not copy the full number of backups at this time. The resize
650 * which changed s_groups_count will backup again.
651 */ 651 */
652static void update_backups(struct super_block *sb, 652static void update_backups(struct super_block *sb,
653 int blk_off, char *data, int size) 653 int blk_off, char *data, int size)
@@ -809,7 +809,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
809 goto exit_put; 809 goto exit_put;
810 } 810 }
811 811
812 lock_super(sb); 812 mutex_lock(&sbi->s_resize_lock);
813 if (input->group != sbi->s_groups_count) { 813 if (input->group != sbi->s_groups_count) {
814 ext4_warning(sb, __func__, 814 ext4_warning(sb, __func__,
815 "multiple resizers run on filesystem!"); 815 "multiple resizers run on filesystem!");
@@ -840,7 +840,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
840 /* 840 /*
841 * OK, now we've set up the new group. Time to make it active. 841 * OK, now we've set up the new group. Time to make it active.
842 * 842 *
843 * Current kernels don't lock all allocations via lock_super(), 843 * We do not lock all allocations via s_resize_lock
844 * so we have to be safe wrt. concurrent accesses the group 844 * so we have to be safe wrt. concurrent accesses the group
845 * data. So we need to be careful to set all of the relevant 845 * data. So we need to be careful to set all of the relevant
846 * group descriptor data etc. *before* we enable the group. 846 * group descriptor data etc. *before* we enable the group.
@@ -900,12 +900,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
900 * 900 *
901 * The precise rules we use are: 901 * The precise rules we use are:
902 * 902 *
903 * * Writers of s_groups_count *must* hold lock_super 903 * * Writers of s_groups_count *must* hold s_resize_lock
904 * AND 904 * AND
905 * * Writers must perform a smp_wmb() after updating all dependent 905 * * Writers must perform a smp_wmb() after updating all dependent
906 * data and before modifying the groups count 906 * data and before modifying the groups count
907 * 907 *
908 * * Readers must hold lock_super() over the access 908 * * Readers must hold s_resize_lock over the access
909 * OR 909 * OR
910 * * Readers must perform an smp_rmb() after reading the groups count 910 * * Readers must perform an smp_rmb() after reading the groups count
911 * and before reading any dependent data. 911 * and before reading any dependent data.
@@ -948,7 +948,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
948 sb->s_dirt = 1; 948 sb->s_dirt = 1;
949 949
950exit_journal: 950exit_journal:
951 unlock_super(sb); 951 mutex_unlock(&sbi->s_resize_lock);
952 if ((err2 = ext4_journal_stop(handle)) && !err) 952 if ((err2 = ext4_journal_stop(handle)) && !err)
953 err = err2; 953 err = err2;
954 if (!err) { 954 if (!err) {
@@ -986,7 +986,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
986 986
987 /* We don't need to worry about locking wrt other resizers just 987 /* We don't need to worry about locking wrt other resizers just
988 * yet: we're going to revalidate es->s_blocks_count after 988 * yet: we're going to revalidate es->s_blocks_count after
989 * taking lock_super() below. */ 989 * taking the s_resize_lock below. */
990 o_blocks_count = ext4_blocks_count(es); 990 o_blocks_count = ext4_blocks_count(es);
991 o_groups_count = EXT4_SB(sb)->s_groups_count; 991 o_groups_count = EXT4_SB(sb)->s_groups_count;
992 992
@@ -1056,11 +1056,11 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1056 goto exit_put; 1056 goto exit_put;
1057 } 1057 }
1058 1058
1059 lock_super(sb); 1059 mutex_lock(&EXT4_SB(sb)->s_resize_lock);
1060 if (o_blocks_count != ext4_blocks_count(es)) { 1060 if (o_blocks_count != ext4_blocks_count(es)) {
1061 ext4_warning(sb, __func__, 1061 ext4_warning(sb, __func__,
1062 "multiple resizers run on filesystem!"); 1062 "multiple resizers run on filesystem!");
1063 unlock_super(sb); 1063 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1064 ext4_journal_stop(handle); 1064 ext4_journal_stop(handle);
1065 err = -EBUSY; 1065 err = -EBUSY;
1066 goto exit_put; 1066 goto exit_put;
@@ -1070,14 +1070,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1070 EXT4_SB(sb)->s_sbh))) { 1070 EXT4_SB(sb)->s_sbh))) {
1071 ext4_warning(sb, __func__, 1071 ext4_warning(sb, __func__,
1072 "error %d on journal write access", err); 1072 "error %d on journal write access", err);
1073 unlock_super(sb); 1073 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1074 ext4_journal_stop(handle); 1074 ext4_journal_stop(handle);
1075 goto exit_put; 1075 goto exit_put;
1076 } 1076 }
1077 ext4_blocks_count_set(es, o_blocks_count + add); 1077 ext4_blocks_count_set(es, o_blocks_count + add);
1078 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); 1078 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
1079 sb->s_dirt = 1; 1079 sb->s_dirt = 1;
1080 unlock_super(sb); 1080 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1081 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, 1081 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1082 o_blocks_count + add); 1082 o_blocks_count + add);
1083 /* We add the blocks to the bitmap and set the group need init bit */ 1083 /* We add the blocks to the bitmap and set the group need init bit */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2958f4e6f222..012c4251397e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -20,6 +20,7 @@
20#include <linux/string.h> 20#include <linux/string.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/time.h> 22#include <linux/time.h>
23#include <linux/vmalloc.h>
23#include <linux/jbd2.h> 24#include <linux/jbd2.h>
24#include <linux/slab.h> 25#include <linux/slab.h>
25#include <linux/init.h> 26#include <linux/init.h>
@@ -45,16 +46,20 @@
45#include "ext4_jbd2.h" 46#include "ext4_jbd2.h"
46#include "xattr.h" 47#include "xattr.h"
47#include "acl.h" 48#include "acl.h"
48#include "namei.h" 49
49#include "group.h" 50static int default_mb_history_length = 1000;
51
52module_param_named(default_mb_history_length, default_mb_history_length,
53 int, 0644);
54MODULE_PARM_DESC(default_mb_history_length,
55 "Default number of entries saved for mb_history");
50 56
51struct proc_dir_entry *ext4_proc_root; 57struct proc_dir_entry *ext4_proc_root;
52static struct kset *ext4_kset; 58static struct kset *ext4_kset;
53 59
54static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 60static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
55 unsigned long journal_devnum); 61 unsigned long journal_devnum);
56static int ext4_commit_super(struct super_block *sb, 62static int ext4_commit_super(struct super_block *sb, int sync);
57 struct ext4_super_block *es, int sync);
58static void ext4_mark_recovery_complete(struct super_block *sb, 63static void ext4_mark_recovery_complete(struct super_block *sb,
59 struct ext4_super_block *es); 64 struct ext4_super_block *es);
60static void ext4_clear_journal_err(struct super_block *sb, 65static void ext4_clear_journal_err(struct super_block *sb,
@@ -74,7 +79,7 @@ ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
74{ 79{
75 return le32_to_cpu(bg->bg_block_bitmap_lo) | 80 return le32_to_cpu(bg->bg_block_bitmap_lo) |
76 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 81 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
77 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); 82 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
78} 83}
79 84
80ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, 85ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
@@ -82,7 +87,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
82{ 87{
83 return le32_to_cpu(bg->bg_inode_bitmap_lo) | 88 return le32_to_cpu(bg->bg_inode_bitmap_lo) |
84 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 89 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
85 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); 90 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
86} 91}
87 92
88ext4_fsblk_t ext4_inode_table(struct super_block *sb, 93ext4_fsblk_t ext4_inode_table(struct super_block *sb,
@@ -90,7 +95,7 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
90{ 95{
91 return le32_to_cpu(bg->bg_inode_table_lo) | 96 return le32_to_cpu(bg->bg_inode_table_lo) |
92 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 97 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
93 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); 98 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
94} 99}
95 100
96__u32 ext4_free_blks_count(struct super_block *sb, 101__u32 ext4_free_blks_count(struct super_block *sb,
@@ -98,7 +103,7 @@ __u32 ext4_free_blks_count(struct super_block *sb,
98{ 103{
99 return le16_to_cpu(bg->bg_free_blocks_count_lo) | 104 return le16_to_cpu(bg->bg_free_blocks_count_lo) |
100 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 105 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
101 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); 106 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
102} 107}
103 108
104__u32 ext4_free_inodes_count(struct super_block *sb, 109__u32 ext4_free_inodes_count(struct super_block *sb,
@@ -106,7 +111,7 @@ __u32 ext4_free_inodes_count(struct super_block *sb,
106{ 111{
107 return le16_to_cpu(bg->bg_free_inodes_count_lo) | 112 return le16_to_cpu(bg->bg_free_inodes_count_lo) |
108 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 113 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
109 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); 114 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
110} 115}
111 116
112__u32 ext4_used_dirs_count(struct super_block *sb, 117__u32 ext4_used_dirs_count(struct super_block *sb,
@@ -114,7 +119,7 @@ __u32 ext4_used_dirs_count(struct super_block *sb,
114{ 119{
115 return le16_to_cpu(bg->bg_used_dirs_count_lo) | 120 return le16_to_cpu(bg->bg_used_dirs_count_lo) |
116 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 121 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
117 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); 122 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
118} 123}
119 124
120__u32 ext4_itable_unused_count(struct super_block *sb, 125__u32 ext4_itable_unused_count(struct super_block *sb,
@@ -122,7 +127,7 @@ __u32 ext4_itable_unused_count(struct super_block *sb,
122{ 127{
123 return le16_to_cpu(bg->bg_itable_unused_lo) | 128 return le16_to_cpu(bg->bg_itable_unused_lo) |
124 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 129 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
125 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); 130 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
126} 131}
127 132
128void ext4_block_bitmap_set(struct super_block *sb, 133void ext4_block_bitmap_set(struct super_block *sb,
@@ -202,8 +207,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
202 journal = EXT4_SB(sb)->s_journal; 207 journal = EXT4_SB(sb)->s_journal;
203 if (journal) { 208 if (journal) {
204 if (is_journal_aborted(journal)) { 209 if (is_journal_aborted(journal)) {
205 ext4_abort(sb, __func__, 210 ext4_abort(sb, __func__, "Detected aborted journal");
206 "Detected aborted journal");
207 return ERR_PTR(-EROFS); 211 return ERR_PTR(-EROFS);
208 } 212 }
209 return jbd2_journal_start(journal, nblocks); 213 return jbd2_journal_start(journal, nblocks);
@@ -302,10 +306,10 @@ static void ext4_handle_error(struct super_block *sb)
302 jbd2_journal_abort(journal, -EIO); 306 jbd2_journal_abort(journal, -EIO);
303 } 307 }
304 if (test_opt(sb, ERRORS_RO)) { 308 if (test_opt(sb, ERRORS_RO)) {
305 printk(KERN_CRIT "Remounting filesystem read-only\n"); 309 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
306 sb->s_flags |= MS_RDONLY; 310 sb->s_flags |= MS_RDONLY;
307 } 311 }
308 ext4_commit_super(sb, es, 1); 312 ext4_commit_super(sb, 1);
309 if (test_opt(sb, ERRORS_PANIC)) 313 if (test_opt(sb, ERRORS_PANIC))
310 panic("EXT4-fs (device %s): panic forced after error\n", 314 panic("EXT4-fs (device %s): panic forced after error\n",
311 sb->s_id); 315 sb->s_id);
@@ -395,8 +399,6 @@ void ext4_abort(struct super_block *sb, const char *function,
395{ 399{
396 va_list args; 400 va_list args;
397 401
398 printk(KERN_CRIT "ext4_abort called.\n");
399
400 va_start(args, fmt); 402 va_start(args, fmt);
401 printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); 403 printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
402 vprintk(fmt, args); 404 vprintk(fmt, args);
@@ -409,7 +411,7 @@ void ext4_abort(struct super_block *sb, const char *function,
409 if (sb->s_flags & MS_RDONLY) 411 if (sb->s_flags & MS_RDONLY)
410 return; 412 return;
411 413
412 printk(KERN_CRIT "Remounting filesystem read-only\n"); 414 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
413 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 415 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
414 sb->s_flags |= MS_RDONLY; 416 sb->s_flags |= MS_RDONLY;
415 EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT; 417 EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
@@ -417,6 +419,18 @@ void ext4_abort(struct super_block *sb, const char *function,
417 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); 419 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
418} 420}
419 421
422void ext4_msg (struct super_block * sb, const char *prefix,
423 const char *fmt, ...)
424{
425 va_list args;
426
427 va_start(args, fmt);
428 printk("%sEXT4-fs (%s): ", prefix, sb->s_id);
429 vprintk(fmt, args);
430 printk("\n");
431 va_end(args);
432}
433
420void ext4_warning(struct super_block *sb, const char *function, 434void ext4_warning(struct super_block *sb, const char *function,
421 const char *fmt, ...) 435 const char *fmt, ...)
422{ 436{
@@ -431,7 +445,7 @@ void ext4_warning(struct super_block *sb, const char *function,
431} 445}
432 446
433void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp, 447void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
434 const char *function, const char *fmt, ...) 448 const char *function, const char *fmt, ...)
435__releases(bitlock) 449__releases(bitlock)
436__acquires(bitlock) 450__acquires(bitlock)
437{ 451{
@@ -447,7 +461,7 @@ __acquires(bitlock)
447 if (test_opt(sb, ERRORS_CONT)) { 461 if (test_opt(sb, ERRORS_CONT)) {
448 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 462 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
449 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 463 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
450 ext4_commit_super(sb, es, 0); 464 ext4_commit_super(sb, 0);
451 return; 465 return;
452 } 466 }
453 ext4_unlock_group(sb, grp); 467 ext4_unlock_group(sb, grp);
@@ -467,7 +481,6 @@ __acquires(bitlock)
467 return; 481 return;
468} 482}
469 483
470
471void ext4_update_dynamic_rev(struct super_block *sb) 484void ext4_update_dynamic_rev(struct super_block *sb)
472{ 485{
473 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 486 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -496,7 +509,7 @@ void ext4_update_dynamic_rev(struct super_block *sb)
496/* 509/*
497 * Open the external journal device 510 * Open the external journal device
498 */ 511 */
499static struct block_device *ext4_blkdev_get(dev_t dev) 512static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
500{ 513{
501 struct block_device *bdev; 514 struct block_device *bdev;
502 char b[BDEVNAME_SIZE]; 515 char b[BDEVNAME_SIZE];
@@ -507,7 +520,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev)
507 return bdev; 520 return bdev;
508 521
509fail: 522fail:
510 printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n", 523 ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld",
511 __bdevname(dev, b), PTR_ERR(bdev)); 524 __bdevname(dev, b), PTR_ERR(bdev));
512 return NULL; 525 return NULL;
513} 526}
@@ -543,8 +556,8 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
543{ 556{
544 struct list_head *l; 557 struct list_head *l;
545 558
546 printk(KERN_ERR "sb orphan head is %d\n", 559 ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
547 le32_to_cpu(sbi->s_es->s_last_orphan)); 560 le32_to_cpu(sbi->s_es->s_last_orphan));
548 561
549 printk(KERN_ERR "sb_info orphan list:\n"); 562 printk(KERN_ERR "sb_info orphan list:\n");
550 list_for_each(l, &sbi->s_orphan) { 563 list_for_each(l, &sbi->s_orphan) {
@@ -563,6 +576,12 @@ static void ext4_put_super(struct super_block *sb)
563 struct ext4_super_block *es = sbi->s_es; 576 struct ext4_super_block *es = sbi->s_es;
564 int i, err; 577 int i, err;
565 578
579 lock_super(sb);
580 lock_kernel();
581 if (sb->s_dirt)
582 ext4_commit_super(sb, 1);
583
584 ext4_release_system_zone(sb);
566 ext4_mb_release(sb); 585 ext4_mb_release(sb);
567 ext4_ext_release(sb); 586 ext4_ext_release(sb);
568 ext4_xattr_put_super(sb); 587 ext4_xattr_put_super(sb);
@@ -576,7 +595,7 @@ static void ext4_put_super(struct super_block *sb)
576 if (!(sb->s_flags & MS_RDONLY)) { 595 if (!(sb->s_flags & MS_RDONLY)) {
577 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 596 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
578 es->s_state = cpu_to_le16(sbi->s_mount_state); 597 es->s_state = cpu_to_le16(sbi->s_mount_state);
579 ext4_commit_super(sb, es, 1); 598 ext4_commit_super(sb, 1);
580 } 599 }
581 if (sbi->s_proc) { 600 if (sbi->s_proc) {
582 remove_proc_entry(sb->s_id, ext4_proc_root); 601 remove_proc_entry(sb->s_id, ext4_proc_root);
@@ -586,7 +605,10 @@ static void ext4_put_super(struct super_block *sb)
586 for (i = 0; i < sbi->s_gdb_count; i++) 605 for (i = 0; i < sbi->s_gdb_count; i++)
587 brelse(sbi->s_group_desc[i]); 606 brelse(sbi->s_group_desc[i]);
588 kfree(sbi->s_group_desc); 607 kfree(sbi->s_group_desc);
589 kfree(sbi->s_flex_groups); 608 if (is_vmalloc_addr(sbi->s_flex_groups))
609 vfree(sbi->s_flex_groups);
610 else
611 kfree(sbi->s_flex_groups);
590 percpu_counter_destroy(&sbi->s_freeblocks_counter); 612 percpu_counter_destroy(&sbi->s_freeblocks_counter);
591 percpu_counter_destroy(&sbi->s_freeinodes_counter); 613 percpu_counter_destroy(&sbi->s_freeinodes_counter);
592 percpu_counter_destroy(&sbi->s_dirs_counter); 614 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -625,11 +647,8 @@ static void ext4_put_super(struct super_block *sb)
625 unlock_super(sb); 647 unlock_super(sb);
626 kobject_put(&sbi->s_kobj); 648 kobject_put(&sbi->s_kobj);
627 wait_for_completion(&sbi->s_kobj_unregister); 649 wait_for_completion(&sbi->s_kobj_unregister);
628 lock_super(sb);
629 lock_kernel();
630 kfree(sbi->s_blockgroup_lock); 650 kfree(sbi->s_blockgroup_lock);
631 kfree(sbi); 651 kfree(sbi);
632 return;
633} 652}
634 653
635static struct kmem_cache *ext4_inode_cachep; 654static struct kmem_cache *ext4_inode_cachep;
@@ -644,6 +663,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
644 ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); 663 ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
645 if (!ei) 664 if (!ei)
646 return NULL; 665 return NULL;
666
647#ifdef CONFIG_EXT4_FS_POSIX_ACL 667#ifdef CONFIG_EXT4_FS_POSIX_ACL
648 ei->i_acl = EXT4_ACL_NOT_CACHED; 668 ei->i_acl = EXT4_ACL_NOT_CACHED;
649 ei->i_default_acl = EXT4_ACL_NOT_CACHED; 669 ei->i_default_acl = EXT4_ACL_NOT_CACHED;
@@ -664,14 +684,16 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
664 ei->i_allocated_meta_blocks = 0; 684 ei->i_allocated_meta_blocks = 0;
665 ei->i_delalloc_reserved_flag = 0; 685 ei->i_delalloc_reserved_flag = 0;
666 spin_lock_init(&(ei->i_block_reservation_lock)); 686 spin_lock_init(&(ei->i_block_reservation_lock));
687
667 return &ei->vfs_inode; 688 return &ei->vfs_inode;
668} 689}
669 690
670static void ext4_destroy_inode(struct inode *inode) 691static void ext4_destroy_inode(struct inode *inode)
671{ 692{
672 if (!list_empty(&(EXT4_I(inode)->i_orphan))) { 693 if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
673 printk("EXT4 Inode %p: orphan list check failed!\n", 694 ext4_msg(inode->i_sb, KERN_ERR,
674 EXT4_I(inode)); 695 "Inode %lu (%p): orphan list check failed!",
696 inode->i_ino, EXT4_I(inode));
675 print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, 697 print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
676 EXT4_I(inode), sizeof(struct ext4_inode_info), 698 EXT4_I(inode), sizeof(struct ext4_inode_info),
677 true); 699 true);
@@ -870,12 +892,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
870 seq_puts(seq, ",noauto_da_alloc"); 892 seq_puts(seq, ",noauto_da_alloc");
871 893
872 ext4_show_quota_options(seq, sb); 894 ext4_show_quota_options(seq, sb);
895
873 return 0; 896 return 0;
874} 897}
875 898
876
877static struct inode *ext4_nfs_get_inode(struct super_block *sb, 899static struct inode *ext4_nfs_get_inode(struct super_block *sb,
878 u64 ino, u32 generation) 900 u64 ino, u32 generation)
879{ 901{
880 struct inode *inode; 902 struct inode *inode;
881 903
@@ -904,14 +926,14 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb,
904} 926}
905 927
906static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, 928static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
907 int fh_len, int fh_type) 929 int fh_len, int fh_type)
908{ 930{
909 return generic_fh_to_dentry(sb, fid, fh_len, fh_type, 931 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
910 ext4_nfs_get_inode); 932 ext4_nfs_get_inode);
911} 933}
912 934
913static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, 935static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
914 int fh_len, int fh_type) 936 int fh_len, int fh_type)
915{ 937{
916 return generic_fh_to_parent(sb, fid, fh_len, fh_type, 938 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
917 ext4_nfs_get_inode); 939 ext4_nfs_get_inode);
@@ -923,7 +945,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
923 * which would prevent try_to_free_buffers() from freeing them, we must use 945 * which would prevent try_to_free_buffers() from freeing them, we must use
924 * jbd2 layer's try_to_free_buffers() function to release them. 946 * jbd2 layer's try_to_free_buffers() function to release them.
925 */ 947 */
926static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait) 948static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
949 gfp_t wait)
927{ 950{
928 journal_t *journal = EXT4_SB(sb)->s_journal; 951 journal_t *journal = EXT4_SB(sb)->s_journal;
929 952
@@ -992,7 +1015,6 @@ static const struct super_operations ext4_sops = {
992 .dirty_inode = ext4_dirty_inode, 1015 .dirty_inode = ext4_dirty_inode,
993 .delete_inode = ext4_delete_inode, 1016 .delete_inode = ext4_delete_inode,
994 .put_super = ext4_put_super, 1017 .put_super = ext4_put_super,
995 .write_super = ext4_write_super,
996 .sync_fs = ext4_sync_fs, 1018 .sync_fs = ext4_sync_fs,
997 .freeze_fs = ext4_freeze, 1019 .freeze_fs = ext4_freeze,
998 .unfreeze_fs = ext4_unfreeze, 1020 .unfreeze_fs = ext4_unfreeze,
@@ -1007,6 +1029,25 @@ static const struct super_operations ext4_sops = {
1007 .bdev_try_to_free_page = bdev_try_to_free_page, 1029 .bdev_try_to_free_page = bdev_try_to_free_page,
1008}; 1030};
1009 1031
1032static const struct super_operations ext4_nojournal_sops = {
1033 .alloc_inode = ext4_alloc_inode,
1034 .destroy_inode = ext4_destroy_inode,
1035 .write_inode = ext4_write_inode,
1036 .dirty_inode = ext4_dirty_inode,
1037 .delete_inode = ext4_delete_inode,
1038 .write_super = ext4_write_super,
1039 .put_super = ext4_put_super,
1040 .statfs = ext4_statfs,
1041 .remount_fs = ext4_remount,
1042 .clear_inode = ext4_clear_inode,
1043 .show_options = ext4_show_options,
1044#ifdef CONFIG_QUOTA
1045 .quota_read = ext4_quota_read,
1046 .quota_write = ext4_quota_write,
1047#endif
1048 .bdev_try_to_free_page = bdev_try_to_free_page,
1049};
1050
1010static const struct export_operations ext4_export_ops = { 1051static const struct export_operations ext4_export_ops = {
1011 .fh_to_dentry = ext4_fh_to_dentry, 1052 .fh_to_dentry = ext4_fh_to_dentry,
1012 .fh_to_parent = ext4_fh_to_parent, 1053 .fh_to_parent = ext4_fh_to_parent,
@@ -1023,12 +1064,13 @@ enum {
1023 Opt_journal_update, Opt_journal_dev, 1064 Opt_journal_update, Opt_journal_dev,
1024 Opt_journal_checksum, Opt_journal_async_commit, 1065 Opt_journal_checksum, Opt_journal_async_commit,
1025 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1066 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
1026 Opt_data_err_abort, Opt_data_err_ignore, 1067 Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length,
1027 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1068 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1028 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1069 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
1029 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, 1070 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
1030 Opt_usrquota, Opt_grpquota, Opt_i_version, 1071 Opt_usrquota, Opt_grpquota, Opt_i_version,
1031 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1072 Opt_stripe, Opt_delalloc, Opt_nodelalloc,
1073 Opt_block_validity, Opt_noblock_validity,
1032 Opt_inode_readahead_blks, Opt_journal_ioprio 1074 Opt_inode_readahead_blks, Opt_journal_ioprio
1033}; 1075};
1034 1076
@@ -1069,6 +1111,7 @@ static const match_table_t tokens = {
1069 {Opt_data_writeback, "data=writeback"}, 1111 {Opt_data_writeback, "data=writeback"},
1070 {Opt_data_err_abort, "data_err=abort"}, 1112 {Opt_data_err_abort, "data_err=abort"},
1071 {Opt_data_err_ignore, "data_err=ignore"}, 1113 {Opt_data_err_ignore, "data_err=ignore"},
1114 {Opt_mb_history_length, "mb_history_length=%u"},
1072 {Opt_offusrjquota, "usrjquota="}, 1115 {Opt_offusrjquota, "usrjquota="},
1073 {Opt_usrjquota, "usrjquota=%s"}, 1116 {Opt_usrjquota, "usrjquota=%s"},
1074 {Opt_offgrpjquota, "grpjquota="}, 1117 {Opt_offgrpjquota, "grpjquota="},
@@ -1087,6 +1130,8 @@ static const match_table_t tokens = {
1087 {Opt_resize, "resize"}, 1130 {Opt_resize, "resize"},
1088 {Opt_delalloc, "delalloc"}, 1131 {Opt_delalloc, "delalloc"},
1089 {Opt_nodelalloc, "nodelalloc"}, 1132 {Opt_nodelalloc, "nodelalloc"},
1133 {Opt_block_validity, "block_validity"},
1134 {Opt_noblock_validity, "noblock_validity"},
1090 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, 1135 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
1091 {Opt_journal_ioprio, "journal_ioprio=%u"}, 1136 {Opt_journal_ioprio, "journal_ioprio=%u"},
1092 {Opt_auto_da_alloc, "auto_da_alloc=%u"}, 1137 {Opt_auto_da_alloc, "auto_da_alloc=%u"},
@@ -1102,8 +1147,9 @@ static ext4_fsblk_t get_sb_block(void **data)
1102 1147
1103 if (!options || strncmp(options, "sb=", 3) != 0) 1148 if (!options || strncmp(options, "sb=", 3) != 0)
1104 return 1; /* Default location */ 1149 return 1; /* Default location */
1150
1105 options += 3; 1151 options += 3;
1106 /*todo: use simple_strtoll with >32bit ext4 */ 1152 /* TODO: use simple_strtoll with >32bit ext4 */
1107 sb_block = simple_strtoul(options, &options, 0); 1153 sb_block = simple_strtoul(options, &options, 0);
1108 if (*options && *options != ',') { 1154 if (*options && *options != ',') {
1109 printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n", 1155 printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
@@ -1113,6 +1159,7 @@ static ext4_fsblk_t get_sb_block(void **data)
1113 if (*options == ',') 1159 if (*options == ',')
1114 options++; 1160 options++;
1115 *data = (void *) options; 1161 *data = (void *) options;
1162
1116 return sb_block; 1163 return sb_block;
1117} 1164}
1118 1165
@@ -1206,8 +1253,7 @@ static int parse_options(char *options, struct super_block *sb,
1206#else 1253#else
1207 case Opt_user_xattr: 1254 case Opt_user_xattr:
1208 case Opt_nouser_xattr: 1255 case Opt_nouser_xattr:
1209 printk(KERN_ERR "EXT4 (no)user_xattr options " 1256 ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported");
1210 "not supported\n");
1211 break; 1257 break;
1212#endif 1258#endif
1213#ifdef CONFIG_EXT4_FS_POSIX_ACL 1259#ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -1220,8 +1266,7 @@ static int parse_options(char *options, struct super_block *sb,
1220#else 1266#else
1221 case Opt_acl: 1267 case Opt_acl:
1222 case Opt_noacl: 1268 case Opt_noacl:
1223 printk(KERN_ERR "EXT4 (no)acl options " 1269 ext4_msg(sb, KERN_ERR, "(no)acl options not supported");
1224 "not supported\n");
1225 break; 1270 break;
1226#endif 1271#endif
1227 case Opt_journal_update: 1272 case Opt_journal_update:
@@ -1231,16 +1276,16 @@ static int parse_options(char *options, struct super_block *sb,
1231 user to specify an existing inode to be the 1276 user to specify an existing inode to be the
1232 journal file. */ 1277 journal file. */
1233 if (is_remount) { 1278 if (is_remount) {
1234 printk(KERN_ERR "EXT4-fs: cannot specify " 1279 ext4_msg(sb, KERN_ERR,
1235 "journal on remount\n"); 1280 "Cannot specify journal on remount");
1236 return 0; 1281 return 0;
1237 } 1282 }
1238 set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); 1283 set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
1239 break; 1284 break;
1240 case Opt_journal_dev: 1285 case Opt_journal_dev:
1241 if (is_remount) { 1286 if (is_remount) {
1242 printk(KERN_ERR "EXT4-fs: cannot specify " 1287 ext4_msg(sb, KERN_ERR,
1243 "journal on remount\n"); 1288 "Cannot specify journal on remount");
1244 return 0; 1289 return 0;
1245 } 1290 }
1246 if (match_int(&args[0], &option)) 1291 if (match_int(&args[0], &option))
@@ -1294,9 +1339,8 @@ static int parse_options(char *options, struct super_block *sb,
1294 if (is_remount) { 1339 if (is_remount) {
1295 if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS) 1340 if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
1296 != data_opt) { 1341 != data_opt) {
1297 printk(KERN_ERR 1342 ext4_msg(sb, KERN_ERR,
1298 "EXT4-fs: cannot change data " 1343 "Cannot change data mode on remount");
1299 "mode on remount\n");
1300 return 0; 1344 return 0;
1301 } 1345 }
1302 } else { 1346 } else {
@@ -1310,6 +1354,13 @@ static int parse_options(char *options, struct super_block *sb,
1310 case Opt_data_err_ignore: 1354 case Opt_data_err_ignore:
1311 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1355 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
1312 break; 1356 break;
1357 case Opt_mb_history_length:
1358 if (match_int(&args[0], &option))
1359 return 0;
1360 if (option < 0)
1361 return 0;
1362 sbi->s_mb_history_max = option;
1363 break;
1313#ifdef CONFIG_QUOTA 1364#ifdef CONFIG_QUOTA
1314 case Opt_usrjquota: 1365 case Opt_usrjquota:
1315 qtype = USRQUOTA; 1366 qtype = USRQUOTA;
@@ -1319,31 +1370,31 @@ static int parse_options(char *options, struct super_block *sb,
1319set_qf_name: 1370set_qf_name:
1320 if (sb_any_quota_loaded(sb) && 1371 if (sb_any_quota_loaded(sb) &&
1321 !sbi->s_qf_names[qtype]) { 1372 !sbi->s_qf_names[qtype]) {
1322 printk(KERN_ERR 1373 ext4_msg(sb, KERN_ERR,
1323 "EXT4-fs: Cannot change journaled " 1374 "Cannot change journaled "
1324 "quota options when quota turned on.\n"); 1375 "quota options when quota turned on");
1325 return 0; 1376 return 0;
1326 } 1377 }
1327 qname = match_strdup(&args[0]); 1378 qname = match_strdup(&args[0]);
1328 if (!qname) { 1379 if (!qname) {
1329 printk(KERN_ERR 1380 ext4_msg(sb, KERN_ERR,
1330 "EXT4-fs: not enough memory for " 1381 "Not enough memory for "
1331 "storing quotafile name.\n"); 1382 "storing quotafile name");
1332 return 0; 1383 return 0;
1333 } 1384 }
1334 if (sbi->s_qf_names[qtype] && 1385 if (sbi->s_qf_names[qtype] &&
1335 strcmp(sbi->s_qf_names[qtype], qname)) { 1386 strcmp(sbi->s_qf_names[qtype], qname)) {
1336 printk(KERN_ERR 1387 ext4_msg(sb, KERN_ERR,
1337 "EXT4-fs: %s quota file already " 1388 "%s quota file already "
1338 "specified.\n", QTYPE2NAME(qtype)); 1389 "specified", QTYPE2NAME(qtype));
1339 kfree(qname); 1390 kfree(qname);
1340 return 0; 1391 return 0;
1341 } 1392 }
1342 sbi->s_qf_names[qtype] = qname; 1393 sbi->s_qf_names[qtype] = qname;
1343 if (strchr(sbi->s_qf_names[qtype], '/')) { 1394 if (strchr(sbi->s_qf_names[qtype], '/')) {
1344 printk(KERN_ERR 1395 ext4_msg(sb, KERN_ERR,
1345 "EXT4-fs: quotafile must be on " 1396 "quotafile must be on "
1346 "filesystem root.\n"); 1397 "filesystem root");
1347 kfree(sbi->s_qf_names[qtype]); 1398 kfree(sbi->s_qf_names[qtype]);
1348 sbi->s_qf_names[qtype] = NULL; 1399 sbi->s_qf_names[qtype] = NULL;
1349 return 0; 1400 return 0;
@@ -1358,9 +1409,9 @@ set_qf_name:
1358clear_qf_name: 1409clear_qf_name:
1359 if (sb_any_quota_loaded(sb) && 1410 if (sb_any_quota_loaded(sb) &&
1360 sbi->s_qf_names[qtype]) { 1411 sbi->s_qf_names[qtype]) {
1361 printk(KERN_ERR "EXT4-fs: Cannot change " 1412 ext4_msg(sb, KERN_ERR, "Cannot change "
1362 "journaled quota options when " 1413 "journaled quota options when "
1363 "quota turned on.\n"); 1414 "quota turned on");
1364 return 0; 1415 return 0;
1365 } 1416 }
1366 /* 1417 /*
@@ -1377,9 +1428,9 @@ clear_qf_name:
1377set_qf_format: 1428set_qf_format:
1378 if (sb_any_quota_loaded(sb) && 1429 if (sb_any_quota_loaded(sb) &&
1379 sbi->s_jquota_fmt != qfmt) { 1430 sbi->s_jquota_fmt != qfmt) {
1380 printk(KERN_ERR "EXT4-fs: Cannot change " 1431 ext4_msg(sb, KERN_ERR, "Cannot change "
1381 "journaled quota options when " 1432 "journaled quota options when "
1382 "quota turned on.\n"); 1433 "quota turned on");
1383 return 0; 1434 return 0;
1384 } 1435 }
1385 sbi->s_jquota_fmt = qfmt; 1436 sbi->s_jquota_fmt = qfmt;
@@ -1395,8 +1446,8 @@ set_qf_format:
1395 break; 1446 break;
1396 case Opt_noquota: 1447 case Opt_noquota:
1397 if (sb_any_quota_loaded(sb)) { 1448 if (sb_any_quota_loaded(sb)) {
1398 printk(KERN_ERR "EXT4-fs: Cannot change quota " 1449 ext4_msg(sb, KERN_ERR, "Cannot change quota "
1399 "options when quota turned on.\n"); 1450 "options when quota turned on");
1400 return 0; 1451 return 0;
1401 } 1452 }
1402 clear_opt(sbi->s_mount_opt, QUOTA); 1453 clear_opt(sbi->s_mount_opt, QUOTA);
@@ -1407,8 +1458,8 @@ set_qf_format:
1407 case Opt_quota: 1458 case Opt_quota:
1408 case Opt_usrquota: 1459 case Opt_usrquota:
1409 case Opt_grpquota: 1460 case Opt_grpquota:
1410 printk(KERN_ERR 1461 ext4_msg(sb, KERN_ERR,
1411 "EXT4-fs: quota options not supported.\n"); 1462 "quota options not supported");
1412 break; 1463 break;
1413 case Opt_usrjquota: 1464 case Opt_usrjquota:
1414 case Opt_grpjquota: 1465 case Opt_grpjquota:
@@ -1416,9 +1467,8 @@ set_qf_format:
1416 case Opt_offgrpjquota: 1467 case Opt_offgrpjquota:
1417 case Opt_jqfmt_vfsold: 1468 case Opt_jqfmt_vfsold:
1418 case Opt_jqfmt_vfsv0: 1469 case Opt_jqfmt_vfsv0:
1419 printk(KERN_ERR 1470 ext4_msg(sb, KERN_ERR,
1420 "EXT4-fs: journaled quota options not " 1471 "journaled quota options not supported");
1421 "supported.\n");
1422 break; 1472 break;
1423 case Opt_noquota: 1473 case Opt_noquota:
1424 break; 1474 break;
@@ -1443,8 +1493,9 @@ set_qf_format:
1443 break; 1493 break;
1444 case Opt_resize: 1494 case Opt_resize:
1445 if (!is_remount) { 1495 if (!is_remount) {
1446 printk("EXT4-fs: resize option only available " 1496 ext4_msg(sb, KERN_ERR,
1447 "for remount\n"); 1497 "resize option only available "
1498 "for remount");
1448 return 0; 1499 return 0;
1449 } 1500 }
1450 if (match_int(&args[0], &option) != 0) 1501 if (match_int(&args[0], &option) != 0)
@@ -1474,14 +1525,21 @@ set_qf_format:
1474 case Opt_delalloc: 1525 case Opt_delalloc:
1475 set_opt(sbi->s_mount_opt, DELALLOC); 1526 set_opt(sbi->s_mount_opt, DELALLOC);
1476 break; 1527 break;
1528 case Opt_block_validity:
1529 set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
1530 break;
1531 case Opt_noblock_validity:
1532 clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
1533 break;
1477 case Opt_inode_readahead_blks: 1534 case Opt_inode_readahead_blks:
1478 if (match_int(&args[0], &option)) 1535 if (match_int(&args[0], &option))
1479 return 0; 1536 return 0;
1480 if (option < 0 || option > (1 << 30)) 1537 if (option < 0 || option > (1 << 30))
1481 return 0; 1538 return 0;
1482 if (option & (option - 1)) { 1539 if (!is_power_of_2(option)) {
1483 printk(KERN_ERR "EXT4-fs: inode_readahead_blks" 1540 ext4_msg(sb, KERN_ERR,
1484 " must be a power of 2\n"); 1541 "EXT4-fs: inode_readahead_blks"
1542 " must be a power of 2");
1485 return 0; 1543 return 0;
1486 } 1544 }
1487 sbi->s_inode_readahead_blks = option; 1545 sbi->s_inode_readahead_blks = option;
@@ -1508,9 +1566,9 @@ set_qf_format:
1508 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1566 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
1509 break; 1567 break;
1510 default: 1568 default:
1511 printk(KERN_ERR 1569 ext4_msg(sb, KERN_ERR,
1512 "EXT4-fs: Unrecognized mount option \"%s\" " 1570 "Unrecognized mount option \"%s\" "
1513 "or missing value\n", p); 1571 "or missing value", p);
1514 return 0; 1572 return 0;
1515 } 1573 }
1516 } 1574 }
@@ -1528,21 +1586,21 @@ set_qf_format:
1528 (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) || 1586 (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
1529 (sbi->s_qf_names[GRPQUOTA] && 1587 (sbi->s_qf_names[GRPQUOTA] &&
1530 (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) { 1588 (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
1531 printk(KERN_ERR "EXT4-fs: old and new quota " 1589 ext4_msg(sb, KERN_ERR, "old and new quota "
1532 "format mixing.\n"); 1590 "format mixing");
1533 return 0; 1591 return 0;
1534 } 1592 }
1535 1593
1536 if (!sbi->s_jquota_fmt) { 1594 if (!sbi->s_jquota_fmt) {
1537 printk(KERN_ERR "EXT4-fs: journaled quota format " 1595 ext4_msg(sb, KERN_ERR, "journaled quota format "
1538 "not specified.\n"); 1596 "not specified");
1539 return 0; 1597 return 0;
1540 } 1598 }
1541 } else { 1599 } else {
1542 if (sbi->s_jquota_fmt) { 1600 if (sbi->s_jquota_fmt) {
1543 printk(KERN_ERR "EXT4-fs: journaled quota format " 1601 ext4_msg(sb, KERN_ERR, "journaled quota format "
1544 "specified with no journaling " 1602 "specified with no journaling "
1545 "enabled.\n"); 1603 "enabled");
1546 return 0; 1604 return 0;
1547 } 1605 }
1548 } 1606 }
@@ -1557,32 +1615,32 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1557 int res = 0; 1615 int res = 0;
1558 1616
1559 if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { 1617 if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
1560 printk(KERN_ERR "EXT4-fs warning: revision level too high, " 1618 ext4_msg(sb, KERN_ERR, "revision level too high, "
1561 "forcing read-only mode\n"); 1619 "forcing read-only mode");
1562 res = MS_RDONLY; 1620 res = MS_RDONLY;
1563 } 1621 }
1564 if (read_only) 1622 if (read_only)
1565 return res; 1623 return res;
1566 if (!(sbi->s_mount_state & EXT4_VALID_FS)) 1624 if (!(sbi->s_mount_state & EXT4_VALID_FS))
1567 printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, " 1625 ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
1568 "running e2fsck is recommended\n"); 1626 "running e2fsck is recommended");
1569 else if ((sbi->s_mount_state & EXT4_ERROR_FS)) 1627 else if ((sbi->s_mount_state & EXT4_ERROR_FS))
1570 printk(KERN_WARNING 1628 ext4_msg(sb, KERN_WARNING,
1571 "EXT4-fs warning: mounting fs with errors, " 1629 "warning: mounting fs with errors, "
1572 "running e2fsck is recommended\n"); 1630 "running e2fsck is recommended");
1573 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && 1631 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
1574 le16_to_cpu(es->s_mnt_count) >= 1632 le16_to_cpu(es->s_mnt_count) >=
1575 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) 1633 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
1576 printk(KERN_WARNING 1634 ext4_msg(sb, KERN_WARNING,
1577 "EXT4-fs warning: maximal mount count reached, " 1635 "warning: maximal mount count reached, "
1578 "running e2fsck is recommended\n"); 1636 "running e2fsck is recommended");
1579 else if (le32_to_cpu(es->s_checkinterval) && 1637 else if (le32_to_cpu(es->s_checkinterval) &&
1580 (le32_to_cpu(es->s_lastcheck) + 1638 (le32_to_cpu(es->s_lastcheck) +
1581 le32_to_cpu(es->s_checkinterval) <= get_seconds())) 1639 le32_to_cpu(es->s_checkinterval) <= get_seconds()))
1582 printk(KERN_WARNING 1640 ext4_msg(sb, KERN_WARNING,
1583 "EXT4-fs warning: checktime reached, " 1641 "warning: checktime reached, "
1584 "running e2fsck is recommended\n"); 1642 "running e2fsck is recommended");
1585 if (!sbi->s_journal) 1643 if (!sbi->s_journal)
1586 es->s_state &= cpu_to_le16(~EXT4_VALID_FS); 1644 es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
1587 if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) 1645 if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
1588 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); 1646 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
@@ -1592,7 +1650,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1592 if (sbi->s_journal) 1650 if (sbi->s_journal)
1593 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 1651 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
1594 1652
1595 ext4_commit_super(sb, es, 1); 1653 ext4_commit_super(sb, 1);
1596 if (test_opt(sb, DEBUG)) 1654 if (test_opt(sb, DEBUG))
1597 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " 1655 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
1598 "bpg=%lu, ipg=%lu, mo=%04lx]\n", 1656 "bpg=%lu, ipg=%lu, mo=%04lx]\n",
@@ -1603,11 +1661,11 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1603 sbi->s_mount_opt); 1661 sbi->s_mount_opt);
1604 1662
1605 if (EXT4_SB(sb)->s_journal) { 1663 if (EXT4_SB(sb)->s_journal) {
1606 printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n", 1664 ext4_msg(sb, KERN_INFO, "%s journal on %s",
1607 sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" : 1665 EXT4_SB(sb)->s_journal->j_inode ? "internal" :
1608 "external", EXT4_SB(sb)->s_journal->j_devname); 1666 "external", EXT4_SB(sb)->s_journal->j_devname);
1609 } else { 1667 } else {
1610 printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id); 1668 ext4_msg(sb, KERN_INFO, "no journal");
1611 } 1669 }
1612 return res; 1670 return res;
1613} 1671}
@@ -1616,10 +1674,10 @@ static int ext4_fill_flex_info(struct super_block *sb)
1616{ 1674{
1617 struct ext4_sb_info *sbi = EXT4_SB(sb); 1675 struct ext4_sb_info *sbi = EXT4_SB(sb);
1618 struct ext4_group_desc *gdp = NULL; 1676 struct ext4_group_desc *gdp = NULL;
1619 struct buffer_head *bh;
1620 ext4_group_t flex_group_count; 1677 ext4_group_t flex_group_count;
1621 ext4_group_t flex_group; 1678 ext4_group_t flex_group;
1622 int groups_per_flex = 0; 1679 int groups_per_flex = 0;
1680 size_t size;
1623 int i; 1681 int i;
1624 1682
1625 if (!sbi->s_es->s_log_groups_per_flex) { 1683 if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1634,16 +1692,21 @@ static int ext4_fill_flex_info(struct super_block *sb)
1634 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + 1692 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
1635 ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << 1693 ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
1636 EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; 1694 EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
1637 sbi->s_flex_groups = kzalloc(flex_group_count * 1695 size = flex_group_count * sizeof(struct flex_groups);
1638 sizeof(struct flex_groups), GFP_KERNEL); 1696 sbi->s_flex_groups = kzalloc(size, GFP_KERNEL);
1697 if (sbi->s_flex_groups == NULL) {
1698 sbi->s_flex_groups = vmalloc(size);
1699 if (sbi->s_flex_groups)
1700 memset(sbi->s_flex_groups, 0, size);
1701 }
1639 if (sbi->s_flex_groups == NULL) { 1702 if (sbi->s_flex_groups == NULL) {
1640 printk(KERN_ERR "EXT4-fs: not enough memory for " 1703 ext4_msg(sb, KERN_ERR, "not enough memory for "
1641 "%u flex groups\n", flex_group_count); 1704 "%u flex groups", flex_group_count);
1642 goto failed; 1705 goto failed;
1643 } 1706 }
1644 1707
1645 for (i = 0; i < sbi->s_groups_count; i++) { 1708 for (i = 0; i < sbi->s_groups_count; i++) {
1646 gdp = ext4_get_group_desc(sb, i, &bh); 1709 gdp = ext4_get_group_desc(sb, i, NULL);
1647 1710
1648 flex_group = ext4_flex_group(sbi, i); 1711 flex_group = ext4_flex_group(sbi, i);
1649 atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, 1712 atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
@@ -1724,44 +1787,44 @@ static int ext4_check_descriptors(struct super_block *sb)
1724 1787
1725 block_bitmap = ext4_block_bitmap(sb, gdp); 1788 block_bitmap = ext4_block_bitmap(sb, gdp);
1726 if (block_bitmap < first_block || block_bitmap > last_block) { 1789 if (block_bitmap < first_block || block_bitmap > last_block) {
1727 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1790 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
1728 "Block bitmap for group %u not in group " 1791 "Block bitmap for group %u not in group "
1729 "(block %llu)!\n", i, block_bitmap); 1792 "(block %llu)!", i, block_bitmap);
1730 return 0; 1793 return 0;
1731 } 1794 }
1732 inode_bitmap = ext4_inode_bitmap(sb, gdp); 1795 inode_bitmap = ext4_inode_bitmap(sb, gdp);
1733 if (inode_bitmap < first_block || inode_bitmap > last_block) { 1796 if (inode_bitmap < first_block || inode_bitmap > last_block) {
1734 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1797 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
1735 "Inode bitmap for group %u not in group " 1798 "Inode bitmap for group %u not in group "
1736 "(block %llu)!\n", i, inode_bitmap); 1799 "(block %llu)!", i, inode_bitmap);
1737 return 0; 1800 return 0;
1738 } 1801 }
1739 inode_table = ext4_inode_table(sb, gdp); 1802 inode_table = ext4_inode_table(sb, gdp);
1740 if (inode_table < first_block || 1803 if (inode_table < first_block ||
1741 inode_table + sbi->s_itb_per_group - 1 > last_block) { 1804 inode_table + sbi->s_itb_per_group - 1 > last_block) {
1742 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1805 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
1743 "Inode table for group %u not in group " 1806 "Inode table for group %u not in group "
1744 "(block %llu)!\n", i, inode_table); 1807 "(block %llu)!", i, inode_table);
1745 return 0; 1808 return 0;
1746 } 1809 }
1747 spin_lock(sb_bgl_lock(sbi, i)); 1810 ext4_lock_group(sb, i);
1748 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { 1811 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
1749 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1812 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
1750 "Checksum for group %u failed (%u!=%u)\n", 1813 "Checksum for group %u failed (%u!=%u)",
1751 i, le16_to_cpu(ext4_group_desc_csum(sbi, i, 1814 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
1752 gdp)), le16_to_cpu(gdp->bg_checksum)); 1815 gdp)), le16_to_cpu(gdp->bg_checksum));
1753 if (!(sb->s_flags & MS_RDONLY)) { 1816 if (!(sb->s_flags & MS_RDONLY)) {
1754 spin_unlock(sb_bgl_lock(sbi, i)); 1817 ext4_unlock_group(sb, i);
1755 return 0; 1818 return 0;
1756 } 1819 }
1757 } 1820 }
1758 spin_unlock(sb_bgl_lock(sbi, i)); 1821 ext4_unlock_group(sb, i);
1759 if (!flexbg_flag) 1822 if (!flexbg_flag)
1760 first_block += EXT4_BLOCKS_PER_GROUP(sb); 1823 first_block += EXT4_BLOCKS_PER_GROUP(sb);
1761 } 1824 }
1762 1825
1763 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); 1826 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
1764 sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); 1827 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
1765 return 1; 1828 return 1;
1766} 1829}
1767 1830
@@ -1796,8 +1859,8 @@ static void ext4_orphan_cleanup(struct super_block *sb,
1796 } 1859 }
1797 1860
1798 if (bdev_read_only(sb->s_bdev)) { 1861 if (bdev_read_only(sb->s_bdev)) {
1799 printk(KERN_ERR "EXT4-fs: write access " 1862 ext4_msg(sb, KERN_ERR, "write access "
1800 "unavailable, skipping orphan cleanup.\n"); 1863 "unavailable, skipping orphan cleanup");
1801 return; 1864 return;
1802 } 1865 }
1803 1866
@@ -1811,8 +1874,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
1811 } 1874 }
1812 1875
1813 if (s_flags & MS_RDONLY) { 1876 if (s_flags & MS_RDONLY) {
1814 printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n", 1877 ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
1815 sb->s_id);
1816 sb->s_flags &= ~MS_RDONLY; 1878 sb->s_flags &= ~MS_RDONLY;
1817 } 1879 }
1818#ifdef CONFIG_QUOTA 1880#ifdef CONFIG_QUOTA
@@ -1823,9 +1885,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
1823 if (EXT4_SB(sb)->s_qf_names[i]) { 1885 if (EXT4_SB(sb)->s_qf_names[i]) {
1824 int ret = ext4_quota_on_mount(sb, i); 1886 int ret = ext4_quota_on_mount(sb, i);
1825 if (ret < 0) 1887 if (ret < 0)
1826 printk(KERN_ERR 1888 ext4_msg(sb, KERN_ERR,
1827 "EXT4-fs: Cannot turn on journaled " 1889 "Cannot turn on journaled "
1828 "quota: error %d\n", ret); 1890 "quota: error %d", ret);
1829 } 1891 }
1830 } 1892 }
1831#endif 1893#endif
@@ -1842,16 +1904,16 @@ static void ext4_orphan_cleanup(struct super_block *sb,
1842 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); 1904 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
1843 vfs_dq_init(inode); 1905 vfs_dq_init(inode);
1844 if (inode->i_nlink) { 1906 if (inode->i_nlink) {
1845 printk(KERN_DEBUG 1907 ext4_msg(sb, KERN_DEBUG,
1846 "%s: truncating inode %lu to %lld bytes\n", 1908 "%s: truncating inode %lu to %lld bytes",
1847 __func__, inode->i_ino, inode->i_size); 1909 __func__, inode->i_ino, inode->i_size);
1848 jbd_debug(2, "truncating inode %lu to %lld bytes\n", 1910 jbd_debug(2, "truncating inode %lu to %lld bytes\n",
1849 inode->i_ino, inode->i_size); 1911 inode->i_ino, inode->i_size);
1850 ext4_truncate(inode); 1912 ext4_truncate(inode);
1851 nr_truncates++; 1913 nr_truncates++;
1852 } else { 1914 } else {
1853 printk(KERN_DEBUG 1915 ext4_msg(sb, KERN_DEBUG,
1854 "%s: deleting unreferenced inode %lu\n", 1916 "%s: deleting unreferenced inode %lu",
1855 __func__, inode->i_ino); 1917 __func__, inode->i_ino);
1856 jbd_debug(2, "deleting unreferenced inode %lu\n", 1918 jbd_debug(2, "deleting unreferenced inode %lu\n",
1857 inode->i_ino); 1919 inode->i_ino);
@@ -1863,11 +1925,11 @@ static void ext4_orphan_cleanup(struct super_block *sb,
1863#define PLURAL(x) (x), ((x) == 1) ? "" : "s" 1925#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
1864 1926
1865 if (nr_orphans) 1927 if (nr_orphans)
1866 printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n", 1928 ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
1867 sb->s_id, PLURAL(nr_orphans)); 1929 PLURAL(nr_orphans));
1868 if (nr_truncates) 1930 if (nr_truncates)
1869 printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n", 1931 ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
1870 sb->s_id, PLURAL(nr_truncates)); 1932 PLURAL(nr_truncates));
1871#ifdef CONFIG_QUOTA 1933#ifdef CONFIG_QUOTA
1872 /* Turn quotas off */ 1934 /* Turn quotas off */
1873 for (i = 0; i < MAXQUOTAS; i++) { 1935 for (i = 0; i < MAXQUOTAS; i++) {
@@ -1877,6 +1939,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
1877#endif 1939#endif
1878 sb->s_flags = s_flags; /* Restore MS_RDONLY status */ 1940 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
1879} 1941}
1942
1880/* 1943/*
1881 * Maximal extent format file size. 1944 * Maximal extent format file size.
1882 * Resulting logical blkno at s_maxbytes must fit in our on-disk 1945 * Resulting logical blkno at s_maxbytes must fit in our on-disk
@@ -1927,19 +1990,19 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
1927 loff_t res = EXT4_NDIR_BLOCKS; 1990 loff_t res = EXT4_NDIR_BLOCKS;
1928 int meta_blocks; 1991 int meta_blocks;
1929 loff_t upper_limit; 1992 loff_t upper_limit;
1930 /* This is calculated to be the largest file size for a 1993 /* This is calculated to be the largest file size for a dense, block
1931 * dense, bitmapped file such that the total number of 1994 * mapped file such that the file's total number of 512-byte sectors,
1932 * sectors in the file, including data and all indirect blocks, 1995 * including data and all indirect blocks, does not exceed (2^48 - 1).
1933 * does not exceed 2^48 -1 1996 *
1934 * __u32 i_blocks_lo and _u16 i_blocks_high representing the 1997 * __u32 i_blocks_lo and _u16 i_blocks_high represent the total
1935 * total number of 512 bytes blocks of the file 1998 * number of 512-byte sectors of the file.
1936 */ 1999 */
1937 2000
1938 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { 2001 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
1939 /* 2002 /*
1940 * !has_huge_files or CONFIG_LBD is not enabled 2003 * !has_huge_files or CONFIG_LBD not enabled implies that
1941 * implies the inode i_block represent total blocks in 2004 * the inode i_block field represents total file blocks in
1942 * 512 bytes 32 == size of vfs inode i_blocks * 8 2005 * 2^32 512-byte sectors == size of vfs inode i_blocks * 8
1943 */ 2006 */
1944 upper_limit = (1LL << 32) - 1; 2007 upper_limit = (1LL << 32) - 1;
1945 2008
@@ -1981,7 +2044,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
1981} 2044}
1982 2045
1983static ext4_fsblk_t descriptor_loc(struct super_block *sb, 2046static ext4_fsblk_t descriptor_loc(struct super_block *sb,
1984 ext4_fsblk_t logical_sb_block, int nr) 2047 ext4_fsblk_t logical_sb_block, int nr)
1985{ 2048{
1986 struct ext4_sb_info *sbi = EXT4_SB(sb); 2049 struct ext4_sb_info *sbi = EXT4_SB(sb);
1987 ext4_group_t bg, first_meta_bg; 2050 ext4_group_t bg, first_meta_bg;
@@ -1995,6 +2058,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
1995 bg = sbi->s_desc_per_block * nr; 2058 bg = sbi->s_desc_per_block * nr;
1996 if (ext4_bg_has_super(sb, bg)) 2059 if (ext4_bg_has_super(sb, bg))
1997 has_super = 1; 2060 has_super = 1;
2061
1998 return (has_super + ext4_group_first_block_no(sb, bg)); 2062 return (has_super + ext4_group_first_block_no(sb, bg));
1999} 2063}
2000 2064
@@ -2091,8 +2155,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2091 if (parse_strtoul(buf, 0x40000000, &t)) 2155 if (parse_strtoul(buf, 0x40000000, &t))
2092 return -EINVAL; 2156 return -EINVAL;
2093 2157
2094 /* inode_readahead_blks must be a power of 2 */ 2158 if (!is_power_of_2(t))
2095 if (t & (t-1))
2096 return -EINVAL; 2159 return -EINVAL;
2097 2160
2098 sbi->s_inode_readahead_blks = t; 2161 sbi->s_inode_readahead_blks = t;
@@ -2100,7 +2163,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2100} 2163}
2101 2164
2102static ssize_t sbi_ui_show(struct ext4_attr *a, 2165static ssize_t sbi_ui_show(struct ext4_attr *a,
2103 struct ext4_sb_info *sbi, char *buf) 2166 struct ext4_sb_info *sbi, char *buf)
2104{ 2167{
2105 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); 2168 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
2106 2169
@@ -2205,7 +2268,6 @@ static struct kobj_type ext4_ktype = {
2205static int ext4_fill_super(struct super_block *sb, void *data, int silent) 2268static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2206 __releases(kernel_lock) 2269 __releases(kernel_lock)
2207 __acquires(kernel_lock) 2270 __acquires(kernel_lock)
2208
2209{ 2271{
2210 struct buffer_head *bh; 2272 struct buffer_head *bh;
2211 struct ext4_super_block *es = NULL; 2273 struct ext4_super_block *es = NULL;
@@ -2256,7 +2318,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2256 2318
2257 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); 2319 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
2258 if (!blocksize) { 2320 if (!blocksize) {
2259 printk(KERN_ERR "EXT4-fs: unable to set blocksize\n"); 2321 ext4_msg(sb, KERN_ERR, "unable to set blocksize");
2260 goto out_fail; 2322 goto out_fail;
2261 } 2323 }
2262 2324
@@ -2272,7 +2334,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2272 } 2334 }
2273 2335
2274 if (!(bh = sb_bread(sb, logical_sb_block))) { 2336 if (!(bh = sb_bread(sb, logical_sb_block))) {
2275 printk(KERN_ERR "EXT4-fs: unable to read superblock\n"); 2337 ext4_msg(sb, KERN_ERR, "unable to read superblock");
2276 goto out_fail; 2338 goto out_fail;
2277 } 2339 }
2278 /* 2340 /*
@@ -2321,6 +2383,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2321 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; 2383 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
2322 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 2384 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2323 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 2385 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2386 sbi->s_mb_history_max = default_mb_history_length;
2324 2387
2325 set_opt(sbi->s_mount_opt, BARRIER); 2388 set_opt(sbi->s_mount_opt, BARRIER);
2326 2389
@@ -2330,7 +2393,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2330 */ 2393 */
2331 set_opt(sbi->s_mount_opt, DELALLOC); 2394 set_opt(sbi->s_mount_opt, DELALLOC);
2332 2395
2333
2334 if (!parse_options((char *) data, sb, &journal_devnum, 2396 if (!parse_options((char *) data, sb, &journal_devnum,
2335 &journal_ioprio, NULL, 0)) 2397 &journal_ioprio, NULL, 0))
2336 goto failed_mount; 2398 goto failed_mount;
@@ -2342,9 +2404,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2342 (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || 2404 (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
2343 EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) || 2405 EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
2344 EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U))) 2406 EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
2345 printk(KERN_WARNING 2407 ext4_msg(sb, KERN_WARNING,
2346 "EXT4-fs warning: feature flags set on rev 0 fs, " 2408 "feature flags set on rev 0 fs, "
2347 "running e2fsck is recommended\n"); 2409 "running e2fsck is recommended");
2348 2410
2349 /* 2411 /*
2350 * Check feature flags regardless of the revision level, since we 2412 * Check feature flags regardless of the revision level, since we
@@ -2353,16 +2415,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2353 */ 2415 */
2354 features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); 2416 features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
2355 if (features) { 2417 if (features) {
2356 printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of " 2418 ext4_msg(sb, KERN_ERR,
2357 "unsupported optional features (%x).\n", sb->s_id, 2419 "Couldn't mount because of "
2420 "unsupported optional features (%x)",
2358 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & 2421 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
2359 ~EXT4_FEATURE_INCOMPAT_SUPP)); 2422 ~EXT4_FEATURE_INCOMPAT_SUPP));
2360 goto failed_mount; 2423 goto failed_mount;
2361 } 2424 }
2362 features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); 2425 features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
2363 if (!(sb->s_flags & MS_RDONLY) && features) { 2426 if (!(sb->s_flags & MS_RDONLY) && features) {
2364 printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of " 2427 ext4_msg(sb, KERN_ERR,
2365 "unsupported optional features (%x).\n", sb->s_id, 2428 "Couldn't mount RDWR because of "
2429 "unsupported optional features (%x)",
2366 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & 2430 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
2367 ~EXT4_FEATURE_RO_COMPAT_SUPP)); 2431 ~EXT4_FEATURE_RO_COMPAT_SUPP));
2368 goto failed_mount; 2432 goto failed_mount;
@@ -2376,9 +2440,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2376 */ 2440 */
2377 if (sizeof(root->i_blocks) < sizeof(u64) && 2441 if (sizeof(root->i_blocks) < sizeof(u64) &&
2378 !(sb->s_flags & MS_RDONLY)) { 2442 !(sb->s_flags & MS_RDONLY)) {
2379 printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge " 2443 ext4_msg(sb, KERN_ERR, "Filesystem with huge "
2380 "files cannot be mounted read-write " 2444 "files cannot be mounted read-write "
2381 "without CONFIG_LBD.\n", sb->s_id); 2445 "without CONFIG_LBD");
2382 goto failed_mount; 2446 goto failed_mount;
2383 } 2447 }
2384 } 2448 }
@@ -2386,17 +2450,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2386 2450
2387 if (blocksize < EXT4_MIN_BLOCK_SIZE || 2451 if (blocksize < EXT4_MIN_BLOCK_SIZE ||
2388 blocksize > EXT4_MAX_BLOCK_SIZE) { 2452 blocksize > EXT4_MAX_BLOCK_SIZE) {
2389 printk(KERN_ERR 2453 ext4_msg(sb, KERN_ERR,
2390 "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n", 2454 "Unsupported filesystem blocksize %d", blocksize);
2391 blocksize, sb->s_id);
2392 goto failed_mount; 2455 goto failed_mount;
2393 } 2456 }
2394 2457
2395 if (sb->s_blocksize != blocksize) { 2458 if (sb->s_blocksize != blocksize) {
2396
2397 /* Validate the filesystem blocksize */ 2459 /* Validate the filesystem blocksize */
2398 if (!sb_set_blocksize(sb, blocksize)) { 2460 if (!sb_set_blocksize(sb, blocksize)) {
2399 printk(KERN_ERR "EXT4-fs: bad block size %d.\n", 2461 ext4_msg(sb, KERN_ERR, "bad block size %d",
2400 blocksize); 2462 blocksize);
2401 goto failed_mount; 2463 goto failed_mount;
2402 } 2464 }
@@ -2406,15 +2468,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2406 offset = do_div(logical_sb_block, blocksize); 2468 offset = do_div(logical_sb_block, blocksize);
2407 bh = sb_bread(sb, logical_sb_block); 2469 bh = sb_bread(sb, logical_sb_block);
2408 if (!bh) { 2470 if (!bh) {
2409 printk(KERN_ERR 2471 ext4_msg(sb, KERN_ERR,
2410 "EXT4-fs: Can't read superblock on 2nd try.\n"); 2472 "Can't read superblock on 2nd try");
2411 goto failed_mount; 2473 goto failed_mount;
2412 } 2474 }
2413 es = (struct ext4_super_block *)(((char *)bh->b_data) + offset); 2475 es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
2414 sbi->s_es = es; 2476 sbi->s_es = es;
2415 if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { 2477 if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
2416 printk(KERN_ERR 2478 ext4_msg(sb, KERN_ERR,
2417 "EXT4-fs: Magic mismatch, very weird !\n"); 2479 "Magic mismatch, very weird!");
2418 goto failed_mount; 2480 goto failed_mount;
2419 } 2481 }
2420 } 2482 }
@@ -2432,30 +2494,33 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2432 if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || 2494 if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
2433 (!is_power_of_2(sbi->s_inode_size)) || 2495 (!is_power_of_2(sbi->s_inode_size)) ||
2434 (sbi->s_inode_size > blocksize)) { 2496 (sbi->s_inode_size > blocksize)) {
2435 printk(KERN_ERR 2497 ext4_msg(sb, KERN_ERR,
2436 "EXT4-fs: unsupported inode size: %d\n", 2498 "unsupported inode size: %d",
2437 sbi->s_inode_size); 2499 sbi->s_inode_size);
2438 goto failed_mount; 2500 goto failed_mount;
2439 } 2501 }
2440 if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) 2502 if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
2441 sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2); 2503 sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
2442 } 2504 }
2505
2443 sbi->s_desc_size = le16_to_cpu(es->s_desc_size); 2506 sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
2444 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) { 2507 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
2445 if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || 2508 if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
2446 sbi->s_desc_size > EXT4_MAX_DESC_SIZE || 2509 sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
2447 !is_power_of_2(sbi->s_desc_size)) { 2510 !is_power_of_2(sbi->s_desc_size)) {
2448 printk(KERN_ERR 2511 ext4_msg(sb, KERN_ERR,
2449 "EXT4-fs: unsupported descriptor size %lu\n", 2512 "unsupported descriptor size %lu",
2450 sbi->s_desc_size); 2513 sbi->s_desc_size);
2451 goto failed_mount; 2514 goto failed_mount;
2452 } 2515 }
2453 } else 2516 } else
2454 sbi->s_desc_size = EXT4_MIN_DESC_SIZE; 2517 sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
2518
2455 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); 2519 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
2456 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); 2520 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
2457 if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) 2521 if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
2458 goto cantfind_ext4; 2522 goto cantfind_ext4;
2523
2459 sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); 2524 sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
2460 if (sbi->s_inodes_per_block == 0) 2525 if (sbi->s_inodes_per_block == 0)
2461 goto cantfind_ext4; 2526 goto cantfind_ext4;
@@ -2466,6 +2531,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2466 sbi->s_mount_state = le16_to_cpu(es->s_state); 2531 sbi->s_mount_state = le16_to_cpu(es->s_state);
2467 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); 2532 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
2468 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); 2533 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
2534
2469 for (i = 0; i < 4; i++) 2535 for (i = 0; i < 4; i++)
2470 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 2536 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
2471 sbi->s_def_hash_version = es->s_def_hash_version; 2537 sbi->s_def_hash_version = es->s_def_hash_version;
@@ -2483,25 +2549,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2483 } 2549 }
2484 2550
2485 if (sbi->s_blocks_per_group > blocksize * 8) { 2551 if (sbi->s_blocks_per_group > blocksize * 8) {
2486 printk(KERN_ERR 2552 ext4_msg(sb, KERN_ERR,
2487 "EXT4-fs: #blocks per group too big: %lu\n", 2553 "#blocks per group too big: %lu",
2488 sbi->s_blocks_per_group); 2554 sbi->s_blocks_per_group);
2489 goto failed_mount; 2555 goto failed_mount;
2490 } 2556 }
2491 if (sbi->s_inodes_per_group > blocksize * 8) { 2557 if (sbi->s_inodes_per_group > blocksize * 8) {
2492 printk(KERN_ERR 2558 ext4_msg(sb, KERN_ERR,
2493 "EXT4-fs: #inodes per group too big: %lu\n", 2559 "#inodes per group too big: %lu",
2494 sbi->s_inodes_per_group); 2560 sbi->s_inodes_per_group);
2495 goto failed_mount; 2561 goto failed_mount;
2496 } 2562 }
2497 2563
2498 if (ext4_blocks_count(es) > 2564 if (ext4_blocks_count(es) >
2499 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 2565 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
2500 printk(KERN_ERR "EXT4-fs: filesystem on %s:" 2566 ext4_msg(sb, KERN_ERR, "filesystem"
2501 " too large to mount safely\n", sb->s_id); 2567 " too large to mount safely");
2502 if (sizeof(sector_t) < 8) 2568 if (sizeof(sector_t) < 8)
2503 printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not " 2569 ext4_msg(sb, KERN_WARNING, "CONFIG_LBD not enabled");
2504 "enabled\n");
2505 goto failed_mount; 2570 goto failed_mount;
2506 } 2571 }
2507 2572
@@ -2511,21 +2576,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2511 /* check blocks count against device size */ 2576 /* check blocks count against device size */
2512 blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; 2577 blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
2513 if (blocks_count && ext4_blocks_count(es) > blocks_count) { 2578 if (blocks_count && ext4_blocks_count(es) > blocks_count) {
2514 printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu " 2579 ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
2515 "exceeds size of device (%llu blocks)\n", 2580 "exceeds size of device (%llu blocks)",
2516 ext4_blocks_count(es), blocks_count); 2581 ext4_blocks_count(es), blocks_count);
2517 goto failed_mount; 2582 goto failed_mount;
2518 } 2583 }
2519 2584
2520 /* 2585 /*
2521 * It makes no sense for the first data block to be beyond the end 2586 * It makes no sense for the first data block to be beyond the end
2522 * of the filesystem. 2587 * of the filesystem.
2523 */ 2588 */
2524 if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { 2589 if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
2525 printk(KERN_WARNING "EXT4-fs: bad geometry: first data" 2590 ext4_msg(sb, KERN_WARNING, "bad geometry: first data"
2526 "block %u is beyond end of filesystem (%llu)\n", 2591 "block %u is beyond end of filesystem (%llu)",
2527 le32_to_cpu(es->s_first_data_block), 2592 le32_to_cpu(es->s_first_data_block),
2528 ext4_blocks_count(es)); 2593 ext4_blocks_count(es));
2529 goto failed_mount; 2594 goto failed_mount;
2530 } 2595 }
2531 blocks_count = (ext4_blocks_count(es) - 2596 blocks_count = (ext4_blocks_count(es) -
@@ -2533,9 +2598,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2533 EXT4_BLOCKS_PER_GROUP(sb) - 1); 2598 EXT4_BLOCKS_PER_GROUP(sb) - 1);
2534 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); 2599 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
2535 if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { 2600 if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
2536 printk(KERN_WARNING "EXT4-fs: groups count too large: %u " 2601 ext4_msg(sb, KERN_WARNING, "groups count too large: %u "
2537 "(block count %llu, first data block %u, " 2602 "(block count %llu, first data block %u, "
2538 "blocks per group %lu)\n", sbi->s_groups_count, 2603 "blocks per group %lu)", sbi->s_groups_count,
2539 ext4_blocks_count(es), 2604 ext4_blocks_count(es),
2540 le32_to_cpu(es->s_first_data_block), 2605 le32_to_cpu(es->s_first_data_block),
2541 EXT4_BLOCKS_PER_GROUP(sb)); 2606 EXT4_BLOCKS_PER_GROUP(sb));
@@ -2547,7 +2612,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2547 sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), 2612 sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
2548 GFP_KERNEL); 2613 GFP_KERNEL);
2549 if (sbi->s_group_desc == NULL) { 2614 if (sbi->s_group_desc == NULL) {
2550 printk(KERN_ERR "EXT4-fs: not enough memory\n"); 2615 ext4_msg(sb, KERN_ERR, "not enough memory");
2551 goto failed_mount; 2616 goto failed_mount;
2552 } 2617 }
2553 2618
@@ -2562,21 +2627,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2562 block = descriptor_loc(sb, logical_sb_block, i); 2627 block = descriptor_loc(sb, logical_sb_block, i);
2563 sbi->s_group_desc[i] = sb_bread(sb, block); 2628 sbi->s_group_desc[i] = sb_bread(sb, block);
2564 if (!sbi->s_group_desc[i]) { 2629 if (!sbi->s_group_desc[i]) {
2565 printk(KERN_ERR "EXT4-fs: " 2630 ext4_msg(sb, KERN_ERR,
2566 "can't read group descriptor %d\n", i); 2631 "can't read group descriptor %d", i);
2567 db_count = i; 2632 db_count = i;
2568 goto failed_mount2; 2633 goto failed_mount2;
2569 } 2634 }
2570 } 2635 }
2571 if (!ext4_check_descriptors(sb)) { 2636 if (!ext4_check_descriptors(sb)) {
2572 printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); 2637 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
2573 goto failed_mount2; 2638 goto failed_mount2;
2574 } 2639 }
2575 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) 2640 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
2576 if (!ext4_fill_flex_info(sb)) { 2641 if (!ext4_fill_flex_info(sb)) {
2577 printk(KERN_ERR 2642 ext4_msg(sb, KERN_ERR,
2578 "EXT4-fs: unable to initialize " 2643 "unable to initialize "
2579 "flex_bg meta info!\n"); 2644 "flex_bg meta info!");
2580 goto failed_mount2; 2645 goto failed_mount2;
2581 } 2646 }
2582 2647
@@ -2598,7 +2663,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2598 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); 2663 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2599 } 2664 }
2600 if (err) { 2665 if (err) {
2601 printk(KERN_ERR "EXT4-fs: insufficient memory\n"); 2666 ext4_msg(sb, KERN_ERR, "insufficient memory");
2602 goto failed_mount3; 2667 goto failed_mount3;
2603 } 2668 }
2604 2669
@@ -2607,7 +2672,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2607 /* 2672 /*
2608 * set up enough so that it can read an inode 2673 * set up enough so that it can read an inode
2609 */ 2674 */
2610 sb->s_op = &ext4_sops; 2675 if (!test_opt(sb, NOLOAD) &&
2676 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
2677 sb->s_op = &ext4_sops;
2678 else
2679 sb->s_op = &ext4_nojournal_sops;
2611 sb->s_export_op = &ext4_export_ops; 2680 sb->s_export_op = &ext4_export_ops;
2612 sb->s_xattr = ext4_xattr_handlers; 2681 sb->s_xattr = ext4_xattr_handlers;
2613#ifdef CONFIG_QUOTA 2682#ifdef CONFIG_QUOTA
@@ -2615,6 +2684,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2615 sb->dq_op = &ext4_quota_operations; 2684 sb->dq_op = &ext4_quota_operations;
2616#endif 2685#endif
2617 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 2686 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
2687 mutex_init(&sbi->s_orphan_lock);
2688 mutex_init(&sbi->s_resize_lock);
2618 2689
2619 sb->s_root = NULL; 2690 sb->s_root = NULL;
2620 2691
@@ -2632,13 +2703,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2632 goto failed_mount3; 2703 goto failed_mount3;
2633 if (!(sb->s_flags & MS_RDONLY) && 2704 if (!(sb->s_flags & MS_RDONLY) &&
2634 EXT4_SB(sb)->s_journal->j_failed_commit) { 2705 EXT4_SB(sb)->s_journal->j_failed_commit) {
2635 printk(KERN_CRIT "EXT4-fs error (device %s): " 2706 ext4_msg(sb, KERN_CRIT, "error: "
2636 "ext4_fill_super: Journal transaction " 2707 "ext4_fill_super: Journal transaction "
2637 "%u is corrupt\n", sb->s_id, 2708 "%u is corrupt",
2638 EXT4_SB(sb)->s_journal->j_failed_commit); 2709 EXT4_SB(sb)->s_journal->j_failed_commit);
2639 if (test_opt(sb, ERRORS_RO)) { 2710 if (test_opt(sb, ERRORS_RO)) {
2640 printk(KERN_CRIT 2711 ext4_msg(sb, KERN_CRIT,
2641 "Mounting filesystem read-only\n"); 2712 "Mounting filesystem read-only");
2642 sb->s_flags |= MS_RDONLY; 2713 sb->s_flags |= MS_RDONLY;
2643 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 2714 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2644 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 2715 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
@@ -2646,14 +2717,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2646 if (test_opt(sb, ERRORS_PANIC)) { 2717 if (test_opt(sb, ERRORS_PANIC)) {
2647 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 2718 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2648 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 2719 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2649 ext4_commit_super(sb, es, 1); 2720 ext4_commit_super(sb, 1);
2650 goto failed_mount4; 2721 goto failed_mount4;
2651 } 2722 }
2652 } 2723 }
2653 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && 2724 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
2654 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { 2725 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
2655 printk(KERN_ERR "EXT4-fs: required journal recovery " 2726 ext4_msg(sb, KERN_ERR, "required journal recovery "
2656 "suppressed and not mounted read-only\n"); 2727 "suppressed and not mounted read-only");
2657 goto failed_mount4; 2728 goto failed_mount4;
2658 } else { 2729 } else {
2659 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 2730 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
@@ -2666,7 +2737,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2666 if (ext4_blocks_count(es) > 0xffffffffULL && 2737 if (ext4_blocks_count(es) > 0xffffffffULL &&
2667 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, 2738 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
2668 JBD2_FEATURE_INCOMPAT_64BIT)) { 2739 JBD2_FEATURE_INCOMPAT_64BIT)) {
2669 printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n"); 2740 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
2670 goto failed_mount4; 2741 goto failed_mount4;
2671 } 2742 }
2672 2743
@@ -2704,8 +2775,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2704 case EXT4_MOUNT_WRITEBACK_DATA: 2775 case EXT4_MOUNT_WRITEBACK_DATA:
2705 if (!jbd2_journal_check_available_features 2776 if (!jbd2_journal_check_available_features
2706 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { 2777 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
2707 printk(KERN_ERR "EXT4-fs: Journal does not support " 2778 ext4_msg(sb, KERN_ERR, "Journal does not support "
2708 "requested data journaling mode\n"); 2779 "requested data journaling mode");
2709 goto failed_mount4; 2780 goto failed_mount4;
2710 } 2781 }
2711 default: 2782 default:
@@ -2717,8 +2788,8 @@ no_journal:
2717 2788
2718 if (test_opt(sb, NOBH)) { 2789 if (test_opt(sb, NOBH)) {
2719 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 2790 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
2720 printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - " 2791 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
2721 "its supported only with writeback mode\n"); 2792 "its supported only with writeback mode");
2722 clear_opt(sbi->s_mount_opt, NOBH); 2793 clear_opt(sbi->s_mount_opt, NOBH);
2723 } 2794 }
2724 } 2795 }
@@ -2729,18 +2800,18 @@ no_journal:
2729 2800
2730 root = ext4_iget(sb, EXT4_ROOT_INO); 2801 root = ext4_iget(sb, EXT4_ROOT_INO);
2731 if (IS_ERR(root)) { 2802 if (IS_ERR(root)) {
2732 printk(KERN_ERR "EXT4-fs: get root inode failed\n"); 2803 ext4_msg(sb, KERN_ERR, "get root inode failed");
2733 ret = PTR_ERR(root); 2804 ret = PTR_ERR(root);
2734 goto failed_mount4; 2805 goto failed_mount4;
2735 } 2806 }
2736 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 2807 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
2737 iput(root); 2808 iput(root);
2738 printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n"); 2809 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
2739 goto failed_mount4; 2810 goto failed_mount4;
2740 } 2811 }
2741 sb->s_root = d_alloc_root(root); 2812 sb->s_root = d_alloc_root(root);
2742 if (!sb->s_root) { 2813 if (!sb->s_root) {
2743 printk(KERN_ERR "EXT4-fs: get root dentry failed\n"); 2814 ext4_msg(sb, KERN_ERR, "get root dentry failed");
2744 iput(root); 2815 iput(root);
2745 ret = -ENOMEM; 2816 ret = -ENOMEM;
2746 goto failed_mount4; 2817 goto failed_mount4;
@@ -2769,22 +2840,29 @@ no_journal:
2769 sbi->s_inode_size) { 2840 sbi->s_inode_size) {
2770 sbi->s_want_extra_isize = sizeof(struct ext4_inode) - 2841 sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
2771 EXT4_GOOD_OLD_INODE_SIZE; 2842 EXT4_GOOD_OLD_INODE_SIZE;
2772 printk(KERN_INFO "EXT4-fs: required extra inode space not" 2843 ext4_msg(sb, KERN_INFO, "required extra inode space not"
2773 "available.\n"); 2844 "available");
2774 } 2845 }
2775 2846
2776 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 2847 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
2777 printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " 2848 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
2778 "requested data journaling mode\n"); 2849 "requested data journaling mode");
2779 clear_opt(sbi->s_mount_opt, DELALLOC); 2850 clear_opt(sbi->s_mount_opt, DELALLOC);
2780 } else if (test_opt(sb, DELALLOC)) 2851 } else if (test_opt(sb, DELALLOC))
2781 printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); 2852 ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
2853
2854 err = ext4_setup_system_zone(sb);
2855 if (err) {
2856 ext4_msg(sb, KERN_ERR, "failed to initialize system "
2857 "zone (%d)\n", err);
2858 goto failed_mount4;
2859 }
2782 2860
2783 ext4_ext_init(sb); 2861 ext4_ext_init(sb);
2784 err = ext4_mb_init(sb, needs_recovery); 2862 err = ext4_mb_init(sb, needs_recovery);
2785 if (err) { 2863 if (err) {
2786 printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n", 2864 ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)",
2787 err); 2865 err);
2788 goto failed_mount4; 2866 goto failed_mount4;
2789 } 2867 }
2790 2868
@@ -2798,19 +2876,11 @@ no_journal:
2798 goto failed_mount4; 2876 goto failed_mount4;
2799 }; 2877 };
2800 2878
2801 /*
2802 * akpm: core read_super() calls in here with the superblock locked.
2803 * That deadlocks, because orphan cleanup needs to lock the superblock
2804 * in numerous places. Here we just pop the lock - it's relatively
2805 * harmless, because we are now ready to accept write_super() requests,
2806 * and aviro says that's the only reason for hanging onto the
2807 * superblock lock.
2808 */
2809 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; 2879 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
2810 ext4_orphan_cleanup(sb, es); 2880 ext4_orphan_cleanup(sb, es);
2811 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; 2881 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
2812 if (needs_recovery) { 2882 if (needs_recovery) {
2813 printk(KERN_INFO "EXT4-fs: recovery complete.\n"); 2883 ext4_msg(sb, KERN_INFO, "recovery complete");
2814 ext4_mark_recovery_complete(sb, es); 2884 ext4_mark_recovery_complete(sb, es);
2815 } 2885 }
2816 if (EXT4_SB(sb)->s_journal) { 2886 if (EXT4_SB(sb)->s_journal) {
@@ -2823,25 +2893,30 @@ no_journal:
2823 } else 2893 } else
2824 descr = "out journal"; 2894 descr = "out journal";
2825 2895
2826 printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n", 2896 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr);
2827 sb->s_id, descr);
2828 2897
2829 lock_kernel(); 2898 lock_kernel();
2830 return 0; 2899 return 0;
2831 2900
2832cantfind_ext4: 2901cantfind_ext4:
2833 if (!silent) 2902 if (!silent)
2834 printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n", 2903 ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
2835 sb->s_id);
2836 goto failed_mount; 2904 goto failed_mount;
2837 2905
2838failed_mount4: 2906failed_mount4:
2839 printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id); 2907 ext4_msg(sb, KERN_ERR, "mount failed");
2908 ext4_release_system_zone(sb);
2840 if (sbi->s_journal) { 2909 if (sbi->s_journal) {
2841 jbd2_journal_destroy(sbi->s_journal); 2910 jbd2_journal_destroy(sbi->s_journal);
2842 sbi->s_journal = NULL; 2911 sbi->s_journal = NULL;
2843 } 2912 }
2844failed_mount3: 2913failed_mount3:
2914 if (sbi->s_flex_groups) {
2915 if (is_vmalloc_addr(sbi->s_flex_groups))
2916 vfree(sbi->s_flex_groups);
2917 else
2918 kfree(sbi->s_flex_groups);
2919 }
2845 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2920 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2846 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2921 percpu_counter_destroy(&sbi->s_freeinodes_counter);
2847 percpu_counter_destroy(&sbi->s_dirs_counter); 2922 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -2862,6 +2937,7 @@ failed_mount:
2862 brelse(bh); 2937 brelse(bh);
2863out_fail: 2938out_fail:
2864 sb->s_fs_info = NULL; 2939 sb->s_fs_info = NULL;
2940 kfree(sbi->s_blockgroup_lock);
2865 kfree(sbi); 2941 kfree(sbi);
2866 lock_kernel(); 2942 lock_kernel();
2867 return ret; 2943 return ret;
@@ -2906,27 +2982,27 @@ static journal_t *ext4_get_journal(struct super_block *sb,
2906 2982
2907 journal_inode = ext4_iget(sb, journal_inum); 2983 journal_inode = ext4_iget(sb, journal_inum);
2908 if (IS_ERR(journal_inode)) { 2984 if (IS_ERR(journal_inode)) {
2909 printk(KERN_ERR "EXT4-fs: no journal found.\n"); 2985 ext4_msg(sb, KERN_ERR, "no journal found");
2910 return NULL; 2986 return NULL;
2911 } 2987 }
2912 if (!journal_inode->i_nlink) { 2988 if (!journal_inode->i_nlink) {
2913 make_bad_inode(journal_inode); 2989 make_bad_inode(journal_inode);
2914 iput(journal_inode); 2990 iput(journal_inode);
2915 printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n"); 2991 ext4_msg(sb, KERN_ERR, "journal inode is deleted");
2916 return NULL; 2992 return NULL;
2917 } 2993 }
2918 2994
2919 jbd_debug(2, "Journal inode found at %p: %lld bytes\n", 2995 jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
2920 journal_inode, journal_inode->i_size); 2996 journal_inode, journal_inode->i_size);
2921 if (!S_ISREG(journal_inode->i_mode)) { 2997 if (!S_ISREG(journal_inode->i_mode)) {
2922 printk(KERN_ERR "EXT4-fs: invalid journal inode.\n"); 2998 ext4_msg(sb, KERN_ERR, "invalid journal inode");
2923 iput(journal_inode); 2999 iput(journal_inode);
2924 return NULL; 3000 return NULL;
2925 } 3001 }
2926 3002
2927 journal = jbd2_journal_init_inode(journal_inode); 3003 journal = jbd2_journal_init_inode(journal_inode);
2928 if (!journal) { 3004 if (!journal) {
2929 printk(KERN_ERR "EXT4-fs: Could not load journal inode\n"); 3005 ext4_msg(sb, KERN_ERR, "Could not load journal inode");
2930 iput(journal_inode); 3006 iput(journal_inode);
2931 return NULL; 3007 return NULL;
2932 } 3008 }
@@ -2950,22 +3026,22 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
2950 3026
2951 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); 3027 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2952 3028
2953 bdev = ext4_blkdev_get(j_dev); 3029 bdev = ext4_blkdev_get(j_dev, sb);
2954 if (bdev == NULL) 3030 if (bdev == NULL)
2955 return NULL; 3031 return NULL;
2956 3032
2957 if (bd_claim(bdev, sb)) { 3033 if (bd_claim(bdev, sb)) {
2958 printk(KERN_ERR 3034 ext4_msg(sb, KERN_ERR,
2959 "EXT4-fs: failed to claim external journal device.\n"); 3035 "failed to claim external journal device");
2960 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 3036 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
2961 return NULL; 3037 return NULL;
2962 } 3038 }
2963 3039
2964 blocksize = sb->s_blocksize; 3040 blocksize = sb->s_blocksize;
2965 hblock = bdev_hardsect_size(bdev); 3041 hblock = bdev_logical_block_size(bdev);
2966 if (blocksize < hblock) { 3042 if (blocksize < hblock) {
2967 printk(KERN_ERR 3043 ext4_msg(sb, KERN_ERR,
2968 "EXT4-fs: blocksize too small for journal device.\n"); 3044 "blocksize too small for journal device");
2969 goto out_bdev; 3045 goto out_bdev;
2970 } 3046 }
2971 3047
@@ -2973,8 +3049,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
2973 offset = EXT4_MIN_BLOCK_SIZE % blocksize; 3049 offset = EXT4_MIN_BLOCK_SIZE % blocksize;
2974 set_blocksize(bdev, blocksize); 3050 set_blocksize(bdev, blocksize);
2975 if (!(bh = __bread(bdev, sb_block, blocksize))) { 3051 if (!(bh = __bread(bdev, sb_block, blocksize))) {
2976 printk(KERN_ERR "EXT4-fs: couldn't read superblock of " 3052 ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
2977 "external journal\n"); 3053 "external journal");
2978 goto out_bdev; 3054 goto out_bdev;
2979 } 3055 }
2980 3056
@@ -2982,14 +3058,14 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
2982 if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || 3058 if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
2983 !(le32_to_cpu(es->s_feature_incompat) & 3059 !(le32_to_cpu(es->s_feature_incompat) &
2984 EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { 3060 EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
2985 printk(KERN_ERR "EXT4-fs: external journal has " 3061 ext4_msg(sb, KERN_ERR, "external journal has "
2986 "bad superblock\n"); 3062 "bad superblock");
2987 brelse(bh); 3063 brelse(bh);
2988 goto out_bdev; 3064 goto out_bdev;
2989 } 3065 }
2990 3066
2991 if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { 3067 if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
2992 printk(KERN_ERR "EXT4-fs: journal UUID does not match\n"); 3068 ext4_msg(sb, KERN_ERR, "journal UUID does not match");
2993 brelse(bh); 3069 brelse(bh);
2994 goto out_bdev; 3070 goto out_bdev;
2995 } 3071 }
@@ -3001,25 +3077,26 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
3001 journal = jbd2_journal_init_dev(bdev, sb->s_bdev, 3077 journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
3002 start, len, blocksize); 3078 start, len, blocksize);
3003 if (!journal) { 3079 if (!journal) {
3004 printk(KERN_ERR "EXT4-fs: failed to create device journal\n"); 3080 ext4_msg(sb, KERN_ERR, "failed to create device journal");
3005 goto out_bdev; 3081 goto out_bdev;
3006 } 3082 }
3007 journal->j_private = sb; 3083 journal->j_private = sb;
3008 ll_rw_block(READ, 1, &journal->j_sb_buffer); 3084 ll_rw_block(READ, 1, &journal->j_sb_buffer);
3009 wait_on_buffer(journal->j_sb_buffer); 3085 wait_on_buffer(journal->j_sb_buffer);
3010 if (!buffer_uptodate(journal->j_sb_buffer)) { 3086 if (!buffer_uptodate(journal->j_sb_buffer)) {
3011 printk(KERN_ERR "EXT4-fs: I/O error on journal device\n"); 3087 ext4_msg(sb, KERN_ERR, "I/O error on journal device");
3012 goto out_journal; 3088 goto out_journal;
3013 } 3089 }
3014 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { 3090 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
3015 printk(KERN_ERR "EXT4-fs: External journal has more than one " 3091 ext4_msg(sb, KERN_ERR, "External journal has more than one "
3016 "user (unsupported) - %d\n", 3092 "user (unsupported) - %d",
3017 be32_to_cpu(journal->j_superblock->s_nr_users)); 3093 be32_to_cpu(journal->j_superblock->s_nr_users));
3018 goto out_journal; 3094 goto out_journal;
3019 } 3095 }
3020 EXT4_SB(sb)->journal_bdev = bdev; 3096 EXT4_SB(sb)->journal_bdev = bdev;
3021 ext4_init_journal_params(sb, journal); 3097 ext4_init_journal_params(sb, journal);
3022 return journal; 3098 return journal;
3099
3023out_journal: 3100out_journal:
3024 jbd2_journal_destroy(journal); 3101 jbd2_journal_destroy(journal);
3025out_bdev: 3102out_bdev:
@@ -3041,8 +3118,8 @@ static int ext4_load_journal(struct super_block *sb,
3041 3118
3042 if (journal_devnum && 3119 if (journal_devnum &&
3043 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 3120 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
3044 printk(KERN_INFO "EXT4-fs: external journal device major/minor " 3121 ext4_msg(sb, KERN_INFO, "external journal device major/minor "
3045 "numbers have changed\n"); 3122 "numbers have changed");
3046 journal_dev = new_decode_dev(journal_devnum); 3123 journal_dev = new_decode_dev(journal_devnum);
3047 } else 3124 } else
3048 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); 3125 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
@@ -3054,24 +3131,23 @@ static int ext4_load_journal(struct super_block *sb,
3054 * crash? For recovery, we need to check in advance whether we 3131 * crash? For recovery, we need to check in advance whether we
3055 * can get read-write access to the device. 3132 * can get read-write access to the device.
3056 */ 3133 */
3057
3058 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { 3134 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
3059 if (sb->s_flags & MS_RDONLY) { 3135 if (sb->s_flags & MS_RDONLY) {
3060 printk(KERN_INFO "EXT4-fs: INFO: recovery " 3136 ext4_msg(sb, KERN_INFO, "INFO: recovery "
3061 "required on readonly filesystem.\n"); 3137 "required on readonly filesystem");
3062 if (really_read_only) { 3138 if (really_read_only) {
3063 printk(KERN_ERR "EXT4-fs: write access " 3139 ext4_msg(sb, KERN_ERR, "write access "
3064 "unavailable, cannot proceed.\n"); 3140 "unavailable, cannot proceed");
3065 return -EROFS; 3141 return -EROFS;
3066 } 3142 }
3067 printk(KERN_INFO "EXT4-fs: write access will " 3143 ext4_msg(sb, KERN_INFO, "write access will "
3068 "be enabled during recovery.\n"); 3144 "be enabled during recovery");
3069 } 3145 }
3070 } 3146 }
3071 3147
3072 if (journal_inum && journal_dev) { 3148 if (journal_inum && journal_dev) {
3073 printk(KERN_ERR "EXT4-fs: filesystem has both journal " 3149 ext4_msg(sb, KERN_ERR, "filesystem has both journal "
3074 "and inode journals!\n"); 3150 "and inode journals!");
3075 return -EINVAL; 3151 return -EINVAL;
3076 } 3152 }
3077 3153
@@ -3084,14 +3160,14 @@ static int ext4_load_journal(struct super_block *sb,
3084 } 3160 }
3085 3161
3086 if (journal->j_flags & JBD2_BARRIER) 3162 if (journal->j_flags & JBD2_BARRIER)
3087 printk(KERN_INFO "EXT4-fs: barriers enabled\n"); 3163 ext4_msg(sb, KERN_INFO, "barriers enabled");
3088 else 3164 else
3089 printk(KERN_INFO "EXT4-fs: barriers disabled\n"); 3165 ext4_msg(sb, KERN_INFO, "barriers disabled");
3090 3166
3091 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 3167 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
3092 err = jbd2_journal_update_format(journal); 3168 err = jbd2_journal_update_format(journal);
3093 if (err) { 3169 if (err) {
3094 printk(KERN_ERR "EXT4-fs: error updating journal.\n"); 3170 ext4_msg(sb, KERN_ERR, "error updating journal");
3095 jbd2_journal_destroy(journal); 3171 jbd2_journal_destroy(journal);
3096 return err; 3172 return err;
3097 } 3173 }
@@ -3103,7 +3179,7 @@ static int ext4_load_journal(struct super_block *sb,
3103 err = jbd2_journal_load(journal); 3179 err = jbd2_journal_load(journal);
3104 3180
3105 if (err) { 3181 if (err) {
3106 printk(KERN_ERR "EXT4-fs: error loading journal.\n"); 3182 ext4_msg(sb, KERN_ERR, "error loading journal");
3107 jbd2_journal_destroy(journal); 3183 jbd2_journal_destroy(journal);
3108 return err; 3184 return err;
3109 } 3185 }
@@ -3114,18 +3190,17 @@ static int ext4_load_journal(struct super_block *sb,
3114 if (journal_devnum && 3190 if (journal_devnum &&
3115 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 3191 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
3116 es->s_journal_dev = cpu_to_le32(journal_devnum); 3192 es->s_journal_dev = cpu_to_le32(journal_devnum);
3117 sb->s_dirt = 1;
3118 3193
3119 /* Make sure we flush the recovery flag to disk. */ 3194 /* Make sure we flush the recovery flag to disk. */
3120 ext4_commit_super(sb, es, 1); 3195 ext4_commit_super(sb, 1);
3121 } 3196 }
3122 3197
3123 return 0; 3198 return 0;
3124} 3199}
3125 3200
3126static int ext4_commit_super(struct super_block *sb, 3201static int ext4_commit_super(struct super_block *sb, int sync)
3127 struct ext4_super_block *es, int sync)
3128{ 3202{
3203 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
3129 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; 3204 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
3130 int error = 0; 3205 int error = 0;
3131 3206
@@ -3140,8 +3215,8 @@ static int ext4_commit_super(struct super_block *sb,
3140 * be remapped. Nothing we can do but to retry the 3215 * be remapped. Nothing we can do but to retry the
3141 * write and hope for the best. 3216 * write and hope for the best.
3142 */ 3217 */
3143 printk(KERN_ERR "EXT4-fs: previous I/O error to " 3218 ext4_msg(sb, KERN_ERR, "previous I/O error to "
3144 "superblock detected for %s.\n", sb->s_id); 3219 "superblock detected");
3145 clear_buffer_write_io_error(sbh); 3220 clear_buffer_write_io_error(sbh);
3146 set_buffer_uptodate(sbh); 3221 set_buffer_uptodate(sbh);
3147 } 3222 }
@@ -3154,7 +3229,7 @@ static int ext4_commit_super(struct super_block *sb,
3154 &EXT4_SB(sb)->s_freeblocks_counter)); 3229 &EXT4_SB(sb)->s_freeblocks_counter));
3155 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( 3230 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
3156 &EXT4_SB(sb)->s_freeinodes_counter)); 3231 &EXT4_SB(sb)->s_freeinodes_counter));
3157 3232 sb->s_dirt = 0;
3158 BUFFER_TRACE(sbh, "marking dirty"); 3233 BUFFER_TRACE(sbh, "marking dirty");
3159 mark_buffer_dirty(sbh); 3234 mark_buffer_dirty(sbh);
3160 if (sync) { 3235 if (sync) {
@@ -3164,8 +3239,8 @@ static int ext4_commit_super(struct super_block *sb,
3164 3239
3165 error = buffer_write_io_error(sbh); 3240 error = buffer_write_io_error(sbh);
3166 if (error) { 3241 if (error) {
3167 printk(KERN_ERR "EXT4-fs: I/O error while writing " 3242 ext4_msg(sb, KERN_ERR, "I/O error while writing "
3168 "superblock for %s.\n", sb->s_id); 3243 "superblock");
3169 clear_buffer_write_io_error(sbh); 3244 clear_buffer_write_io_error(sbh);
3170 set_buffer_uptodate(sbh); 3245 set_buffer_uptodate(sbh);
3171 } 3246 }
@@ -3173,7 +3248,6 @@ static int ext4_commit_super(struct super_block *sb,
3173 return error; 3248 return error;
3174} 3249}
3175 3250
3176
3177/* 3251/*
3178 * Have we just finished recovery? If so, and if we are mounting (or 3252 * Have we just finished recovery? If so, and if we are mounting (or
3179 * remounting) the filesystem readonly, then we will end up with a 3253 * remounting) the filesystem readonly, then we will end up with a
@@ -3192,14 +3266,11 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
3192 if (jbd2_journal_flush(journal) < 0) 3266 if (jbd2_journal_flush(journal) < 0)
3193 goto out; 3267 goto out;
3194 3268
3195 lock_super(sb);
3196 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && 3269 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
3197 sb->s_flags & MS_RDONLY) { 3270 sb->s_flags & MS_RDONLY) {
3198 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3271 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3199 sb->s_dirt = 0; 3272 ext4_commit_super(sb, 1);
3200 ext4_commit_super(sb, es, 1);
3201 } 3273 }
3202 unlock_super(sb);
3203 3274
3204out: 3275out:
3205 jbd2_journal_unlock_updates(journal); 3276 jbd2_journal_unlock_updates(journal);
@@ -3238,7 +3309,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
3238 3309
3239 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 3310 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
3240 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 3311 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
3241 ext4_commit_super(sb, es, 1); 3312 ext4_commit_super(sb, 1);
3242 3313
3243 jbd2_journal_clear_err(journal); 3314 jbd2_journal_clear_err(journal);
3244 } 3315 }
@@ -3257,29 +3328,17 @@ int ext4_force_commit(struct super_block *sb)
3257 return 0; 3328 return 0;
3258 3329
3259 journal = EXT4_SB(sb)->s_journal; 3330 journal = EXT4_SB(sb)->s_journal;
3260 if (journal) { 3331 if (journal)
3261 sb->s_dirt = 0;
3262 ret = ext4_journal_force_commit(journal); 3332 ret = ext4_journal_force_commit(journal);
3263 }
3264 3333
3265 return ret; 3334 return ret;
3266} 3335}
3267 3336
3268/*
3269 * Ext4 always journals updates to the superblock itself, so we don't
3270 * have to propagate any other updates to the superblock on disk at this
3271 * point. (We can probably nuke this function altogether, and remove
3272 * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...)
3273 */
3274static void ext4_write_super(struct super_block *sb) 3337static void ext4_write_super(struct super_block *sb)
3275{ 3338{
3276 if (EXT4_SB(sb)->s_journal) { 3339 lock_super(sb);
3277 if (mutex_trylock(&sb->s_lock) != 0) 3340 ext4_commit_super(sb, 1);
3278 BUG(); 3341 unlock_super(sb);
3279 sb->s_dirt = 0;
3280 } else {
3281 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3282 }
3283} 3342}
3284 3343
3285static int ext4_sync_fs(struct super_block *sb, int wait) 3344static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -3288,16 +3347,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
3288 tid_t target; 3347 tid_t target;
3289 3348
3290 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); 3349 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
3291 sb->s_dirt = 0; 3350 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
3292 if (EXT4_SB(sb)->s_journal) { 3351 if (wait)
3293 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, 3352 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
3294 &target)) {
3295 if (wait)
3296 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal,
3297 target);
3298 }
3299 } else {
3300 ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
3301 } 3353 }
3302 return ret; 3354 return ret;
3303} 3355}
@@ -3310,34 +3362,32 @@ static int ext4_freeze(struct super_block *sb)
3310{ 3362{
3311 int error = 0; 3363 int error = 0;
3312 journal_t *journal; 3364 journal_t *journal;
3313 sb->s_dirt = 0;
3314 3365
3315 if (!(sb->s_flags & MS_RDONLY)) { 3366 if (sb->s_flags & MS_RDONLY)
3316 journal = EXT4_SB(sb)->s_journal; 3367 return 0;
3317 3368
3318 if (journal) { 3369 journal = EXT4_SB(sb)->s_journal;
3319 /* Now we set up the journal barrier. */
3320 jbd2_journal_lock_updates(journal);
3321 3370
3322 /* 3371 /* Now we set up the journal barrier. */
3323 * We don't want to clear needs_recovery flag when we 3372 jbd2_journal_lock_updates(journal);
3324 * failed to flush the journal.
3325 */
3326 error = jbd2_journal_flush(journal);
3327 if (error < 0)
3328 goto out;
3329 }
3330 3373
3331 /* Journal blocked and flushed, clear needs_recovery flag. */ 3374 /*
3332 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3375 * Don't clear the needs_recovery flag if we failed to flush
3333 error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); 3376 * the journal.
3334 if (error) 3377 */
3335 goto out; 3378 error = jbd2_journal_flush(journal);
3379 if (error < 0) {
3380 out:
3381 jbd2_journal_unlock_updates(journal);
3382 return error;
3336 } 3383 }
3384
3385 /* Journal blocked and flushed, clear needs_recovery flag. */
3386 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3387 error = ext4_commit_super(sb, 1);
3388 if (error)
3389 goto out;
3337 return 0; 3390 return 0;
3338out:
3339 jbd2_journal_unlock_updates(journal);
3340 return error;
3341} 3391}
3342 3392
3343/* 3393/*
@@ -3346,14 +3396,15 @@ out:
3346 */ 3396 */
3347static int ext4_unfreeze(struct super_block *sb) 3397static int ext4_unfreeze(struct super_block *sb)
3348{ 3398{
3349 if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) { 3399 if (sb->s_flags & MS_RDONLY)
3350 lock_super(sb); 3400 return 0;
3351 /* Reser the needs_recovery flag before the fs is unlocked. */ 3401
3352 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3402 lock_super(sb);
3353 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); 3403 /* Reset the needs_recovery flag before the fs is unlocked. */
3354 unlock_super(sb); 3404 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3355 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 3405 ext4_commit_super(sb, 1);
3356 } 3406 unlock_super(sb);
3407 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3357 return 0; 3408 return 0;
3358} 3409}
3359 3410
@@ -3371,7 +3422,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3371 int i; 3422 int i;
3372#endif 3423#endif
3373 3424
3425 lock_kernel();
3426
3374 /* Store the original options */ 3427 /* Store the original options */
3428 lock_super(sb);
3375 old_sb_flags = sb->s_flags; 3429 old_sb_flags = sb->s_flags;
3376 old_opts.s_mount_opt = sbi->s_mount_opt; 3430 old_opts.s_mount_opt = sbi->s_mount_opt;
3377 old_opts.s_resuid = sbi->s_resuid; 3431 old_opts.s_resuid = sbi->s_resuid;
@@ -3432,22 +3486,15 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3432 (sbi->s_mount_state & EXT4_VALID_FS)) 3486 (sbi->s_mount_state & EXT4_VALID_FS))
3433 es->s_state = cpu_to_le16(sbi->s_mount_state); 3487 es->s_state = cpu_to_le16(sbi->s_mount_state);
3434 3488
3435 /* 3489 if (sbi->s_journal)
3436 * We have to unlock super so that we can wait for
3437 * transactions.
3438 */
3439 if (sbi->s_journal) {
3440 unlock_super(sb);
3441 ext4_mark_recovery_complete(sb, es); 3490 ext4_mark_recovery_complete(sb, es);
3442 lock_super(sb);
3443 }
3444 } else { 3491 } else {
3445 int ret; 3492 int ret;
3446 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, 3493 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
3447 ~EXT4_FEATURE_RO_COMPAT_SUPP))) { 3494 ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
3448 printk(KERN_WARNING "EXT4-fs: %s: couldn't " 3495 ext4_msg(sb, KERN_WARNING, "couldn't "
3449 "remount RDWR because of unsupported " 3496 "remount RDWR because of unsupported "
3450 "optional features (%x).\n", sb->s_id, 3497 "optional features (%x)",
3451 (le32_to_cpu(sbi->s_es->s_feature_ro_compat) & 3498 (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
3452 ~EXT4_FEATURE_RO_COMPAT_SUPP)); 3499 ~EXT4_FEATURE_RO_COMPAT_SUPP));
3453 err = -EROFS; 3500 err = -EROFS;
@@ -3456,17 +3503,15 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3456 3503
3457 /* 3504 /*
3458 * Make sure the group descriptor checksums 3505 * Make sure the group descriptor checksums
3459 * are sane. If they aren't, refuse to 3506 * are sane. If they aren't, refuse to remount r/w.
3460 * remount r/w.
3461 */ 3507 */
3462 for (g = 0; g < sbi->s_groups_count; g++) { 3508 for (g = 0; g < sbi->s_groups_count; g++) {
3463 struct ext4_group_desc *gdp = 3509 struct ext4_group_desc *gdp =
3464 ext4_get_group_desc(sb, g, NULL); 3510 ext4_get_group_desc(sb, g, NULL);
3465 3511
3466 if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { 3512 if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
3467 printk(KERN_ERR 3513 ext4_msg(sb, KERN_ERR,
3468 "EXT4-fs: ext4_remount: " 3514 "ext4_remount: Checksum for group %u failed (%u!=%u)",
3469 "Checksum for group %u failed (%u!=%u)\n",
3470 g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), 3515 g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
3471 le16_to_cpu(gdp->bg_checksum)); 3516 le16_to_cpu(gdp->bg_checksum));
3472 err = -EINVAL; 3517 err = -EINVAL;
@@ -3480,11 +3525,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3480 * require a full umount/remount for now. 3525 * require a full umount/remount for now.
3481 */ 3526 */
3482 if (es->s_last_orphan) { 3527 if (es->s_last_orphan) {
3483 printk(KERN_WARNING "EXT4-fs: %s: couldn't " 3528 ext4_msg(sb, KERN_WARNING, "Couldn't "
3484 "remount RDWR because of unprocessed " 3529 "remount RDWR because of unprocessed "
3485 "orphan inode list. Please " 3530 "orphan inode list. Please "
3486 "umount/remount instead.\n", 3531 "umount/remount instead");
3487 sb->s_id);
3488 err = -EINVAL; 3532 err = -EINVAL;
3489 goto restore_opts; 3533 goto restore_opts;
3490 } 3534 }
@@ -3504,8 +3548,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3504 sb->s_flags &= ~MS_RDONLY; 3548 sb->s_flags &= ~MS_RDONLY;
3505 } 3549 }
3506 } 3550 }
3551 ext4_setup_system_zone(sb);
3507 if (sbi->s_journal == NULL) 3552 if (sbi->s_journal == NULL)
3508 ext4_commit_super(sb, es, 1); 3553 ext4_commit_super(sb, 1);
3509 3554
3510#ifdef CONFIG_QUOTA 3555#ifdef CONFIG_QUOTA
3511 /* Release old quota file names */ 3556 /* Release old quota file names */
@@ -3514,7 +3559,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3514 old_opts.s_qf_names[i] != sbi->s_qf_names[i]) 3559 old_opts.s_qf_names[i] != sbi->s_qf_names[i])
3515 kfree(old_opts.s_qf_names[i]); 3560 kfree(old_opts.s_qf_names[i]);
3516#endif 3561#endif
3562 unlock_super(sb);
3563 unlock_kernel();
3517 return 0; 3564 return 0;
3565
3518restore_opts: 3566restore_opts:
3519 sb->s_flags = old_sb_flags; 3567 sb->s_flags = old_sb_flags;
3520 sbi->s_mount_opt = old_opts.s_mount_opt; 3568 sbi->s_mount_opt = old_opts.s_mount_opt;
@@ -3532,6 +3580,8 @@ restore_opts:
3532 sbi->s_qf_names[i] = old_opts.s_qf_names[i]; 3580 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
3533 } 3581 }
3534#endif 3582#endif
3583 unlock_super(sb);
3584 unlock_kernel();
3535 return err; 3585 return err;
3536} 3586}
3537 3587
@@ -3545,9 +3595,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3545 if (test_opt(sb, MINIX_DF)) { 3595 if (test_opt(sb, MINIX_DF)) {
3546 sbi->s_overhead_last = 0; 3596 sbi->s_overhead_last = 0;
3547 } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { 3597 } else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
3548 ext4_group_t ngroups = sbi->s_groups_count, i; 3598 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
3549 ext4_fsblk_t overhead = 0; 3599 ext4_fsblk_t overhead = 0;
3550 smp_rmb();
3551 3600
3552 /* 3601 /*
3553 * Compute the overhead (FS structures). This is constant 3602 * Compute the overhead (FS structures). This is constant
@@ -3599,11 +3648,12 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3599 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 3648 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
3600 buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; 3649 buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
3601 buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; 3650 buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
3651
3602 return 0; 3652 return 0;
3603} 3653}
3604 3654
3605/* Helper function for writing quotas on sync - we need to start transaction before quota file 3655/* Helper function for writing quotas on sync - we need to start transaction
3606 * is locked for write. Otherwise the are possible deadlocks: 3656 * before quota file is locked for write. Otherwise the are possible deadlocks:
3607 * Process 1 Process 2 3657 * Process 1 Process 2
3608 * ext4_create() quota_sync() 3658 * ext4_create() quota_sync()
3609 * jbd2_journal_start() write_dquot() 3659 * jbd2_journal_start() write_dquot()
@@ -3627,7 +3677,7 @@ static int ext4_write_dquot(struct dquot *dquot)
3627 3677
3628 inode = dquot_to_inode(dquot); 3678 inode = dquot_to_inode(dquot);
3629 handle = ext4_journal_start(inode, 3679 handle = ext4_journal_start(inode,
3630 EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); 3680 EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
3631 if (IS_ERR(handle)) 3681 if (IS_ERR(handle))
3632 return PTR_ERR(handle); 3682 return PTR_ERR(handle);
3633 ret = dquot_commit(dquot); 3683 ret = dquot_commit(dquot);
@@ -3643,7 +3693,7 @@ static int ext4_acquire_dquot(struct dquot *dquot)
3643 handle_t *handle; 3693 handle_t *handle;
3644 3694
3645 handle = ext4_journal_start(dquot_to_inode(dquot), 3695 handle = ext4_journal_start(dquot_to_inode(dquot),
3646 EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); 3696 EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
3647 if (IS_ERR(handle)) 3697 if (IS_ERR(handle))
3648 return PTR_ERR(handle); 3698 return PTR_ERR(handle);
3649 ret = dquot_acquire(dquot); 3699 ret = dquot_acquire(dquot);
@@ -3659,7 +3709,7 @@ static int ext4_release_dquot(struct dquot *dquot)
3659 handle_t *handle; 3709 handle_t *handle;
3660 3710
3661 handle = ext4_journal_start(dquot_to_inode(dquot), 3711 handle = ext4_journal_start(dquot_to_inode(dquot),
3662 EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); 3712 EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
3663 if (IS_ERR(handle)) { 3713 if (IS_ERR(handle)) {
3664 /* Release dquot anyway to avoid endless cycle in dqput() */ 3714 /* Release dquot anyway to avoid endless cycle in dqput() */
3665 dquot_release(dquot); 3715 dquot_release(dquot);
@@ -3707,7 +3757,7 @@ static int ext4_write_info(struct super_block *sb, int type)
3707static int ext4_quota_on_mount(struct super_block *sb, int type) 3757static int ext4_quota_on_mount(struct super_block *sb, int type)
3708{ 3758{
3709 return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], 3759 return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
3710 EXT4_SB(sb)->s_jquota_fmt, type); 3760 EXT4_SB(sb)->s_jquota_fmt, type);
3711} 3761}
3712 3762
3713/* 3763/*
@@ -3738,9 +3788,9 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3738 if (EXT4_SB(sb)->s_qf_names[type]) { 3788 if (EXT4_SB(sb)->s_qf_names[type]) {
3739 /* Quotafile not in fs root? */ 3789 /* Quotafile not in fs root? */
3740 if (path.dentry->d_parent != sb->s_root) 3790 if (path.dentry->d_parent != sb->s_root)
3741 printk(KERN_WARNING 3791 ext4_msg(sb, KERN_WARNING,
3742 "EXT4-fs: Quota file not on filesystem root. " 3792 "Quota file not on filesystem root. "
3743 "Journaled quota will not work.\n"); 3793 "Journaled quota will not work");
3744 } 3794 }
3745 3795
3746 /* 3796 /*
@@ -3823,8 +3873,8 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3823 handle_t *handle = journal_current_handle(); 3873 handle_t *handle = journal_current_handle();
3824 3874
3825 if (EXT4_SB(sb)->s_journal && !handle) { 3875 if (EXT4_SB(sb)->s_journal && !handle) {
3826 printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)" 3876 ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
3827 " cancelled because transaction is not started.\n", 3877 " cancelled because transaction is not started",
3828 (unsigned long long)off, (unsigned long long)len); 3878 (unsigned long long)off, (unsigned long long)len);
3829 return -EIO; 3879 return -EIO;
3830 } 3880 }
@@ -3878,10 +3928,10 @@ out:
3878 3928
3879#endif 3929#endif
3880 3930
3881static int ext4_get_sb(struct file_system_type *fs_type, 3931static int ext4_get_sb(struct file_system_type *fs_type, int flags,
3882 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 3932 const char *dev_name, void *data, struct vfsmount *mnt)
3883{ 3933{
3884 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); 3934 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
3885} 3935}
3886 3936
3887static struct file_system_type ext4_fs_type = { 3937static struct file_system_type ext4_fs_type = {
@@ -3893,14 +3943,14 @@ static struct file_system_type ext4_fs_type = {
3893}; 3943};
3894 3944
3895#ifdef CONFIG_EXT4DEV_COMPAT 3945#ifdef CONFIG_EXT4DEV_COMPAT
3896static int ext4dev_get_sb(struct file_system_type *fs_type, 3946static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
3897 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 3947 const char *dev_name, void *data,struct vfsmount *mnt)
3898{ 3948{
3899 printk(KERN_WARNING "EXT4-fs: Update your userspace programs " 3949 printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs "
3900 "to mount using ext4\n"); 3950 "to mount using ext4\n", dev_name);
3901 printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility " 3951 printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility "
3902 "will go away by 2.6.31\n"); 3952 "will go away by 2.6.31\n", dev_name);
3903 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); 3953 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
3904} 3954}
3905 3955
3906static struct file_system_type ext4dev_fs_type = { 3956static struct file_system_type ext4dev_fs_type = {
@@ -3917,13 +3967,16 @@ static int __init init_ext4_fs(void)
3917{ 3967{
3918 int err; 3968 int err;
3919 3969
3970 err = init_ext4_system_zone();
3971 if (err)
3972 return err;
3920 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); 3973 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
3921 if (!ext4_kset) 3974 if (!ext4_kset)
3922 return -ENOMEM; 3975 goto out4;
3923 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 3976 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
3924 err = init_ext4_mballoc(); 3977 err = init_ext4_mballoc();
3925 if (err) 3978 if (err)
3926 return err; 3979 goto out3;
3927 3980
3928 err = init_ext4_xattr(); 3981 err = init_ext4_xattr();
3929 if (err) 3982 if (err)
@@ -3948,6 +4001,11 @@ out1:
3948 exit_ext4_xattr(); 4001 exit_ext4_xattr();
3949out2: 4002out2:
3950 exit_ext4_mballoc(); 4003 exit_ext4_mballoc();
4004out3:
4005 remove_proc_entry("fs/ext4", NULL);
4006 kset_unregister(ext4_kset);
4007out4:
4008 exit_ext4_system_zone();
3951 return err; 4009 return err;
3952} 4010}
3953 4011
@@ -3962,6 +4020,7 @@ static void __exit exit_ext4_fs(void)
3962 exit_ext4_mballoc(); 4020 exit_ext4_mballoc();
3963 remove_proc_entry("fs/ext4", NULL); 4021 remove_proc_entry("fs/ext4", NULL);
3964 kset_unregister(ext4_kset); 4022 kset_unregister(ext4_kset);
4023 exit_ext4_system_zone();
3965} 4024}
3966 4025
3967MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 4026MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 3a7f603b6982..f3500294eec5 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -840,7 +840,7 @@ const struct file_operations fat_dir_operations = {
840#ifdef CONFIG_COMPAT 840#ifdef CONFIG_COMPAT
841 .compat_ioctl = fat_compat_dir_ioctl, 841 .compat_ioctl = fat_compat_dir_ioctl,
842#endif 842#endif
843 .fsync = file_fsync, 843 .fsync = fat_file_fsync,
844}; 844};
845 845
846static int fat_get_short_entry(struct inode *dir, loff_t *pos, 846static int fat_get_short_entry(struct inode *dir, loff_t *pos,
@@ -967,7 +967,7 @@ static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots)
967 de++; 967 de++;
968 nr_slots--; 968 nr_slots--;
969 } 969 }
970 mark_buffer_dirty(bh); 970 mark_buffer_dirty_inode(bh, dir);
971 if (IS_DIRSYNC(dir)) 971 if (IS_DIRSYNC(dir))
972 err = sync_dirty_buffer(bh); 972 err = sync_dirty_buffer(bh);
973 brelse(bh); 973 brelse(bh);
@@ -1001,7 +1001,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
1001 de--; 1001 de--;
1002 nr_slots--; 1002 nr_slots--;
1003 } 1003 }
1004 mark_buffer_dirty(bh); 1004 mark_buffer_dirty_inode(bh, dir);
1005 if (IS_DIRSYNC(dir)) 1005 if (IS_DIRSYNC(dir))
1006 err = sync_dirty_buffer(bh); 1006 err = sync_dirty_buffer(bh);
1007 brelse(bh); 1007 brelse(bh);
@@ -1051,7 +1051,7 @@ static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used,
1051 } 1051 }
1052 memset(bhs[n]->b_data, 0, sb->s_blocksize); 1052 memset(bhs[n]->b_data, 0, sb->s_blocksize);
1053 set_buffer_uptodate(bhs[n]); 1053 set_buffer_uptodate(bhs[n]);
1054 mark_buffer_dirty(bhs[n]); 1054 mark_buffer_dirty_inode(bhs[n], dir);
1055 1055
1056 n++; 1056 n++;
1057 blknr++; 1057 blknr++;
@@ -1131,7 +1131,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
1131 de[0].size = de[1].size = 0; 1131 de[0].size = de[1].size = 0;
1132 memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); 1132 memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de));
1133 set_buffer_uptodate(bhs[0]); 1133 set_buffer_uptodate(bhs[0]);
1134 mark_buffer_dirty(bhs[0]); 1134 mark_buffer_dirty_inode(bhs[0], dir);
1135 1135
1136 err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE); 1136 err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE);
1137 if (err) 1137 if (err)
@@ -1193,7 +1193,7 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots,
1193 slots += copy; 1193 slots += copy;
1194 size -= copy; 1194 size -= copy;
1195 set_buffer_uptodate(bhs[n]); 1195 set_buffer_uptodate(bhs[n]);
1196 mark_buffer_dirty(bhs[n]); 1196 mark_buffer_dirty_inode(bhs[n], dir);
1197 if (!size) 1197 if (!size)
1198 break; 1198 break;
1199 n++; 1199 n++;
@@ -1293,7 +1293,7 @@ found:
1293 for (i = 0; i < long_bhs; i++) { 1293 for (i = 0; i < long_bhs; i++) {
1294 int copy = min_t(int, sb->s_blocksize - offset, size); 1294 int copy = min_t(int, sb->s_blocksize - offset, size);
1295 memcpy(bhs[i]->b_data + offset, slots, copy); 1295 memcpy(bhs[i]->b_data + offset, slots, copy);
1296 mark_buffer_dirty(bhs[i]); 1296 mark_buffer_dirty_inode(bhs[i], dir);
1297 offset = 0; 1297 offset = 0;
1298 slots += copy; 1298 slots += copy;
1299 size -= copy; 1299 size -= copy;
@@ -1304,7 +1304,7 @@ found:
1304 /* Fill the short name slot. */ 1304 /* Fill the short name slot. */
1305 int copy = min_t(int, sb->s_blocksize - offset, size); 1305 int copy = min_t(int, sb->s_blocksize - offset, size);
1306 memcpy(bhs[i]->b_data + offset, slots, copy); 1306 memcpy(bhs[i]->b_data + offset, slots, copy);
1307 mark_buffer_dirty(bhs[i]); 1307 mark_buffer_dirty_inode(bhs[i], dir);
1308 if (IS_DIRSYNC(dir)) 1308 if (IS_DIRSYNC(dir))
1309 err = sync_dirty_buffer(bhs[i]); 1309 err = sync_dirty_buffer(bhs[i]);
1310 } 1310 }
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index ea440d65819c..e4d88527b5dd 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -74,6 +74,7 @@ struct msdos_sb_info {
74 74
75 int fatent_shift; 75 int fatent_shift;
76 struct fatent_operations *fatent_ops; 76 struct fatent_operations *fatent_ops;
77 struct inode *fat_inode;
77 78
78 spinlock_t inode_hash_lock; 79 spinlock_t inode_hash_lock;
79 struct hlist_head inode_hashtable[FAT_HASH_SIZE]; 80 struct hlist_head inode_hashtable[FAT_HASH_SIZE];
@@ -251,6 +252,7 @@ struct fat_entry {
251 } u; 252 } u;
252 int nr_bhs; 253 int nr_bhs;
253 struct buffer_head *bhs[2]; 254 struct buffer_head *bhs[2];
255 struct inode *fat_inode;
254}; 256};
255 257
256static inline void fatent_init(struct fat_entry *fatent) 258static inline void fatent_init(struct fat_entry *fatent)
@@ -259,6 +261,7 @@ static inline void fatent_init(struct fat_entry *fatent)
259 fatent->entry = 0; 261 fatent->entry = 0;
260 fatent->u.ent32_p = NULL; 262 fatent->u.ent32_p = NULL;
261 fatent->bhs[0] = fatent->bhs[1] = NULL; 263 fatent->bhs[0] = fatent->bhs[1] = NULL;
264 fatent->fat_inode = NULL;
262} 265}
263 266
264static inline void fatent_set_entry(struct fat_entry *fatent, int entry) 267static inline void fatent_set_entry(struct fat_entry *fatent, int entry)
@@ -275,6 +278,7 @@ static inline void fatent_brelse(struct fat_entry *fatent)
275 brelse(fatent->bhs[i]); 278 brelse(fatent->bhs[i]);
276 fatent->nr_bhs = 0; 279 fatent->nr_bhs = 0;
277 fatent->bhs[0] = fatent->bhs[1] = NULL; 280 fatent->bhs[0] = fatent->bhs[1] = NULL;
281 fatent->fat_inode = NULL;
278} 282}
279 283
280extern void fat_ent_access_init(struct super_block *sb); 284extern void fat_ent_access_init(struct super_block *sb);
@@ -296,6 +300,8 @@ extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
296extern void fat_truncate(struct inode *inode); 300extern void fat_truncate(struct inode *inode);
297extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, 301extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
298 struct kstat *stat); 302 struct kstat *stat);
303extern int fat_file_fsync(struct file *file, struct dentry *dentry,
304 int datasync);
299 305
300/* fat/inode.c */ 306/* fat/inode.c */
301extern void fat_attach(struct inode *inode, loff_t i_pos); 307extern void fat_attach(struct inode *inode, loff_t i_pos);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index da6eea47872f..618f5305c2e4 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -73,6 +73,8 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent,
73 struct buffer_head **bhs = fatent->bhs; 73 struct buffer_head **bhs = fatent->bhs;
74 74
75 WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); 75 WARN_ON(blocknr < MSDOS_SB(sb)->fat_start);
76 fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
77
76 bhs[0] = sb_bread(sb, blocknr); 78 bhs[0] = sb_bread(sb, blocknr);
77 if (!bhs[0]) 79 if (!bhs[0])
78 goto err; 80 goto err;
@@ -103,6 +105,7 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
103 struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; 105 struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
104 106
105 WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); 107 WARN_ON(blocknr < MSDOS_SB(sb)->fat_start);
108 fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
106 fatent->bhs[0] = sb_bread(sb, blocknr); 109 fatent->bhs[0] = sb_bread(sb, blocknr);
107 if (!fatent->bhs[0]) { 110 if (!fatent->bhs[0]) {
108 printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", 111 printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n",
@@ -167,9 +170,9 @@ static void fat12_ent_put(struct fat_entry *fatent, int new)
167 } 170 }
168 spin_unlock(&fat12_entry_lock); 171 spin_unlock(&fat12_entry_lock);
169 172
170 mark_buffer_dirty(fatent->bhs[0]); 173 mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
171 if (fatent->nr_bhs == 2) 174 if (fatent->nr_bhs == 2)
172 mark_buffer_dirty(fatent->bhs[1]); 175 mark_buffer_dirty_inode(fatent->bhs[1], fatent->fat_inode);
173} 176}
174 177
175static void fat16_ent_put(struct fat_entry *fatent, int new) 178static void fat16_ent_put(struct fat_entry *fatent, int new)
@@ -178,7 +181,7 @@ static void fat16_ent_put(struct fat_entry *fatent, int new)
178 new = EOF_FAT16; 181 new = EOF_FAT16;
179 182
180 *fatent->u.ent16_p = cpu_to_le16(new); 183 *fatent->u.ent16_p = cpu_to_le16(new);
181 mark_buffer_dirty(fatent->bhs[0]); 184 mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
182} 185}
183 186
184static void fat32_ent_put(struct fat_entry *fatent, int new) 187static void fat32_ent_put(struct fat_entry *fatent, int new)
@@ -189,7 +192,7 @@ static void fat32_ent_put(struct fat_entry *fatent, int new)
189 WARN_ON(new & 0xf0000000); 192 WARN_ON(new & 0xf0000000);
190 new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff; 193 new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff;
191 *fatent->u.ent32_p = cpu_to_le32(new); 194 *fatent->u.ent32_p = cpu_to_le32(new);
192 mark_buffer_dirty(fatent->bhs[0]); 195 mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
193} 196}
194 197
195static int fat12_ent_next(struct fat_entry *fatent) 198static int fat12_ent_next(struct fat_entry *fatent)
@@ -381,7 +384,7 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs,
381 } 384 }
382 memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); 385 memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize);
383 set_buffer_uptodate(c_bh); 386 set_buffer_uptodate(c_bh);
384 mark_buffer_dirty(c_bh); 387 mark_buffer_dirty_inode(c_bh, sbi->fat_inode);
385 if (sb->s_flags & MS_SYNCHRONOUS) 388 if (sb->s_flags & MS_SYNCHRONOUS)
386 err = sync_dirty_buffer(c_bh); 389 err = sync_dirty_buffer(c_bh);
387 brelse(c_bh); 390 brelse(c_bh);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 0a7f4a9918b3..e955a56b4e5e 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -133,6 +133,18 @@ static int fat_file_release(struct inode *inode, struct file *filp)
133 return 0; 133 return 0;
134} 134}
135 135
136int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
137{
138 struct inode *inode = dentry->d_inode;
139 int res, err;
140
141 res = simple_fsync(filp, dentry, datasync);
142 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
143
144 return res ? res : err;
145}
146
147
136const struct file_operations fat_file_operations = { 148const struct file_operations fat_file_operations = {
137 .llseek = generic_file_llseek, 149 .llseek = generic_file_llseek,
138 .read = do_sync_read, 150 .read = do_sync_read,
@@ -142,7 +154,7 @@ const struct file_operations fat_file_operations = {
142 .mmap = generic_file_mmap, 154 .mmap = generic_file_mmap,
143 .release = fat_file_release, 155 .release = fat_file_release,
144 .ioctl = fat_generic_ioctl, 156 .ioctl = fat_generic_ioctl,
145 .fsync = file_fsync, 157 .fsync = fat_file_fsync,
146 .splice_read = generic_file_splice_read, 158 .splice_read = generic_file_splice_read,
147}; 159};
148 160
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 296785a0dec8..51a5ecf9000a 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -441,16 +441,35 @@ static void fat_clear_inode(struct inode *inode)
441 441
442static void fat_write_super(struct super_block *sb) 442static void fat_write_super(struct super_block *sb)
443{ 443{
444 lock_super(sb);
444 sb->s_dirt = 0; 445 sb->s_dirt = 0;
445 446
446 if (!(sb->s_flags & MS_RDONLY)) 447 if (!(sb->s_flags & MS_RDONLY))
447 fat_clusters_flush(sb); 448 fat_clusters_flush(sb);
449 unlock_super(sb);
450}
451
452static int fat_sync_fs(struct super_block *sb, int wait)
453{
454 lock_super(sb);
455 fat_clusters_flush(sb);
456 sb->s_dirt = 0;
457 unlock_super(sb);
458
459 return 0;
448} 460}
449 461
450static void fat_put_super(struct super_block *sb) 462static void fat_put_super(struct super_block *sb)
451{ 463{
452 struct msdos_sb_info *sbi = MSDOS_SB(sb); 464 struct msdos_sb_info *sbi = MSDOS_SB(sb);
453 465
466 lock_kernel();
467
468 if (sb->s_dirt)
469 fat_write_super(sb);
470
471 iput(sbi->fat_inode);
472
454 if (sbi->nls_disk) { 473 if (sbi->nls_disk) {
455 unload_nls(sbi->nls_disk); 474 unload_nls(sbi->nls_disk);
456 sbi->nls_disk = NULL; 475 sbi->nls_disk = NULL;
@@ -467,6 +486,8 @@ static void fat_put_super(struct super_block *sb)
467 486
468 sb->s_fs_info = NULL; 487 sb->s_fs_info = NULL;
469 kfree(sbi); 488 kfree(sbi);
489
490 unlock_kernel();
470} 491}
471 492
472static struct kmem_cache *fat_inode_cachep; 493static struct kmem_cache *fat_inode_cachep;
@@ -632,6 +653,7 @@ static const struct super_operations fat_sops = {
632 .delete_inode = fat_delete_inode, 653 .delete_inode = fat_delete_inode,
633 .put_super = fat_put_super, 654 .put_super = fat_put_super,
634 .write_super = fat_write_super, 655 .write_super = fat_write_super,
656 .sync_fs = fat_sync_fs,
635 .statfs = fat_statfs, 657 .statfs = fat_statfs,
636 .clear_inode = fat_clear_inode, 658 .clear_inode = fat_clear_inode,
637 .remount_fs = fat_remount, 659 .remount_fs = fat_remount,
@@ -1174,7 +1196,7 @@ static int fat_read_root(struct inode *inode)
1174int fat_fill_super(struct super_block *sb, void *data, int silent, 1196int fat_fill_super(struct super_block *sb, void *data, int silent,
1175 const struct inode_operations *fs_dir_inode_ops, int isvfat) 1197 const struct inode_operations *fs_dir_inode_ops, int isvfat)
1176{ 1198{
1177 struct inode *root_inode = NULL; 1199 struct inode *root_inode = NULL, *fat_inode = NULL;
1178 struct buffer_head *bh; 1200 struct buffer_head *bh;
1179 struct fat_boot_sector *b; 1201 struct fat_boot_sector *b;
1180 struct msdos_sb_info *sbi; 1202 struct msdos_sb_info *sbi;
@@ -1414,6 +1436,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1414 } 1436 }
1415 1437
1416 error = -ENOMEM; 1438 error = -ENOMEM;
1439 fat_inode = new_inode(sb);
1440 if (!fat_inode)
1441 goto out_fail;
1442 MSDOS_I(fat_inode)->i_pos = 0;
1443 sbi->fat_inode = fat_inode;
1417 root_inode = new_inode(sb); 1444 root_inode = new_inode(sb);
1418 if (!root_inode) 1445 if (!root_inode)
1419 goto out_fail; 1446 goto out_fail;
@@ -1439,6 +1466,8 @@ out_invalid:
1439 " on dev %s.\n", sb->s_id); 1466 " on dev %s.\n", sb->s_id);
1440 1467
1441out_fail: 1468out_fail:
1469 if (fat_inode)
1470 iput(fat_inode);
1442 if (root_inode) 1471 if (root_inode)
1443 iput(root_inode); 1472 iput(root_inode);
1444 if (sbi->nls_io) 1473 if (sbi->nls_io)
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index da3f361a37dd..20f522861355 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -544,7 +544,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
544 int start = MSDOS_I(new_dir)->i_logstart; 544 int start = MSDOS_I(new_dir)->i_logstart;
545 dotdot_de->start = cpu_to_le16(start); 545 dotdot_de->start = cpu_to_le16(start);
546 dotdot_de->starthi = cpu_to_le16(start >> 16); 546 dotdot_de->starthi = cpu_to_le16(start >> 16);
547 mark_buffer_dirty(dotdot_bh); 547 mark_buffer_dirty_inode(dotdot_bh, old_inode);
548 if (IS_DIRSYNC(new_dir)) { 548 if (IS_DIRSYNC(new_dir)) {
549 err = sync_dirty_buffer(dotdot_bh); 549 err = sync_dirty_buffer(dotdot_bh);
550 if (err) 550 if (err)
@@ -586,7 +586,7 @@ error_dotdot:
586 int start = MSDOS_I(old_dir)->i_logstart; 586 int start = MSDOS_I(old_dir)->i_logstart;
587 dotdot_de->start = cpu_to_le16(start); 587 dotdot_de->start = cpu_to_le16(start);
588 dotdot_de->starthi = cpu_to_le16(start >> 16); 588 dotdot_de->starthi = cpu_to_le16(start >> 16);
589 mark_buffer_dirty(dotdot_bh); 589 mark_buffer_dirty_inode(dotdot_bh, old_inode);
590 corrupt |= sync_dirty_buffer(dotdot_bh); 590 corrupt |= sync_dirty_buffer(dotdot_bh);
591 } 591 }
592error_inode: 592error_inode:
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index a0e00e3a46e9..b50ecbe97f83 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -965,7 +965,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
965 int start = MSDOS_I(new_dir)->i_logstart; 965 int start = MSDOS_I(new_dir)->i_logstart;
966 dotdot_de->start = cpu_to_le16(start); 966 dotdot_de->start = cpu_to_le16(start);
967 dotdot_de->starthi = cpu_to_le16(start >> 16); 967 dotdot_de->starthi = cpu_to_le16(start >> 16);
968 mark_buffer_dirty(dotdot_bh); 968 mark_buffer_dirty_inode(dotdot_bh, old_inode);
969 if (IS_DIRSYNC(new_dir)) { 969 if (IS_DIRSYNC(new_dir)) {
970 err = sync_dirty_buffer(dotdot_bh); 970 err = sync_dirty_buffer(dotdot_bh);
971 if (err) 971 if (err)
@@ -1009,7 +1009,7 @@ error_dotdot:
1009 int start = MSDOS_I(old_dir)->i_logstart; 1009 int start = MSDOS_I(old_dir)->i_logstart;
1010 dotdot_de->start = cpu_to_le16(start); 1010 dotdot_de->start = cpu_to_le16(start);
1011 dotdot_de->starthi = cpu_to_le16(start >> 16); 1011 dotdot_de->starthi = cpu_to_le16(start >> 16);
1012 mark_buffer_dirty(dotdot_bh); 1012 mark_buffer_dirty_inode(dotdot_bh, old_inode);
1013 corrupt |= sync_dirty_buffer(dotdot_bh); 1013 corrupt |= sync_dirty_buffer(dotdot_bh);
1014 } 1014 }
1015error_inode: 1015error_inode:
diff --git a/fs/file_table.c b/fs/file_table.c
index 54018fe48840..334ce39881f8 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -214,7 +214,7 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
214 */ 214 */
215 if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) { 215 if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
216 file_take_write(file); 216 file_take_write(file);
217 error = mnt_want_write(mnt); 217 error = mnt_clone_write(mnt);
218 WARN_ON(error); 218 WARN_ON(error);
219 } 219 }
220 return error; 220 return error;
@@ -399,6 +399,44 @@ too_bad:
399 return 0; 399 return 0;
400} 400}
401 401
402/**
403 * mark_files_ro - mark all files read-only
404 * @sb: superblock in question
405 *
406 * All files are marked read-only. We don't care about pending
407 * delete files so this should be used in 'force' mode only.
408 */
409void mark_files_ro(struct super_block *sb)
410{
411 struct file *f;
412
413retry:
414 file_list_lock();
415 list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
416 struct vfsmount *mnt;
417 if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
418 continue;
419 if (!file_count(f))
420 continue;
421 if (!(f->f_mode & FMODE_WRITE))
422 continue;
423 f->f_mode &= ~FMODE_WRITE;
424 if (file_check_writeable(f) != 0)
425 continue;
426 file_release_write(f);
427 mnt = mntget(f->f_path.mnt);
428 file_list_unlock();
429 /*
430 * This can sleep, so we can't hold
431 * the file_list_lock() spinlock.
432 */
433 mnt_drop_write(mnt);
434 mntput(mnt);
435 goto retry;
436 }
437 file_list_unlock();
438}
439
402void __init files_init(unsigned long mempages) 440void __init files_init(unsigned long mempages)
403{ 441{
404 int n; 442 int n;
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 1dacda831577..cdbd1654e4cd 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -80,12 +80,16 @@ vxfs_put_super(struct super_block *sbp)
80{ 80{
81 struct vxfs_sb_info *infp = VXFS_SBI(sbp); 81 struct vxfs_sb_info *infp = VXFS_SBI(sbp);
82 82
83 lock_kernel();
84
83 vxfs_put_fake_inode(infp->vsi_fship); 85 vxfs_put_fake_inode(infp->vsi_fship);
84 vxfs_put_fake_inode(infp->vsi_ilist); 86 vxfs_put_fake_inode(infp->vsi_ilist);
85 vxfs_put_fake_inode(infp->vsi_stilist); 87 vxfs_put_fake_inode(infp->vsi_stilist);
86 88
87 brelse(infp->vsi_bp); 89 brelse(infp->vsi_bp);
88 kfree(infp); 90 kfree(infp);
91
92 unlock_kernel();
89} 93}
90 94
91/** 95/**
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 91013ff7dd53..40308e98c6a4 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -64,6 +64,28 @@ static void writeback_release(struct backing_dev_info *bdi)
64 clear_bit(BDI_pdflush, &bdi->state); 64 clear_bit(BDI_pdflush, &bdi->state);
65} 65}
66 66
67static noinline void block_dump___mark_inode_dirty(struct inode *inode)
68{
69 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
70 struct dentry *dentry;
71 const char *name = "?";
72
73 dentry = d_find_alias(inode);
74 if (dentry) {
75 spin_lock(&dentry->d_lock);
76 name = (const char *) dentry->d_name.name;
77 }
78 printk(KERN_DEBUG
79 "%s(%d): dirtied inode %lu (%s) on %s\n",
80 current->comm, task_pid_nr(current), inode->i_ino,
81 name, inode->i_sb->s_id);
82 if (dentry) {
83 spin_unlock(&dentry->d_lock);
84 dput(dentry);
85 }
86 }
87}
88
67/** 89/**
68 * __mark_inode_dirty - internal function 90 * __mark_inode_dirty - internal function
69 * @inode: inode to mark 91 * @inode: inode to mark
@@ -114,23 +136,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
114 if ((inode->i_state & flags) == flags) 136 if ((inode->i_state & flags) == flags)
115 return; 137 return;
116 138
117 if (unlikely(block_dump)) { 139 if (unlikely(block_dump))
118 struct dentry *dentry = NULL; 140 block_dump___mark_inode_dirty(inode);
119 const char *name = "?";
120
121 if (!list_empty(&inode->i_dentry)) {
122 dentry = list_entry(inode->i_dentry.next,
123 struct dentry, d_alias);
124 if (dentry && dentry->d_name.name)
125 name = (const char *) dentry->d_name.name;
126 }
127
128 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev"))
129 printk(KERN_DEBUG
130 "%s(%d): dirtied inode %lu (%s) on %s\n",
131 current->comm, task_pid_nr(current), inode->i_ino,
132 name, inode->i_sb->s_id);
133 }
134 141
135 spin_lock(&inode_lock); 142 spin_lock(&inode_lock);
136 if ((inode->i_state & flags) != flags) { 143 if ((inode->i_state & flags) != flags) {
@@ -289,7 +296,6 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
289 int ret; 296 int ret;
290 297
291 BUG_ON(inode->i_state & I_SYNC); 298 BUG_ON(inode->i_state & I_SYNC);
292 WARN_ON(inode->i_state & I_NEW);
293 299
294 /* Set I_SYNC, reset I_DIRTY */ 300 /* Set I_SYNC, reset I_DIRTY */
295 dirty = inode->i_state & I_DIRTY; 301 dirty = inode->i_state & I_DIRTY;
@@ -314,7 +320,6 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
314 } 320 }
315 321
316 spin_lock(&inode_lock); 322 spin_lock(&inode_lock);
317 WARN_ON(inode->i_state & I_NEW);
318 inode->i_state &= ~I_SYNC; 323 inode->i_state &= ~I_SYNC;
319 if (!(inode->i_state & I_FREEING)) { 324 if (!(inode->i_state & I_FREEING)) {
320 if (!(inode->i_state & I_DIRTY) && 325 if (!(inode->i_state & I_DIRTY) &&
@@ -679,55 +684,6 @@ void sync_inodes_sb(struct super_block *sb, int wait)
679} 684}
680 685
681/** 686/**
682 * sync_inodes - writes all inodes to disk
683 * @wait: wait for completion
684 *
685 * sync_inodes() goes through each super block's dirty inode list, writes the
686 * inodes out, waits on the writeout and puts the inodes back on the normal
687 * list.
688 *
689 * This is for sys_sync(). fsync_dev() uses the same algorithm. The subtle
690 * part of the sync functions is that the blockdev "superblock" is processed
691 * last. This is because the write_inode() function of a typical fs will
692 * perform no I/O, but will mark buffers in the blockdev mapping as dirty.
693 * What we want to do is to perform all that dirtying first, and then write
694 * back all those inode blocks via the blockdev mapping in one sweep. So the
695 * additional (somewhat redundant) sync_blockdev() calls here are to make
696 * sure that really happens. Because if we call sync_inodes_sb(wait=1) with
697 * outstanding dirty inodes, the writeback goes block-at-a-time within the
698 * filesystem's write_inode(). This is extremely slow.
699 */
700static void __sync_inodes(int wait)
701{
702 struct super_block *sb;
703
704 spin_lock(&sb_lock);
705restart:
706 list_for_each_entry(sb, &super_blocks, s_list) {
707 sb->s_count++;
708 spin_unlock(&sb_lock);
709 down_read(&sb->s_umount);
710 if (sb->s_root) {
711 sync_inodes_sb(sb, wait);
712 sync_blockdev(sb->s_bdev);
713 }
714 up_read(&sb->s_umount);
715 spin_lock(&sb_lock);
716 if (__put_super_and_need_restart(sb))
717 goto restart;
718 }
719 spin_unlock(&sb_lock);
720}
721
722void sync_inodes(int wait)
723{
724 __sync_inodes(0);
725
726 if (wait)
727 __sync_inodes(1);
728}
729
730/**
731 * write_inode_now - write an inode to disk 687 * write_inode_now - write an inode to disk
732 * @inode: inode to write to disk 688 * @inode: inode to write to disk
733 * @sync: whether the write should be synchronous or not 689 * @sync: whether the write should be synchronous or not
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 72437065f6ad..e95eeb445e58 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -3,5 +3,6 @@
3# 3#
4 4
5obj-$(CONFIG_FUSE_FS) += fuse.o 5obj-$(CONFIG_FUSE_FS) += fuse.o
6obj-$(CONFIG_CUSE) += cuse.o
6 7
7fuse-objs := dev.o dir.o file.o inode.o control.o 8fuse-objs := dev.o dir.o file.o inode.o control.o
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
new file mode 100644
index 000000000000..de792dcf3274
--- /dev/null
+++ b/fs/fuse/cuse.c
@@ -0,0 +1,610 @@
1/*
2 * CUSE: Character device in Userspace
3 *
4 * Copyright (C) 2008-2009 SUSE Linux Products GmbH
5 * Copyright (C) 2008-2009 Tejun Heo <tj@kernel.org>
6 *
7 * This file is released under the GPLv2.
8 *
9 * CUSE enables character devices to be implemented from userland much
10 * like FUSE allows filesystems. On initialization /dev/cuse is
11 * created. By opening the file and replying to the CUSE_INIT request
12 * userland CUSE server can create a character device. After that the
13 * operation is very similar to FUSE.
14 *
15 * A CUSE instance involves the following objects.
16 *
17 * cuse_conn : contains fuse_conn and serves as bonding structure
18 * channel : file handle connected to the userland CUSE server
19 * cdev : the implemented character device
20 * dev : generic device for cdev
21 *
22 * Note that 'channel' is what 'dev' is in FUSE. As CUSE deals with
23 * devices, it's called 'channel' to reduce confusion.
24 *
25 * channel determines when the character device dies. When channel is
26 * closed, everything begins to destruct. The cuse_conn is taken off
27 * the lookup table preventing further access from cdev, cdev and
28 * generic device are removed and the base reference of cuse_conn is
29 * put.
30 *
31 * On each open, the matching cuse_conn is looked up and if found an
32 * additional reference is taken which is released when the file is
33 * closed.
34 */
35
36#include <linux/fuse.h>
37#include <linux/cdev.h>
38#include <linux/device.h>
39#include <linux/file.h>
40#include <linux/fs.h>
41#include <linux/kdev_t.h>
42#include <linux/kthread.h>
43#include <linux/list.h>
44#include <linux/magic.h>
45#include <linux/miscdevice.h>
46#include <linux/mutex.h>
47#include <linux/spinlock.h>
48#include <linux/stat.h>
49
50#include "fuse_i.h"
51
52#define CUSE_CONNTBL_LEN 64
53
54struct cuse_conn {
55 struct list_head list; /* linked on cuse_conntbl */
56 struct fuse_conn fc; /* fuse connection */
57 struct cdev *cdev; /* associated character device */
58 struct device *dev; /* device representing @cdev */
59
60 /* init parameters, set once during initialization */
61 bool unrestricted_ioctl;
62};
63
64static DEFINE_SPINLOCK(cuse_lock); /* protects cuse_conntbl */
65static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN];
66static struct class *cuse_class;
67
68static struct cuse_conn *fc_to_cc(struct fuse_conn *fc)
69{
70 return container_of(fc, struct cuse_conn, fc);
71}
72
73static struct list_head *cuse_conntbl_head(dev_t devt)
74{
75 return &cuse_conntbl[(MAJOR(devt) + MINOR(devt)) % CUSE_CONNTBL_LEN];
76}
77
78
79/**************************************************************************
80 * CUSE frontend operations
81 *
82 * These are file operations for the character device.
83 *
84 * On open, CUSE opens a file from the FUSE mnt and stores it to
85 * private_data of the open file. All other ops call FUSE ops on the
86 * FUSE file.
87 */
88
89static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
90 loff_t *ppos)
91{
92 loff_t pos = 0;
93
94 return fuse_direct_io(file, buf, count, &pos, 0);
95}
96
97static ssize_t cuse_write(struct file *file, const char __user *buf,
98 size_t count, loff_t *ppos)
99{
100 loff_t pos = 0;
101 /*
102 * No locking or generic_write_checks(), the server is
103 * responsible for locking and sanity checks.
104 */
105 return fuse_direct_io(file, buf, count, &pos, 1);
106}
107
108static int cuse_open(struct inode *inode, struct file *file)
109{
110 dev_t devt = inode->i_cdev->dev;
111 struct cuse_conn *cc = NULL, *pos;
112 int rc;
113
114 /* look up and get the connection */
115 spin_lock(&cuse_lock);
116 list_for_each_entry(pos, cuse_conntbl_head(devt), list)
117 if (pos->dev->devt == devt) {
118 fuse_conn_get(&pos->fc);
119 cc = pos;
120 break;
121 }
122 spin_unlock(&cuse_lock);
123
124 /* dead? */
125 if (!cc)
126 return -ENODEV;
127
128 /*
129 * Generic permission check is already done against the chrdev
130 * file, proceed to open.
131 */
132 rc = fuse_do_open(&cc->fc, 0, file, 0);
133 if (rc)
134 fuse_conn_put(&cc->fc);
135 return rc;
136}
137
138static int cuse_release(struct inode *inode, struct file *file)
139{
140 struct fuse_file *ff = file->private_data;
141 struct fuse_conn *fc = ff->fc;
142
143 fuse_sync_release(ff, file->f_flags);
144 fuse_conn_put(fc);
145
146 return 0;
147}
148
149static long cuse_file_ioctl(struct file *file, unsigned int cmd,
150 unsigned long arg)
151{
152 struct fuse_file *ff = file->private_data;
153 struct cuse_conn *cc = fc_to_cc(ff->fc);
154 unsigned int flags = 0;
155
156 if (cc->unrestricted_ioctl)
157 flags |= FUSE_IOCTL_UNRESTRICTED;
158
159 return fuse_do_ioctl(file, cmd, arg, flags);
160}
161
162static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd,
163 unsigned long arg)
164{
165 struct fuse_file *ff = file->private_data;
166 struct cuse_conn *cc = fc_to_cc(ff->fc);
167 unsigned int flags = FUSE_IOCTL_COMPAT;
168
169 if (cc->unrestricted_ioctl)
170 flags |= FUSE_IOCTL_UNRESTRICTED;
171
172 return fuse_do_ioctl(file, cmd, arg, flags);
173}
174
175static const struct file_operations cuse_frontend_fops = {
176 .owner = THIS_MODULE,
177 .read = cuse_read,
178 .write = cuse_write,
179 .open = cuse_open,
180 .release = cuse_release,
181 .unlocked_ioctl = cuse_file_ioctl,
182 .compat_ioctl = cuse_file_compat_ioctl,
183 .poll = fuse_file_poll,
184};
185
186
187/**************************************************************************
188 * CUSE channel initialization and destruction
189 */
190
191struct cuse_devinfo {
192 const char *name;
193};
194
195/**
196 * cuse_parse_one - parse one key=value pair
197 * @pp: i/o parameter for the current position
198 * @end: points to one past the end of the packed string
199 * @keyp: out parameter for key
200 * @valp: out parameter for value
201 *
202 * *@pp points to packed strings - "key0=val0\0key1=val1\0" which ends
203 * at @end - 1. This function parses one pair and set *@keyp to the
204 * start of the key and *@valp to the start of the value. Note that
205 * the original string is modified such that the key string is
206 * terminated with '\0'. *@pp is updated to point to the next string.
207 *
208 * RETURNS:
209 * 1 on successful parse, 0 on EOF, -errno on failure.
210 */
211static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp)
212{
213 char *p = *pp;
214 char *key, *val;
215
216 while (p < end && *p == '\0')
217 p++;
218 if (p == end)
219 return 0;
220
221 if (end[-1] != '\0') {
222 printk(KERN_ERR "CUSE: info not properly terminated\n");
223 return -EINVAL;
224 }
225
226 key = val = p;
227 p += strlen(p);
228
229 if (valp) {
230 strsep(&val, "=");
231 if (!val)
232 val = key + strlen(key);
233 key = strstrip(key);
234 val = strstrip(val);
235 } else
236 key = strstrip(key);
237
238 if (!strlen(key)) {
239 printk(KERN_ERR "CUSE: zero length info key specified\n");
240 return -EINVAL;
241 }
242
243 *pp = p;
244 *keyp = key;
245 if (valp)
246 *valp = val;
247
248 return 1;
249}
250
251/**
252 * cuse_parse_dev_info - parse device info
253 * @p: device info string
254 * @len: length of device info string
255 * @devinfo: out parameter for parsed device info
256 *
257 * Parse @p to extract device info and store it into @devinfo. String
258 * pointed to by @p is modified by parsing and @devinfo points into
259 * them, so @p shouldn't be freed while @devinfo is in use.
260 *
261 * RETURNS:
262 * 0 on success, -errno on failure.
263 */
264static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo)
265{
266 char *end = p + len;
267 char *key, *val;
268 int rc;
269
270 while (true) {
271 rc = cuse_parse_one(&p, end, &key, &val);
272 if (rc < 0)
273 return rc;
274 if (!rc)
275 break;
276 if (strcmp(key, "DEVNAME") == 0)
277 devinfo->name = val;
278 else
279 printk(KERN_WARNING "CUSE: unknown device info \"%s\"\n",
280 key);
281 }
282
283 if (!devinfo->name || !strlen(devinfo->name)) {
284 printk(KERN_ERR "CUSE: DEVNAME unspecified\n");
285 return -EINVAL;
286 }
287
288 return 0;
289}
290
291static void cuse_gendev_release(struct device *dev)
292{
293 kfree(dev);
294}
295
296/**
297 * cuse_process_init_reply - finish initializing CUSE channel
298 *
299 * This function creates the character device and sets up all the
300 * required data structures for it. Please read the comment at the
301 * top of this file for high level overview.
302 */
303static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
304{
305 struct cuse_conn *cc = fc_to_cc(fc);
306 struct cuse_init_out *arg = &req->misc.cuse_init_out;
307 struct page *page = req->pages[0];
308 struct cuse_devinfo devinfo = { };
309 struct device *dev;
310 struct cdev *cdev;
311 dev_t devt;
312 int rc;
313
314 if (req->out.h.error ||
315 arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) {
316 goto err;
317 }
318
319 fc->minor = arg->minor;
320 fc->max_read = max_t(unsigned, arg->max_read, 4096);
321 fc->max_write = max_t(unsigned, arg->max_write, 4096);
322
323 /* parse init reply */
324 cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL;
325
326 rc = cuse_parse_devinfo(page_address(page), req->out.args[1].size,
327 &devinfo);
328 if (rc)
329 goto err;
330
331 /* determine and reserve devt */
332 devt = MKDEV(arg->dev_major, arg->dev_minor);
333 if (!MAJOR(devt))
334 rc = alloc_chrdev_region(&devt, MINOR(devt), 1, devinfo.name);
335 else
336 rc = register_chrdev_region(devt, 1, devinfo.name);
337 if (rc) {
338 printk(KERN_ERR "CUSE: failed to register chrdev region\n");
339 goto err;
340 }
341
342 /* devt determined, create device */
343 rc = -ENOMEM;
344 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
345 if (!dev)
346 goto err_region;
347
348 device_initialize(dev);
349 dev_set_uevent_suppress(dev, 1);
350 dev->class = cuse_class;
351 dev->devt = devt;
352 dev->release = cuse_gendev_release;
353 dev_set_drvdata(dev, cc);
354 dev_set_name(dev, "%s", devinfo.name);
355
356 rc = device_add(dev);
357 if (rc)
358 goto err_device;
359
360 /* register cdev */
361 rc = -ENOMEM;
362 cdev = cdev_alloc();
363 if (!cdev)
364 goto err_device;
365
366 cdev->owner = THIS_MODULE;
367 cdev->ops = &cuse_frontend_fops;
368
369 rc = cdev_add(cdev, devt, 1);
370 if (rc)
371 goto err_cdev;
372
373 cc->dev = dev;
374 cc->cdev = cdev;
375
376 /* make the device available */
377 spin_lock(&cuse_lock);
378 list_add(&cc->list, cuse_conntbl_head(devt));
379 spin_unlock(&cuse_lock);
380
381 /* announce device availability */
382 dev_set_uevent_suppress(dev, 0);
383 kobject_uevent(&dev->kobj, KOBJ_ADD);
384out:
385 __free_page(page);
386 return;
387
388err_cdev:
389 cdev_del(cdev);
390err_device:
391 put_device(dev);
392err_region:
393 unregister_chrdev_region(devt, 1);
394err:
395 fc->conn_error = 1;
396 goto out;
397}
398
399static int cuse_send_init(struct cuse_conn *cc)
400{
401 int rc;
402 struct fuse_req *req;
403 struct page *page;
404 struct fuse_conn *fc = &cc->fc;
405 struct cuse_init_in *arg;
406
407 BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
408
409 req = fuse_get_req(fc);
410 if (IS_ERR(req)) {
411 rc = PTR_ERR(req);
412 goto err;
413 }
414
415 rc = -ENOMEM;
416 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
417 if (!page)
418 goto err_put_req;
419
420 arg = &req->misc.cuse_init_in;
421 arg->major = FUSE_KERNEL_VERSION;
422 arg->minor = FUSE_KERNEL_MINOR_VERSION;
423 arg->flags |= CUSE_UNRESTRICTED_IOCTL;
424 req->in.h.opcode = CUSE_INIT;
425 req->in.numargs = 1;
426 req->in.args[0].size = sizeof(struct cuse_init_in);
427 req->in.args[0].value = arg;
428 req->out.numargs = 2;
429 req->out.args[0].size = sizeof(struct cuse_init_out);
430 req->out.args[0].value = &req->misc.cuse_init_out;
431 req->out.args[1].size = CUSE_INIT_INFO_MAX;
432 req->out.argvar = 1;
433 req->out.argpages = 1;
434 req->pages[0] = page;
435 req->num_pages = 1;
436 req->end = cuse_process_init_reply;
437 fuse_request_send_background(fc, req);
438
439 return 0;
440
441err_put_req:
442 fuse_put_request(fc, req);
443err:
444 return rc;
445}
446
447static void cuse_fc_release(struct fuse_conn *fc)
448{
449 struct cuse_conn *cc = fc_to_cc(fc);
450 kfree(cc);
451}
452
453/**
454 * cuse_channel_open - open method for /dev/cuse
455 * @inode: inode for /dev/cuse
456 * @file: file struct being opened
457 *
458 * Userland CUSE server can create a CUSE device by opening /dev/cuse
459 * and replying to the initilaization request kernel sends. This
460 * function is responsible for handling CUSE device initialization.
461 * Because the fd opened by this function is used during
462 * initialization, this function only creates cuse_conn and sends
463 * init. The rest is delegated to a kthread.
464 *
465 * RETURNS:
466 * 0 on success, -errno on failure.
467 */
468static int cuse_channel_open(struct inode *inode, struct file *file)
469{
470 struct cuse_conn *cc;
471 int rc;
472
473 /* set up cuse_conn */
474 cc = kzalloc(sizeof(*cc), GFP_KERNEL);
475 if (!cc)
476 return -ENOMEM;
477
478 fuse_conn_init(&cc->fc);
479
480 INIT_LIST_HEAD(&cc->list);
481 cc->fc.release = cuse_fc_release;
482
483 cc->fc.connected = 1;
484 cc->fc.blocked = 0;
485 rc = cuse_send_init(cc);
486 if (rc) {
487 fuse_conn_put(&cc->fc);
488 return rc;
489 }
490 file->private_data = &cc->fc; /* channel owns base reference to cc */
491
492 return 0;
493}
494
495/**
496 * cuse_channel_release - release method for /dev/cuse
497 * @inode: inode for /dev/cuse
498 * @file: file struct being closed
499 *
500 * Disconnect the channel, deregister CUSE device and initiate
501 * destruction by putting the default reference.
502 *
503 * RETURNS:
504 * 0 on success, -errno on failure.
505 */
506static int cuse_channel_release(struct inode *inode, struct file *file)
507{
508 struct cuse_conn *cc = fc_to_cc(file->private_data);
509 int rc;
510
511 /* remove from the conntbl, no more access from this point on */
512 spin_lock(&cuse_lock);
513 list_del_init(&cc->list);
514 spin_unlock(&cuse_lock);
515
516 /* remove device */
517 if (cc->dev)
518 device_unregister(cc->dev);
519 if (cc->cdev) {
520 unregister_chrdev_region(cc->cdev->dev, 1);
521 cdev_del(cc->cdev);
522 }
523
524 /* kill connection and shutdown channel */
525 fuse_conn_kill(&cc->fc);
526 rc = fuse_dev_release(inode, file); /* puts the base reference */
527
528 return rc;
529}
530
531static struct file_operations cuse_channel_fops; /* initialized during init */
532
533
534/**************************************************************************
535 * Misc stuff and module initializatiion
536 *
537 * CUSE exports the same set of attributes to sysfs as fusectl.
538 */
539
540static ssize_t cuse_class_waiting_show(struct device *dev,
541 struct device_attribute *attr, char *buf)
542{
543 struct cuse_conn *cc = dev_get_drvdata(dev);
544
545 return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
546}
547
548static ssize_t cuse_class_abort_store(struct device *dev,
549 struct device_attribute *attr,
550 const char *buf, size_t count)
551{
552 struct cuse_conn *cc = dev_get_drvdata(dev);
553
554 fuse_abort_conn(&cc->fc);
555 return count;
556}
557
558static struct device_attribute cuse_class_dev_attrs[] = {
559 __ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL),
560 __ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store),
561 { }
562};
563
564static struct miscdevice cuse_miscdev = {
565 .minor = MISC_DYNAMIC_MINOR,
566 .name = "cuse",
567 .fops = &cuse_channel_fops,
568};
569
570static int __init cuse_init(void)
571{
572 int i, rc;
573
574 /* init conntbl */
575 for (i = 0; i < CUSE_CONNTBL_LEN; i++)
576 INIT_LIST_HEAD(&cuse_conntbl[i]);
577
578 /* inherit and extend fuse_dev_operations */
579 cuse_channel_fops = fuse_dev_operations;
580 cuse_channel_fops.owner = THIS_MODULE;
581 cuse_channel_fops.open = cuse_channel_open;
582 cuse_channel_fops.release = cuse_channel_release;
583
584 cuse_class = class_create(THIS_MODULE, "cuse");
585 if (IS_ERR(cuse_class))
586 return PTR_ERR(cuse_class);
587
588 cuse_class->dev_attrs = cuse_class_dev_attrs;
589
590 rc = misc_register(&cuse_miscdev);
591 if (rc) {
592 class_destroy(cuse_class);
593 return rc;
594 }
595
596 return 0;
597}
598
599static void __exit cuse_exit(void)
600{
601 misc_deregister(&cuse_miscdev);
602 class_destroy(cuse_class);
603}
604
605module_init(cuse_init);
606module_exit(cuse_exit);
607
608MODULE_AUTHOR("Tejun Heo <tj@kernel.org>");
609MODULE_DESCRIPTION("Character device in Userspace");
610MODULE_LICENSE("GPL");
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ba76b68c52ff..8fed2ed12f38 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -46,6 +46,7 @@ struct fuse_req *fuse_request_alloc(void)
46 fuse_request_init(req); 46 fuse_request_init(req);
47 return req; 47 return req;
48} 48}
49EXPORT_SYMBOL_GPL(fuse_request_alloc);
49 50
50struct fuse_req *fuse_request_alloc_nofs(void) 51struct fuse_req *fuse_request_alloc_nofs(void)
51{ 52{
@@ -124,6 +125,7 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
124 atomic_dec(&fc->num_waiting); 125 atomic_dec(&fc->num_waiting);
125 return ERR_PTR(err); 126 return ERR_PTR(err);
126} 127}
128EXPORT_SYMBOL_GPL(fuse_get_req);
127 129
128/* 130/*
129 * Return request in fuse_file->reserved_req. However that may 131 * Return request in fuse_file->reserved_req. However that may
@@ -208,6 +210,7 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
208 fuse_request_free(req); 210 fuse_request_free(req);
209 } 211 }
210} 212}
213EXPORT_SYMBOL_GPL(fuse_put_request);
211 214
212static unsigned len_args(unsigned numargs, struct fuse_arg *args) 215static unsigned len_args(unsigned numargs, struct fuse_arg *args)
213{ 216{
@@ -282,7 +285,7 @@ __releases(&fc->lock)
282 wake_up_all(&fc->blocked_waitq); 285 wake_up_all(&fc->blocked_waitq);
283 } 286 }
284 if (fc->num_background == FUSE_CONGESTION_THRESHOLD && 287 if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
285 fc->connected) { 288 fc->connected && fc->bdi_initialized) {
286 clear_bdi_congested(&fc->bdi, READ); 289 clear_bdi_congested(&fc->bdi, READ);
287 clear_bdi_congested(&fc->bdi, WRITE); 290 clear_bdi_congested(&fc->bdi, WRITE);
288 } 291 }
@@ -400,6 +403,7 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
400 } 403 }
401 spin_unlock(&fc->lock); 404 spin_unlock(&fc->lock);
402} 405}
406EXPORT_SYMBOL_GPL(fuse_request_send);
403 407
404static void fuse_request_send_nowait_locked(struct fuse_conn *fc, 408static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
405 struct fuse_req *req) 409 struct fuse_req *req)
@@ -408,7 +412,8 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
408 fc->num_background++; 412 fc->num_background++;
409 if (fc->num_background == FUSE_MAX_BACKGROUND) 413 if (fc->num_background == FUSE_MAX_BACKGROUND)
410 fc->blocked = 1; 414 fc->blocked = 1;
411 if (fc->num_background == FUSE_CONGESTION_THRESHOLD) { 415 if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
416 fc->bdi_initialized) {
412 set_bdi_congested(&fc->bdi, READ); 417 set_bdi_congested(&fc->bdi, READ);
413 set_bdi_congested(&fc->bdi, WRITE); 418 set_bdi_congested(&fc->bdi, WRITE);
414 } 419 }
@@ -439,6 +444,7 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
439 req->isreply = 1; 444 req->isreply = 1;
440 fuse_request_send_nowait(fc, req); 445 fuse_request_send_nowait(fc, req);
441} 446}
447EXPORT_SYMBOL_GPL(fuse_request_send_background);
442 448
443/* 449/*
444 * Called under fc->lock 450 * Called under fc->lock
@@ -1105,8 +1111,9 @@ void fuse_abort_conn(struct fuse_conn *fc)
1105 } 1111 }
1106 spin_unlock(&fc->lock); 1112 spin_unlock(&fc->lock);
1107} 1113}
1114EXPORT_SYMBOL_GPL(fuse_abort_conn);
1108 1115
1109static int fuse_dev_release(struct inode *inode, struct file *file) 1116int fuse_dev_release(struct inode *inode, struct file *file)
1110{ 1117{
1111 struct fuse_conn *fc = fuse_get_conn(file); 1118 struct fuse_conn *fc = fuse_get_conn(file);
1112 if (fc) { 1119 if (fc) {
@@ -1120,6 +1127,7 @@ static int fuse_dev_release(struct inode *inode, struct file *file)
1120 1127
1121 return 0; 1128 return 0;
1122} 1129}
1130EXPORT_SYMBOL_GPL(fuse_dev_release);
1123 1131
1124static int fuse_dev_fasync(int fd, struct file *file, int on) 1132static int fuse_dev_fasync(int fd, struct file *file, int on)
1125{ 1133{
@@ -1142,6 +1150,7 @@ const struct file_operations fuse_dev_operations = {
1142 .release = fuse_dev_release, 1150 .release = fuse_dev_release,
1143 .fasync = fuse_dev_fasync, 1151 .fasync = fuse_dev_fasync,
1144}; 1152};
1153EXPORT_SYMBOL_GPL(fuse_dev_operations);
1145 1154
1146static struct miscdevice fuse_miscdevice = { 1155static struct miscdevice fuse_miscdevice = {
1147 .minor = FUSE_MINOR, 1156 .minor = FUSE_MINOR,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8b8eebc5614b..b3089a083d30 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -362,19 +362,6 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
362} 362}
363 363
364/* 364/*
365 * Synchronous release for the case when something goes wrong in CREATE_OPEN
366 */
367static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff,
368 u64 nodeid, int flags)
369{
370 fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE);
371 ff->reserved_req->force = 1;
372 fuse_request_send(fc, ff->reserved_req);
373 fuse_put_request(fc, ff->reserved_req);
374 kfree(ff);
375}
376
377/*
378 * Atomic create+open operation 365 * Atomic create+open operation
379 * 366 *
380 * If the filesystem doesn't support this, then fall back to separate 367 * If the filesystem doesn't support this, then fall back to separate
@@ -445,12 +432,14 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
445 goto out_free_ff; 432 goto out_free_ff;
446 433
447 fuse_put_request(fc, req); 434 fuse_put_request(fc, req);
435 ff->fh = outopen.fh;
436 ff->nodeid = outentry.nodeid;
437 ff->open_flags = outopen.open_flags;
448 inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, 438 inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
449 &outentry.attr, entry_attr_timeout(&outentry), 0); 439 &outentry.attr, entry_attr_timeout(&outentry), 0);
450 if (!inode) { 440 if (!inode) {
451 flags &= ~(O_CREAT | O_EXCL | O_TRUNC); 441 flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
452 ff->fh = outopen.fh; 442 fuse_sync_release(ff, flags);
453 fuse_sync_release(fc, ff, outentry.nodeid, flags);
454 fuse_send_forget(fc, forget_req, outentry.nodeid, 1); 443 fuse_send_forget(fc, forget_req, outentry.nodeid, 1);
455 return -ENOMEM; 444 return -ENOMEM;
456 } 445 }
@@ -460,11 +449,11 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
460 fuse_invalidate_attr(dir); 449 fuse_invalidate_attr(dir);
461 file = lookup_instantiate_filp(nd, entry, generic_file_open); 450 file = lookup_instantiate_filp(nd, entry, generic_file_open);
462 if (IS_ERR(file)) { 451 if (IS_ERR(file)) {
463 ff->fh = outopen.fh; 452 fuse_sync_release(ff, flags);
464 fuse_sync_release(fc, ff, outentry.nodeid, flags);
465 return PTR_ERR(file); 453 return PTR_ERR(file);
466 } 454 }
467 fuse_finish_open(inode, file, ff, &outopen); 455 file->private_data = fuse_file_get(ff);
456 fuse_finish_open(inode, file);
468 return 0; 457 return 0;
469 458
470 out_free_ff: 459 out_free_ff:
@@ -1035,7 +1024,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
1035 req->out.argpages = 1; 1024 req->out.argpages = 1;
1036 req->num_pages = 1; 1025 req->num_pages = 1;
1037 req->pages[0] = page; 1026 req->pages[0] = page;
1038 fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR); 1027 fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR);
1039 fuse_request_send(fc, req); 1028 fuse_request_send(fc, req);
1040 nbytes = req->out.args[0].size; 1029 nbytes = req->out.args[0].size;
1041 err = req->out.h.error; 1030 err = req->out.h.error;
@@ -1101,12 +1090,14 @@ static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
1101 1090
1102static int fuse_dir_open(struct inode *inode, struct file *file) 1091static int fuse_dir_open(struct inode *inode, struct file *file)
1103{ 1092{
1104 return fuse_open_common(inode, file, 1); 1093 return fuse_open_common(inode, file, true);
1105} 1094}
1106 1095
1107static int fuse_dir_release(struct inode *inode, struct file *file) 1096static int fuse_dir_release(struct inode *inode, struct file *file)
1108{ 1097{
1109 return fuse_release_common(inode, file, 1); 1098 fuse_release_common(file, FUSE_RELEASEDIR);
1099
1100 return 0;
1110} 1101}
1111 1102
1112static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync) 1103static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 06f30e965676..fce6ce694fde 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -12,13 +12,13 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/module.h>
15 16
16static const struct file_operations fuse_direct_io_file_operations; 17static const struct file_operations fuse_direct_io_file_operations;
17 18
18static int fuse_send_open(struct inode *inode, struct file *file, int isdir, 19static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
19 struct fuse_open_out *outargp) 20 int opcode, struct fuse_open_out *outargp)
20{ 21{
21 struct fuse_conn *fc = get_fuse_conn(inode);
22 struct fuse_open_in inarg; 22 struct fuse_open_in inarg;
23 struct fuse_req *req; 23 struct fuse_req *req;
24 int err; 24 int err;
@@ -31,8 +31,8 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
31 inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); 31 inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
32 if (!fc->atomic_o_trunc) 32 if (!fc->atomic_o_trunc)
33 inarg.flags &= ~O_TRUNC; 33 inarg.flags &= ~O_TRUNC;
34 req->in.h.opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; 34 req->in.h.opcode = opcode;
35 req->in.h.nodeid = get_node_id(inode); 35 req->in.h.nodeid = nodeid;
36 req->in.numargs = 1; 36 req->in.numargs = 1;
37 req->in.args[0].size = sizeof(inarg); 37 req->in.args[0].size = sizeof(inarg);
38 req->in.args[0].value = &inarg; 38 req->in.args[0].value = &inarg;
@@ -49,22 +49,27 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
49struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) 49struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
50{ 50{
51 struct fuse_file *ff; 51 struct fuse_file *ff;
52
52 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); 53 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
53 if (ff) { 54 if (unlikely(!ff))
54 ff->reserved_req = fuse_request_alloc(); 55 return NULL;
55 if (!ff->reserved_req) { 56
56 kfree(ff); 57 ff->fc = fc;
57 return NULL; 58 ff->reserved_req = fuse_request_alloc();
58 } else { 59 if (unlikely(!ff->reserved_req)) {
59 INIT_LIST_HEAD(&ff->write_entry); 60 kfree(ff);
60 atomic_set(&ff->count, 0); 61 return NULL;
61 spin_lock(&fc->lock);
62 ff->kh = ++fc->khctr;
63 spin_unlock(&fc->lock);
64 }
65 RB_CLEAR_NODE(&ff->polled_node);
66 init_waitqueue_head(&ff->poll_wait);
67 } 62 }
63
64 INIT_LIST_HEAD(&ff->write_entry);
65 atomic_set(&ff->count, 0);
66 RB_CLEAR_NODE(&ff->polled_node);
67 init_waitqueue_head(&ff->poll_wait);
68
69 spin_lock(&fc->lock);
70 ff->kh = ++fc->khctr;
71 spin_unlock(&fc->lock);
72
68 return ff; 73 return ff;
69} 74}
70 75
@@ -74,7 +79,7 @@ void fuse_file_free(struct fuse_file *ff)
74 kfree(ff); 79 kfree(ff);
75} 80}
76 81
77static struct fuse_file *fuse_file_get(struct fuse_file *ff) 82struct fuse_file *fuse_file_get(struct fuse_file *ff)
78{ 83{
79 atomic_inc(&ff->count); 84 atomic_inc(&ff->count);
80 return ff; 85 return ff;
@@ -82,40 +87,65 @@ static struct fuse_file *fuse_file_get(struct fuse_file *ff)
82 87
83static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) 88static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
84{ 89{
85 dput(req->misc.release.dentry); 90 path_put(&req->misc.release.path);
86 mntput(req->misc.release.vfsmount);
87} 91}
88 92
89static void fuse_file_put(struct fuse_file *ff) 93static void fuse_file_put(struct fuse_file *ff)
90{ 94{
91 if (atomic_dec_and_test(&ff->count)) { 95 if (atomic_dec_and_test(&ff->count)) {
92 struct fuse_req *req = ff->reserved_req; 96 struct fuse_req *req = ff->reserved_req;
93 struct inode *inode = req->misc.release.dentry->d_inode; 97
94 struct fuse_conn *fc = get_fuse_conn(inode);
95 req->end = fuse_release_end; 98 req->end = fuse_release_end;
96 fuse_request_send_background(fc, req); 99 fuse_request_send_background(ff->fc, req);
97 kfree(ff); 100 kfree(ff);
98 } 101 }
99} 102}
100 103
101void fuse_finish_open(struct inode *inode, struct file *file, 104int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
102 struct fuse_file *ff, struct fuse_open_out *outarg) 105 bool isdir)
103{ 106{
104 if (outarg->open_flags & FOPEN_DIRECT_IO) 107 struct fuse_open_out outarg;
108 struct fuse_file *ff;
109 int err;
110 int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
111
112 ff = fuse_file_alloc(fc);
113 if (!ff)
114 return -ENOMEM;
115
116 err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
117 if (err) {
118 fuse_file_free(ff);
119 return err;
120 }
121
122 if (isdir)
123 outarg.open_flags &= ~FOPEN_DIRECT_IO;
124
125 ff->fh = outarg.fh;
126 ff->nodeid = nodeid;
127 ff->open_flags = outarg.open_flags;
128 file->private_data = fuse_file_get(ff);
129
130 return 0;
131}
132EXPORT_SYMBOL_GPL(fuse_do_open);
133
134void fuse_finish_open(struct inode *inode, struct file *file)
135{
136 struct fuse_file *ff = file->private_data;
137
138 if (ff->open_flags & FOPEN_DIRECT_IO)
105 file->f_op = &fuse_direct_io_file_operations; 139 file->f_op = &fuse_direct_io_file_operations;
106 if (!(outarg->open_flags & FOPEN_KEEP_CACHE)) 140 if (!(ff->open_flags & FOPEN_KEEP_CACHE))
107 invalidate_inode_pages2(inode->i_mapping); 141 invalidate_inode_pages2(inode->i_mapping);
108 if (outarg->open_flags & FOPEN_NONSEEKABLE) 142 if (ff->open_flags & FOPEN_NONSEEKABLE)
109 nonseekable_open(inode, file); 143 nonseekable_open(inode, file);
110 ff->fh = outarg->fh;
111 file->private_data = fuse_file_get(ff);
112} 144}
113 145
114int fuse_open_common(struct inode *inode, struct file *file, int isdir) 146int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
115{ 147{
116 struct fuse_conn *fc = get_fuse_conn(inode); 148 struct fuse_conn *fc = get_fuse_conn(inode);
117 struct fuse_open_out outarg;
118 struct fuse_file *ff;
119 int err; 149 int err;
120 150
121 /* VFS checks this, but only _after_ ->open() */ 151 /* VFS checks this, but only _after_ ->open() */
@@ -126,78 +156,85 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
126 if (err) 156 if (err)
127 return err; 157 return err;
128 158
129 ff = fuse_file_alloc(fc); 159 err = fuse_do_open(fc, get_node_id(inode), file, isdir);
130 if (!ff)
131 return -ENOMEM;
132
133 err = fuse_send_open(inode, file, isdir, &outarg);
134 if (err) 160 if (err)
135 fuse_file_free(ff); 161 return err;
136 else {
137 if (isdir)
138 outarg.open_flags &= ~FOPEN_DIRECT_IO;
139 fuse_finish_open(inode, file, ff, &outarg);
140 }
141 162
142 return err; 163 fuse_finish_open(inode, file);
164
165 return 0;
143} 166}
144 167
145void fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags, int opcode) 168static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
146{ 169{
170 struct fuse_conn *fc = ff->fc;
147 struct fuse_req *req = ff->reserved_req; 171 struct fuse_req *req = ff->reserved_req;
148 struct fuse_release_in *inarg = &req->misc.release.in; 172 struct fuse_release_in *inarg = &req->misc.release.in;
149 173
174 spin_lock(&fc->lock);
175 list_del(&ff->write_entry);
176 if (!RB_EMPTY_NODE(&ff->polled_node))
177 rb_erase(&ff->polled_node, &fc->polled_files);
178 spin_unlock(&fc->lock);
179
180 wake_up_interruptible_sync(&ff->poll_wait);
181
150 inarg->fh = ff->fh; 182 inarg->fh = ff->fh;
151 inarg->flags = flags; 183 inarg->flags = flags;
152 req->in.h.opcode = opcode; 184 req->in.h.opcode = opcode;
153 req->in.h.nodeid = nodeid; 185 req->in.h.nodeid = ff->nodeid;
154 req->in.numargs = 1; 186 req->in.numargs = 1;
155 req->in.args[0].size = sizeof(struct fuse_release_in); 187 req->in.args[0].size = sizeof(struct fuse_release_in);
156 req->in.args[0].value = inarg; 188 req->in.args[0].value = inarg;
157} 189}
158 190
159int fuse_release_common(struct inode *inode, struct file *file, int isdir) 191void fuse_release_common(struct file *file, int opcode)
160{ 192{
161 struct fuse_file *ff = file->private_data; 193 struct fuse_file *ff;
162 if (ff) { 194 struct fuse_req *req;
163 struct fuse_conn *fc = get_fuse_conn(inode);
164 struct fuse_req *req = ff->reserved_req;
165
166 fuse_release_fill(ff, get_node_id(inode), file->f_flags,
167 isdir ? FUSE_RELEASEDIR : FUSE_RELEASE);
168 195
169 /* Hold vfsmount and dentry until release is finished */ 196 ff = file->private_data;
170 req->misc.release.vfsmount = mntget(file->f_path.mnt); 197 if (unlikely(!ff))
171 req->misc.release.dentry = dget(file->f_path.dentry); 198 return;
172 199
173 spin_lock(&fc->lock); 200 req = ff->reserved_req;
174 list_del(&ff->write_entry); 201 fuse_prepare_release(ff, file->f_flags, opcode);
175 if (!RB_EMPTY_NODE(&ff->polled_node))
176 rb_erase(&ff->polled_node, &fc->polled_files);
177 spin_unlock(&fc->lock);
178 202
179 wake_up_interruptible_sync(&ff->poll_wait); 203 /* Hold vfsmount and dentry until release is finished */
180 /* 204 path_get(&file->f_path);
181 * Normally this will send the RELEASE request, 205 req->misc.release.path = file->f_path;
182 * however if some asynchronous READ or WRITE requests
183 * are outstanding, the sending will be delayed
184 */
185 fuse_file_put(ff);
186 }
187 206
188 /* Return value is ignored by VFS */ 207 /*
189 return 0; 208 * Normally this will send the RELEASE request, however if
209 * some asynchronous READ or WRITE requests are outstanding,
210 * the sending will be delayed.
211 */
212 fuse_file_put(ff);
190} 213}
191 214
192static int fuse_open(struct inode *inode, struct file *file) 215static int fuse_open(struct inode *inode, struct file *file)
193{ 216{
194 return fuse_open_common(inode, file, 0); 217 return fuse_open_common(inode, file, false);
195} 218}
196 219
197static int fuse_release(struct inode *inode, struct file *file) 220static int fuse_release(struct inode *inode, struct file *file)
198{ 221{
199 return fuse_release_common(inode, file, 0); 222 fuse_release_common(file, FUSE_RELEASE);
223
224 /* return value is ignored by VFS */
225 return 0;
226}
227
228void fuse_sync_release(struct fuse_file *ff, int flags)
229{
230 WARN_ON(atomic_read(&ff->count) > 1);
231 fuse_prepare_release(ff, flags, FUSE_RELEASE);
232 ff->reserved_req->force = 1;
233 fuse_request_send(ff->fc, ff->reserved_req);
234 fuse_put_request(ff->fc, ff->reserved_req);
235 kfree(ff);
200} 236}
237EXPORT_SYMBOL_GPL(fuse_sync_release);
201 238
202/* 239/*
203 * Scramble the ID space with XTEA, so that the value of the files_struct 240 * Scramble the ID space with XTEA, so that the value of the files_struct
@@ -371,8 +408,8 @@ static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
371 return fuse_fsync_common(file, de, datasync, 0); 408 return fuse_fsync_common(file, de, datasync, 0);
372} 409}
373 410
374void fuse_read_fill(struct fuse_req *req, struct file *file, 411void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
375 struct inode *inode, loff_t pos, size_t count, int opcode) 412 size_t count, int opcode)
376{ 413{
377 struct fuse_read_in *inarg = &req->misc.read.in; 414 struct fuse_read_in *inarg = &req->misc.read.in;
378 struct fuse_file *ff = file->private_data; 415 struct fuse_file *ff = file->private_data;
@@ -382,7 +419,7 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
382 inarg->size = count; 419 inarg->size = count;
383 inarg->flags = file->f_flags; 420 inarg->flags = file->f_flags;
384 req->in.h.opcode = opcode; 421 req->in.h.opcode = opcode;
385 req->in.h.nodeid = get_node_id(inode); 422 req->in.h.nodeid = ff->nodeid;
386 req->in.numargs = 1; 423 req->in.numargs = 1;
387 req->in.args[0].size = sizeof(struct fuse_read_in); 424 req->in.args[0].size = sizeof(struct fuse_read_in);
388 req->in.args[0].value = inarg; 425 req->in.args[0].value = inarg;
@@ -392,12 +429,12 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
392} 429}
393 430
394static size_t fuse_send_read(struct fuse_req *req, struct file *file, 431static size_t fuse_send_read(struct fuse_req *req, struct file *file,
395 struct inode *inode, loff_t pos, size_t count, 432 loff_t pos, size_t count, fl_owner_t owner)
396 fl_owner_t owner)
397{ 433{
398 struct fuse_conn *fc = get_fuse_conn(inode); 434 struct fuse_file *ff = file->private_data;
435 struct fuse_conn *fc = ff->fc;
399 436
400 fuse_read_fill(req, file, inode, pos, count, FUSE_READ); 437 fuse_read_fill(req, file, pos, count, FUSE_READ);
401 if (owner != NULL) { 438 if (owner != NULL) {
402 struct fuse_read_in *inarg = &req->misc.read.in; 439 struct fuse_read_in *inarg = &req->misc.read.in;
403 440
@@ -455,7 +492,7 @@ static int fuse_readpage(struct file *file, struct page *page)
455 req->out.argpages = 1; 492 req->out.argpages = 1;
456 req->num_pages = 1; 493 req->num_pages = 1;
457 req->pages[0] = page; 494 req->pages[0] = page;
458 num_read = fuse_send_read(req, file, inode, pos, count, NULL); 495 num_read = fuse_send_read(req, file, pos, count, NULL);
459 err = req->out.h.error; 496 err = req->out.h.error;
460 fuse_put_request(fc, req); 497 fuse_put_request(fc, req);
461 498
@@ -504,19 +541,18 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
504 fuse_file_put(req->ff); 541 fuse_file_put(req->ff);
505} 542}
506 543
507static void fuse_send_readpages(struct fuse_req *req, struct file *file, 544static void fuse_send_readpages(struct fuse_req *req, struct file *file)
508 struct inode *inode)
509{ 545{
510 struct fuse_conn *fc = get_fuse_conn(inode); 546 struct fuse_file *ff = file->private_data;
547 struct fuse_conn *fc = ff->fc;
511 loff_t pos = page_offset(req->pages[0]); 548 loff_t pos = page_offset(req->pages[0]);
512 size_t count = req->num_pages << PAGE_CACHE_SHIFT; 549 size_t count = req->num_pages << PAGE_CACHE_SHIFT;
513 550
514 req->out.argpages = 1; 551 req->out.argpages = 1;
515 req->out.page_zeroing = 1; 552 req->out.page_zeroing = 1;
516 fuse_read_fill(req, file, inode, pos, count, FUSE_READ); 553 fuse_read_fill(req, file, pos, count, FUSE_READ);
517 req->misc.read.attr_ver = fuse_get_attr_version(fc); 554 req->misc.read.attr_ver = fuse_get_attr_version(fc);
518 if (fc->async_read) { 555 if (fc->async_read) {
519 struct fuse_file *ff = file->private_data;
520 req->ff = fuse_file_get(ff); 556 req->ff = fuse_file_get(ff);
521 req->end = fuse_readpages_end; 557 req->end = fuse_readpages_end;
522 fuse_request_send_background(fc, req); 558 fuse_request_send_background(fc, req);
@@ -546,7 +582,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
546 (req->num_pages == FUSE_MAX_PAGES_PER_REQ || 582 (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
547 (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || 583 (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
548 req->pages[req->num_pages - 1]->index + 1 != page->index)) { 584 req->pages[req->num_pages - 1]->index + 1 != page->index)) {
549 fuse_send_readpages(req, data->file, inode); 585 fuse_send_readpages(req, data->file);
550 data->req = req = fuse_get_req(fc); 586 data->req = req = fuse_get_req(fc);
551 if (IS_ERR(req)) { 587 if (IS_ERR(req)) {
552 unlock_page(page); 588 unlock_page(page);
@@ -580,7 +616,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
580 err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); 616 err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
581 if (!err) { 617 if (!err) {
582 if (data.req->num_pages) 618 if (data.req->num_pages)
583 fuse_send_readpages(data.req, file, inode); 619 fuse_send_readpages(data.req, file);
584 else 620 else
585 fuse_put_request(fc, data.req); 621 fuse_put_request(fc, data.req);
586 } 622 }
@@ -607,24 +643,19 @@ static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
607 return generic_file_aio_read(iocb, iov, nr_segs, pos); 643 return generic_file_aio_read(iocb, iov, nr_segs, pos);
608} 644}
609 645
610static void fuse_write_fill(struct fuse_req *req, struct file *file, 646static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
611 struct fuse_file *ff, struct inode *inode, 647 loff_t pos, size_t count)
612 loff_t pos, size_t count, int writepage)
613{ 648{
614 struct fuse_conn *fc = get_fuse_conn(inode);
615 struct fuse_write_in *inarg = &req->misc.write.in; 649 struct fuse_write_in *inarg = &req->misc.write.in;
616 struct fuse_write_out *outarg = &req->misc.write.out; 650 struct fuse_write_out *outarg = &req->misc.write.out;
617 651
618 memset(inarg, 0, sizeof(struct fuse_write_in));
619 inarg->fh = ff->fh; 652 inarg->fh = ff->fh;
620 inarg->offset = pos; 653 inarg->offset = pos;
621 inarg->size = count; 654 inarg->size = count;
622 inarg->write_flags = writepage ? FUSE_WRITE_CACHE : 0;
623 inarg->flags = file ? file->f_flags : 0;
624 req->in.h.opcode = FUSE_WRITE; 655 req->in.h.opcode = FUSE_WRITE;
625 req->in.h.nodeid = get_node_id(inode); 656 req->in.h.nodeid = ff->nodeid;
626 req->in.numargs = 2; 657 req->in.numargs = 2;
627 if (fc->minor < 9) 658 if (ff->fc->minor < 9)
628 req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; 659 req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
629 else 660 else
630 req->in.args[0].size = sizeof(struct fuse_write_in); 661 req->in.args[0].size = sizeof(struct fuse_write_in);
@@ -636,13 +667,15 @@ static void fuse_write_fill(struct fuse_req *req, struct file *file,
636} 667}
637 668
638static size_t fuse_send_write(struct fuse_req *req, struct file *file, 669static size_t fuse_send_write(struct fuse_req *req, struct file *file,
639 struct inode *inode, loff_t pos, size_t count, 670 loff_t pos, size_t count, fl_owner_t owner)
640 fl_owner_t owner)
641{ 671{
642 struct fuse_conn *fc = get_fuse_conn(inode); 672 struct fuse_file *ff = file->private_data;
643 fuse_write_fill(req, file, file->private_data, inode, pos, count, 0); 673 struct fuse_conn *fc = ff->fc;
674 struct fuse_write_in *inarg = &req->misc.write.in;
675
676 fuse_write_fill(req, ff, pos, count);
677 inarg->flags = file->f_flags;
644 if (owner != NULL) { 678 if (owner != NULL) {
645 struct fuse_write_in *inarg = &req->misc.write.in;
646 inarg->write_flags |= FUSE_WRITE_LOCKOWNER; 679 inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
647 inarg->lock_owner = fuse_lock_owner_id(fc, owner); 680 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
648 } 681 }
@@ -700,7 +733,7 @@ static int fuse_buffered_write(struct file *file, struct inode *inode,
700 req->num_pages = 1; 733 req->num_pages = 1;
701 req->pages[0] = page; 734 req->pages[0] = page;
702 req->page_offset = offset; 735 req->page_offset = offset;
703 nres = fuse_send_write(req, file, inode, pos, count, NULL); 736 nres = fuse_send_write(req, file, pos, count, NULL);
704 err = req->out.h.error; 737 err = req->out.h.error;
705 fuse_put_request(fc, req); 738 fuse_put_request(fc, req);
706 if (!err && !nres) 739 if (!err && !nres)
@@ -741,7 +774,7 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
741 for (i = 0; i < req->num_pages; i++) 774 for (i = 0; i < req->num_pages; i++)
742 fuse_wait_on_page_writeback(inode, req->pages[i]->index); 775 fuse_wait_on_page_writeback(inode, req->pages[i]->index);
743 776
744 res = fuse_send_write(req, file, inode, pos, count, NULL); 777 res = fuse_send_write(req, file, pos, count, NULL);
745 778
746 offset = req->page_offset; 779 offset = req->page_offset;
747 count = res; 780 count = res;
@@ -979,25 +1012,23 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
979 return 0; 1012 return 0;
980} 1013}
981 1014
982static ssize_t fuse_direct_io(struct file *file, const char __user *buf, 1015ssize_t fuse_direct_io(struct file *file, const char __user *buf,
983 size_t count, loff_t *ppos, int write) 1016 size_t count, loff_t *ppos, int write)
984{ 1017{
985 struct inode *inode = file->f_path.dentry->d_inode; 1018 struct fuse_file *ff = file->private_data;
986 struct fuse_conn *fc = get_fuse_conn(inode); 1019 struct fuse_conn *fc = ff->fc;
987 size_t nmax = write ? fc->max_write : fc->max_read; 1020 size_t nmax = write ? fc->max_write : fc->max_read;
988 loff_t pos = *ppos; 1021 loff_t pos = *ppos;
989 ssize_t res = 0; 1022 ssize_t res = 0;
990 struct fuse_req *req; 1023 struct fuse_req *req;
991 1024
992 if (is_bad_inode(inode))
993 return -EIO;
994
995 req = fuse_get_req(fc); 1025 req = fuse_get_req(fc);
996 if (IS_ERR(req)) 1026 if (IS_ERR(req))
997 return PTR_ERR(req); 1027 return PTR_ERR(req);
998 1028
999 while (count) { 1029 while (count) {
1000 size_t nres; 1030 size_t nres;
1031 fl_owner_t owner = current->files;
1001 size_t nbytes = min(count, nmax); 1032 size_t nbytes = min(count, nmax);
1002 int err = fuse_get_user_pages(req, buf, &nbytes, write); 1033 int err = fuse_get_user_pages(req, buf, &nbytes, write);
1003 if (err) { 1034 if (err) {
@@ -1006,11 +1037,10 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
1006 } 1037 }
1007 1038
1008 if (write) 1039 if (write)
1009 nres = fuse_send_write(req, file, inode, pos, nbytes, 1040 nres = fuse_send_write(req, file, pos, nbytes, owner);
1010 current->files);
1011 else 1041 else
1012 nres = fuse_send_read(req, file, inode, pos, nbytes, 1042 nres = fuse_send_read(req, file, pos, nbytes, owner);
1013 current->files); 1043
1014 fuse_release_user_pages(req, !write); 1044 fuse_release_user_pages(req, !write);
1015 if (req->out.h.error) { 1045 if (req->out.h.error) {
1016 if (!res) 1046 if (!res)
@@ -1034,20 +1064,27 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
1034 } 1064 }
1035 } 1065 }
1036 fuse_put_request(fc, req); 1066 fuse_put_request(fc, req);
1037 if (res > 0) { 1067 if (res > 0)
1038 if (write)
1039 fuse_write_update_size(inode, pos);
1040 *ppos = pos; 1068 *ppos = pos;
1041 }
1042 fuse_invalidate_attr(inode);
1043 1069
1044 return res; 1070 return res;
1045} 1071}
1072EXPORT_SYMBOL_GPL(fuse_direct_io);
1046 1073
1047static ssize_t fuse_direct_read(struct file *file, char __user *buf, 1074static ssize_t fuse_direct_read(struct file *file, char __user *buf,
1048 size_t count, loff_t *ppos) 1075 size_t count, loff_t *ppos)
1049{ 1076{
1050 return fuse_direct_io(file, buf, count, ppos, 0); 1077 ssize_t res;
1078 struct inode *inode = file->f_path.dentry->d_inode;
1079
1080 if (is_bad_inode(inode))
1081 return -EIO;
1082
1083 res = fuse_direct_io(file, buf, count, ppos, 0);
1084
1085 fuse_invalidate_attr(inode);
1086
1087 return res;
1051} 1088}
1052 1089
1053static ssize_t fuse_direct_write(struct file *file, const char __user *buf, 1090static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
@@ -1055,12 +1092,22 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
1055{ 1092{
1056 struct inode *inode = file->f_path.dentry->d_inode; 1093 struct inode *inode = file->f_path.dentry->d_inode;
1057 ssize_t res; 1094 ssize_t res;
1095
1096 if (is_bad_inode(inode))
1097 return -EIO;
1098
1058 /* Don't allow parallel writes to the same file */ 1099 /* Don't allow parallel writes to the same file */
1059 mutex_lock(&inode->i_mutex); 1100 mutex_lock(&inode->i_mutex);
1060 res = generic_write_checks(file, ppos, &count, 0); 1101 res = generic_write_checks(file, ppos, &count, 0);
1061 if (!res) 1102 if (!res) {
1062 res = fuse_direct_io(file, buf, count, ppos, 1); 1103 res = fuse_direct_io(file, buf, count, ppos, 1);
1104 if (res > 0)
1105 fuse_write_update_size(inode, *ppos);
1106 }
1063 mutex_unlock(&inode->i_mutex); 1107 mutex_unlock(&inode->i_mutex);
1108
1109 fuse_invalidate_attr(inode);
1110
1064 return res; 1111 return res;
1065} 1112}
1066 1113
@@ -1177,9 +1224,10 @@ static int fuse_writepage_locked(struct page *page)
1177 req->ff = fuse_file_get(ff); 1224 req->ff = fuse_file_get(ff);
1178 spin_unlock(&fc->lock); 1225 spin_unlock(&fc->lock);
1179 1226
1180 fuse_write_fill(req, NULL, ff, inode, page_offset(page), 0, 1); 1227 fuse_write_fill(req, ff, page_offset(page), 0);
1181 1228
1182 copy_highpage(tmp_page, page); 1229 copy_highpage(tmp_page, page);
1230 req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
1183 req->in.argpages = 1; 1231 req->in.argpages = 1;
1184 req->num_pages = 1; 1232 req->num_pages = 1;
1185 req->pages[0] = tmp_page; 1233 req->pages[0] = tmp_page;
@@ -1603,12 +1651,11 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1603 * limits ioctl data transfers to well-formed ioctls and is the forced 1651 * limits ioctl data transfers to well-formed ioctls and is the forced
1604 * behavior for all FUSE servers. 1652 * behavior for all FUSE servers.
1605 */ 1653 */
1606static long fuse_file_do_ioctl(struct file *file, unsigned int cmd, 1654long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1607 unsigned long arg, unsigned int flags) 1655 unsigned int flags)
1608{ 1656{
1609 struct inode *inode = file->f_dentry->d_inode;
1610 struct fuse_file *ff = file->private_data; 1657 struct fuse_file *ff = file->private_data;
1611 struct fuse_conn *fc = get_fuse_conn(inode); 1658 struct fuse_conn *fc = ff->fc;
1612 struct fuse_ioctl_in inarg = { 1659 struct fuse_ioctl_in inarg = {
1613 .fh = ff->fh, 1660 .fh = ff->fh,
1614 .cmd = cmd, 1661 .cmd = cmd,
@@ -1627,13 +1674,6 @@ static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
1627 /* assume all the iovs returned by client always fits in a page */ 1674 /* assume all the iovs returned by client always fits in a page */
1628 BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); 1675 BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
1629 1676
1630 if (!fuse_allow_task(fc, current))
1631 return -EACCES;
1632
1633 err = -EIO;
1634 if (is_bad_inode(inode))
1635 goto out;
1636
1637 err = -ENOMEM; 1677 err = -ENOMEM;
1638 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); 1678 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
1639 iov_page = alloc_page(GFP_KERNEL); 1679 iov_page = alloc_page(GFP_KERNEL);
@@ -1694,7 +1734,7 @@ static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
1694 1734
1695 /* okay, let's send it to the client */ 1735 /* okay, let's send it to the client */
1696 req->in.h.opcode = FUSE_IOCTL; 1736 req->in.h.opcode = FUSE_IOCTL;
1697 req->in.h.nodeid = get_node_id(inode); 1737 req->in.h.nodeid = ff->nodeid;
1698 req->in.numargs = 1; 1738 req->in.numargs = 1;
1699 req->in.args[0].size = sizeof(inarg); 1739 req->in.args[0].size = sizeof(inarg);
1700 req->in.args[0].value = &inarg; 1740 req->in.args[0].value = &inarg;
@@ -1777,17 +1817,33 @@ static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
1777 1817
1778 return err ? err : outarg.result; 1818 return err ? err : outarg.result;
1779} 1819}
1820EXPORT_SYMBOL_GPL(fuse_do_ioctl);
1821
1822static long fuse_file_ioctl_common(struct file *file, unsigned int cmd,
1823 unsigned long arg, unsigned int flags)
1824{
1825 struct inode *inode = file->f_dentry->d_inode;
1826 struct fuse_conn *fc = get_fuse_conn(inode);
1827
1828 if (!fuse_allow_task(fc, current))
1829 return -EACCES;
1830
1831 if (is_bad_inode(inode))
1832 return -EIO;
1833
1834 return fuse_do_ioctl(file, cmd, arg, flags);
1835}
1780 1836
1781static long fuse_file_ioctl(struct file *file, unsigned int cmd, 1837static long fuse_file_ioctl(struct file *file, unsigned int cmd,
1782 unsigned long arg) 1838 unsigned long arg)
1783{ 1839{
1784 return fuse_file_do_ioctl(file, cmd, arg, 0); 1840 return fuse_file_ioctl_common(file, cmd, arg, 0);
1785} 1841}
1786 1842
1787static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, 1843static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
1788 unsigned long arg) 1844 unsigned long arg)
1789{ 1845{
1790 return fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT); 1846 return fuse_file_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
1791} 1847}
1792 1848
1793/* 1849/*
@@ -1841,11 +1897,10 @@ static void fuse_register_polled_file(struct fuse_conn *fc,
1841 spin_unlock(&fc->lock); 1897 spin_unlock(&fc->lock);
1842} 1898}
1843 1899
1844static unsigned fuse_file_poll(struct file *file, poll_table *wait) 1900unsigned fuse_file_poll(struct file *file, poll_table *wait)
1845{ 1901{
1846 struct inode *inode = file->f_dentry->d_inode;
1847 struct fuse_file *ff = file->private_data; 1902 struct fuse_file *ff = file->private_data;
1848 struct fuse_conn *fc = get_fuse_conn(inode); 1903 struct fuse_conn *fc = ff->fc;
1849 struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; 1904 struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
1850 struct fuse_poll_out outarg; 1905 struct fuse_poll_out outarg;
1851 struct fuse_req *req; 1906 struct fuse_req *req;
@@ -1870,7 +1925,7 @@ static unsigned fuse_file_poll(struct file *file, poll_table *wait)
1870 return PTR_ERR(req); 1925 return PTR_ERR(req);
1871 1926
1872 req->in.h.opcode = FUSE_POLL; 1927 req->in.h.opcode = FUSE_POLL;
1873 req->in.h.nodeid = get_node_id(inode); 1928 req->in.h.nodeid = ff->nodeid;
1874 req->in.numargs = 1; 1929 req->in.numargs = 1;
1875 req->in.args[0].size = sizeof(inarg); 1930 req->in.args[0].size = sizeof(inarg);
1876 req->in.args[0].value = &inarg; 1931 req->in.args[0].value = &inarg;
@@ -1889,6 +1944,7 @@ static unsigned fuse_file_poll(struct file *file, poll_table *wait)
1889 } 1944 }
1890 return POLLERR; 1945 return POLLERR;
1891} 1946}
1947EXPORT_SYMBOL_GPL(fuse_file_poll);
1892 1948
1893/* 1949/*
1894 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and 1950 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 6fc5aedaa0d5..aaf2f9ff970e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -97,8 +97,13 @@ struct fuse_inode {
97 struct list_head writepages; 97 struct list_head writepages;
98}; 98};
99 99
100struct fuse_conn;
101
100/** FUSE specific file data */ 102/** FUSE specific file data */
101struct fuse_file { 103struct fuse_file {
104 /** Fuse connection for this file */
105 struct fuse_conn *fc;
106
102 /** Request reserved for flush and release */ 107 /** Request reserved for flush and release */
103 struct fuse_req *reserved_req; 108 struct fuse_req *reserved_req;
104 109
@@ -108,9 +113,15 @@ struct fuse_file {
108 /** File handle used by userspace */ 113 /** File handle used by userspace */
109 u64 fh; 114 u64 fh;
110 115
116 /** Node id of this file */
117 u64 nodeid;
118
111 /** Refcount */ 119 /** Refcount */
112 atomic_t count; 120 atomic_t count;
113 121
122 /** FOPEN_* flags returned by open */
123 u32 open_flags;
124
114 /** Entry on inode's write_files list */ 125 /** Entry on inode's write_files list */
115 struct list_head write_entry; 126 struct list_head write_entry;
116 127
@@ -185,8 +196,6 @@ enum fuse_req_state {
185 FUSE_REQ_FINISHED 196 FUSE_REQ_FINISHED
186}; 197};
187 198
188struct fuse_conn;
189
190/** 199/**
191 * A request to the client 200 * A request to the client
192 */ 201 */
@@ -248,11 +257,12 @@ struct fuse_req {
248 struct fuse_forget_in forget_in; 257 struct fuse_forget_in forget_in;
249 struct { 258 struct {
250 struct fuse_release_in in; 259 struct fuse_release_in in;
251 struct vfsmount *vfsmount; 260 struct path path;
252 struct dentry *dentry;
253 } release; 261 } release;
254 struct fuse_init_in init_in; 262 struct fuse_init_in init_in;
255 struct fuse_init_out init_out; 263 struct fuse_init_out init_out;
264 struct cuse_init_in cuse_init_in;
265 struct cuse_init_out cuse_init_out;
256 struct { 266 struct {
257 struct fuse_read_in in; 267 struct fuse_read_in in;
258 u64 attr_ver; 268 u64 attr_ver;
@@ -386,6 +396,9 @@ struct fuse_conn {
386 /** Filesystem supports NFS exporting. Only set in INIT */ 396 /** Filesystem supports NFS exporting. Only set in INIT */
387 unsigned export_support:1; 397 unsigned export_support:1;
388 398
399 /** Set if bdi is valid */
400 unsigned bdi_initialized:1;
401
389 /* 402 /*
390 * The following bitfields are only for optimization purposes 403 * The following bitfields are only for optimization purposes
391 * and hence races in setting them will not cause malfunction 404 * and hence races in setting them will not cause malfunction
@@ -515,25 +528,24 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
515 * Initialize READ or READDIR request 528 * Initialize READ or READDIR request
516 */ 529 */
517void fuse_read_fill(struct fuse_req *req, struct file *file, 530void fuse_read_fill(struct fuse_req *req, struct file *file,
518 struct inode *inode, loff_t pos, size_t count, int opcode); 531 loff_t pos, size_t count, int opcode);
519 532
520/** 533/**
521 * Send OPEN or OPENDIR request 534 * Send OPEN or OPENDIR request
522 */ 535 */
523int fuse_open_common(struct inode *inode, struct file *file, int isdir); 536int fuse_open_common(struct inode *inode, struct file *file, bool isdir);
524 537
525struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); 538struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
539struct fuse_file *fuse_file_get(struct fuse_file *ff);
526void fuse_file_free(struct fuse_file *ff); 540void fuse_file_free(struct fuse_file *ff);
527void fuse_finish_open(struct inode *inode, struct file *file, 541void fuse_finish_open(struct inode *inode, struct file *file);
528 struct fuse_file *ff, struct fuse_open_out *outarg);
529 542
530/** Fill in ff->reserved_req with a RELEASE request */ 543void fuse_sync_release(struct fuse_file *ff, int flags);
531void fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags, int opcode);
532 544
533/** 545/**
534 * Send RELEASE or RELEASEDIR request 546 * Send RELEASE or RELEASEDIR request
535 */ 547 */
536int fuse_release_common(struct inode *inode, struct file *file, int isdir); 548void fuse_release_common(struct file *file, int opcode);
537 549
538/** 550/**
539 * Send FSYNC or FSYNCDIR request 551 * Send FSYNC or FSYNCDIR request
@@ -652,10 +664,12 @@ void fuse_invalidate_entry_cache(struct dentry *entry);
652 */ 664 */
653struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); 665struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
654 666
667void fuse_conn_kill(struct fuse_conn *fc);
668
655/** 669/**
656 * Initialize fuse_conn 670 * Initialize fuse_conn
657 */ 671 */
658int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb); 672void fuse_conn_init(struct fuse_conn *fc);
659 673
660/** 674/**
661 * Release reference to fuse_conn 675 * Release reference to fuse_conn
@@ -694,4 +708,13 @@ void fuse_release_nowrite(struct inode *inode);
694 708
695u64 fuse_get_attr_version(struct fuse_conn *fc); 709u64 fuse_get_attr_version(struct fuse_conn *fc);
696 710
711int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
712 bool isdir);
713ssize_t fuse_direct_io(struct file *file, const char __user *buf,
714 size_t count, loff_t *ppos, int write);
715long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
716 unsigned int flags);
717unsigned fuse_file_poll(struct file *file, poll_table *wait);
718int fuse_dev_release(struct inode *inode, struct file *file);
719
697#endif /* _FS_FUSE_I_H */ 720#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 91f7c85f1ffd..f0df55a52929 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -277,11 +277,14 @@ static void fuse_send_destroy(struct fuse_conn *fc)
277 } 277 }
278} 278}
279 279
280static void fuse_put_super(struct super_block *sb) 280static void fuse_bdi_destroy(struct fuse_conn *fc)
281{ 281{
282 struct fuse_conn *fc = get_fuse_conn_super(sb); 282 if (fc->bdi_initialized)
283 bdi_destroy(&fc->bdi);
284}
283 285
284 fuse_send_destroy(fc); 286void fuse_conn_kill(struct fuse_conn *fc)
287{
285 spin_lock(&fc->lock); 288 spin_lock(&fc->lock);
286 fc->connected = 0; 289 fc->connected = 0;
287 fc->blocked = 0; 290 fc->blocked = 0;
@@ -295,7 +298,16 @@ static void fuse_put_super(struct super_block *sb)
295 list_del(&fc->entry); 298 list_del(&fc->entry);
296 fuse_ctl_remove_conn(fc); 299 fuse_ctl_remove_conn(fc);
297 mutex_unlock(&fuse_mutex); 300 mutex_unlock(&fuse_mutex);
298 bdi_destroy(&fc->bdi); 301 fuse_bdi_destroy(fc);
302}
303EXPORT_SYMBOL_GPL(fuse_conn_kill);
304
305static void fuse_put_super(struct super_block *sb)
306{
307 struct fuse_conn *fc = get_fuse_conn_super(sb);
308
309 fuse_send_destroy(fc);
310 fuse_conn_kill(fc);
299 fuse_conn_put(fc); 311 fuse_conn_put(fc);
300} 312}
301 313
@@ -466,10 +478,8 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
466 return 0; 478 return 0;
467} 479}
468 480
469int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb) 481void fuse_conn_init(struct fuse_conn *fc)
470{ 482{
471 int err;
472
473 memset(fc, 0, sizeof(*fc)); 483 memset(fc, 0, sizeof(*fc));
474 spin_lock_init(&fc->lock); 484 spin_lock_init(&fc->lock);
475 mutex_init(&fc->inst_mutex); 485 mutex_init(&fc->inst_mutex);
@@ -484,49 +494,12 @@ int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb)
484 INIT_LIST_HEAD(&fc->bg_queue); 494 INIT_LIST_HEAD(&fc->bg_queue);
485 INIT_LIST_HEAD(&fc->entry); 495 INIT_LIST_HEAD(&fc->entry);
486 atomic_set(&fc->num_waiting, 0); 496 atomic_set(&fc->num_waiting, 0);
487 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
488 fc->bdi.unplug_io_fn = default_unplug_io_fn;
489 /* fuse does it's own writeback accounting */
490 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
491 fc->khctr = 0; 497 fc->khctr = 0;
492 fc->polled_files = RB_ROOT; 498 fc->polled_files = RB_ROOT;
493 fc->dev = sb->s_dev;
494 err = bdi_init(&fc->bdi);
495 if (err)
496 goto error_mutex_destroy;
497 if (sb->s_bdev) {
498 err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
499 MAJOR(fc->dev), MINOR(fc->dev));
500 } else {
501 err = bdi_register_dev(&fc->bdi, fc->dev);
502 }
503 if (err)
504 goto error_bdi_destroy;
505 /*
506 * For a single fuse filesystem use max 1% of dirty +
507 * writeback threshold.
508 *
509 * This gives about 1M of write buffer for memory maps on a
510 * machine with 1G and 10% dirty_ratio, which should be more
511 * than enough.
512 *
513 * Privileged users can raise it by writing to
514 *
515 * /sys/class/bdi/<bdi>/max_ratio
516 */
517 bdi_set_max_ratio(&fc->bdi, 1);
518 fc->reqctr = 0; 499 fc->reqctr = 0;
519 fc->blocked = 1; 500 fc->blocked = 1;
520 fc->attr_version = 1; 501 fc->attr_version = 1;
521 get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); 502 get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
522
523 return 0;
524
525 error_bdi_destroy:
526 bdi_destroy(&fc->bdi);
527 error_mutex_destroy:
528 mutex_destroy(&fc->inst_mutex);
529 return err;
530} 503}
531EXPORT_SYMBOL_GPL(fuse_conn_init); 504EXPORT_SYMBOL_GPL(fuse_conn_init);
532 505
@@ -539,12 +512,14 @@ void fuse_conn_put(struct fuse_conn *fc)
539 fc->release(fc); 512 fc->release(fc);
540 } 513 }
541} 514}
515EXPORT_SYMBOL_GPL(fuse_conn_put);
542 516
543struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) 517struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
544{ 518{
545 atomic_inc(&fc->count); 519 atomic_inc(&fc->count);
546 return fc; 520 return fc;
547} 521}
522EXPORT_SYMBOL_GPL(fuse_conn_get);
548 523
549static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) 524static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
550{ 525{
@@ -797,6 +772,48 @@ static void fuse_free_conn(struct fuse_conn *fc)
797 kfree(fc); 772 kfree(fc);
798} 773}
799 774
775static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
776{
777 int err;
778
779 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
780 fc->bdi.unplug_io_fn = default_unplug_io_fn;
781 /* fuse does it's own writeback accounting */
782 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
783
784 err = bdi_init(&fc->bdi);
785 if (err)
786 return err;
787
788 fc->bdi_initialized = 1;
789
790 if (sb->s_bdev) {
791 err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
792 MAJOR(fc->dev), MINOR(fc->dev));
793 } else {
794 err = bdi_register_dev(&fc->bdi, fc->dev);
795 }
796
797 if (err)
798 return err;
799
800 /*
801 * For a single fuse filesystem use max 1% of dirty +
802 * writeback threshold.
803 *
804 * This gives about 1M of write buffer for memory maps on a
805 * machine with 1G and 10% dirty_ratio, which should be more
806 * than enough.
807 *
808 * Privileged users can raise it by writing to
809 *
810 * /sys/class/bdi/<bdi>/max_ratio
811 */
812 bdi_set_max_ratio(&fc->bdi, 1);
813
814 return 0;
815}
816
800static int fuse_fill_super(struct super_block *sb, void *data, int silent) 817static int fuse_fill_super(struct super_block *sb, void *data, int silent)
801{ 818{
802 struct fuse_conn *fc; 819 struct fuse_conn *fc;
@@ -843,11 +860,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
843 if (!fc) 860 if (!fc)
844 goto err_fput; 861 goto err_fput;
845 862
846 err = fuse_conn_init(fc, sb); 863 fuse_conn_init(fc);
847 if (err) { 864
848 kfree(fc); 865 fc->dev = sb->s_dev;
849 goto err_fput; 866 err = fuse_bdi_init(fc, sb);
850 } 867 if (err)
868 goto err_put_conn;
851 869
852 fc->release = fuse_free_conn; 870 fc->release = fuse_free_conn;
853 fc->flags = d.flags; 871 fc->flags = d.flags;
@@ -911,7 +929,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
911 err_put_root: 929 err_put_root:
912 dput(root_dentry); 930 dput(root_dentry);
913 err_put_conn: 931 err_put_conn:
914 bdi_destroy(&fc->bdi); 932 fuse_bdi_destroy(fc);
915 fuse_conn_put(fc); 933 fuse_conn_put(fc);
916 err_fput: 934 err_fput:
917 fput(file); 935 fput(file);
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 3a981b7f64ca..cad957cdb1e5 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -7,6 +7,7 @@ config GFS2_FS
7 select IP_SCTP if DLM_SCTP 7 select IP_SCTP if DLM_SCTP
8 select FS_POSIX_ACL 8 select FS_POSIX_ACL
9 select CRC32 9 select CRC32
10 select SLOW_WORK
10 help 11 help
11 A cluster filesystem. 12 A cluster filesystem.
12 13
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index a851ea4bdf70..3da2f1f4f738 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,8 +1,9 @@
1EXTRA_CFLAGS := -I$(src)
1obj-$(CONFIG_GFS2_FS) += gfs2.o 2obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ 3gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
3 glops.o inode.o log.o lops.o main.o meta_io.o \ 4 glops.o inode.o log.o lops.o main.o meta_io.o \
4 mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ 5 aops.o dentry.o export.o file.o \
5 ops_fstype.o ops_inode.o ops_super.o quota.o \ 6 ops_fstype.o ops_inode.o quota.o \
6 recovery.o rgrp.o super.o sys.o trans.o util.o 7 recovery.o rgrp.o super.o sys.o trans.o util.o
7 8
8gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o 9gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/aops.c
index a6dde1751e17..03ebb439ace0 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/aops.c
@@ -28,7 +28,6 @@
28#include "inode.h" 28#include "inode.h"
29#include "log.h" 29#include "log.h"
30#include "meta_io.h" 30#include "meta_io.h"
31#include "ops_address.h"
32#include "quota.h" 31#include "quota.h"
33#include "trans.h" 32#include "trans.h"
34#include "rgrp.h" 33#include "rgrp.h"
@@ -781,10 +780,12 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
781 unlock_page(page); 780 unlock_page(page);
782 page_cache_release(page); 781 page_cache_release(page);
783 782
784 if (inode->i_size < to) { 783 if (copied) {
785 i_size_write(inode, to); 784 if (inode->i_size < to) {
786 ip->i_disksize = inode->i_size; 785 i_size_write(inode, to);
787 di->di_size = cpu_to_be64(inode->i_size); 786 ip->i_disksize = inode->i_size;
787 }
788 gfs2_dinode_out(ip, di);
788 mark_inode_dirty(inode); 789 mark_inode_dirty(inode);
789 } 790 }
790 791
@@ -824,7 +825,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
824 struct gfs2_sbd *sdp = GFS2_SB(inode); 825 struct gfs2_sbd *sdp = GFS2_SB(inode);
825 struct buffer_head *dibh; 826 struct buffer_head *dibh;
826 struct gfs2_alloc *al = ip->i_alloc; 827 struct gfs2_alloc *al = ip->i_alloc;
827 struct gfs2_dinode *di;
828 unsigned int from = pos & (PAGE_CACHE_SIZE - 1); 828 unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
829 unsigned int to = from + len; 829 unsigned int to = from + len;
830 int ret; 830 int ret;
@@ -847,11 +847,10 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
847 gfs2_page_add_databufs(ip, page, from, to); 847 gfs2_page_add_databufs(ip, page, from, to);
848 848
849 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 849 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
850 850 if (ret > 0) {
851 if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) { 851 if (inode->i_size > ip->i_disksize)
852 di = (struct gfs2_dinode *)dibh->b_data; 852 ip->i_disksize = inode->i_size;
853 ip->i_disksize = inode->i_size; 853 gfs2_dinode_out(ip, dibh->b_data);
854 di->di_size = cpu_to_be64(inode->i_size);
855 mark_inode_dirty(inode); 854 mark_inode_dirty(inode);
856 } 855 }
857 856
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 3a5d3f883e10..6d47379e794b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -25,7 +25,7 @@
25#include "trans.h" 25#include "trans.h"
26#include "dir.h" 26#include "dir.h"
27#include "util.h" 27#include "util.h"
28#include "ops_address.h" 28#include "trace_gfs2.h"
29 29
30/* This doesn't need to be that large as max 64 bit pointers in a 4k 30/* This doesn't need to be that large as max 64 bit pointers in a 4k
31 * block is 512, so __u16 is fine for that. It saves stack space to 31 * block is 512, so __u16 is fine for that. It saves stack space to
@@ -136,7 +136,9 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
136 and write it out to disk */ 136 and write it out to disk */
137 137
138 unsigned int n = 1; 138 unsigned int n = 1;
139 block = gfs2_alloc_block(ip, &n); 139 error = gfs2_alloc_block(ip, &block, &n);
140 if (error)
141 goto out_brelse;
140 if (isdir) { 142 if (isdir) {
141 gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1); 143 gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
142 error = gfs2_dir_get_new_buffer(ip, block, &bh); 144 error = gfs2_dir_get_new_buffer(ip, block, &bh);
@@ -476,8 +478,11 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
476 blks = dblks + iblks; 478 blks = dblks + iblks;
477 i = sheight; 479 i = sheight;
478 do { 480 do {
481 int error;
479 n = blks - alloced; 482 n = blks - alloced;
480 bn = gfs2_alloc_block(ip, &n); 483 error = gfs2_alloc_block(ip, &bn, &n);
484 if (error)
485 return error;
481 alloced += n; 486 alloced += n;
482 if (state != ALLOC_DATA || gfs2_is_jdata(ip)) 487 if (state != ALLOC_DATA || gfs2_is_jdata(ip))
483 gfs2_trans_add_unrevoke(sdp, bn, n); 488 gfs2_trans_add_unrevoke(sdp, bn, n);
@@ -585,6 +590,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
585 clear_buffer_mapped(bh_map); 590 clear_buffer_mapped(bh_map);
586 clear_buffer_new(bh_map); 591 clear_buffer_new(bh_map);
587 clear_buffer_boundary(bh_map); 592 clear_buffer_boundary(bh_map);
593 trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
588 if (gfs2_is_dir(ip)) { 594 if (gfs2_is_dir(ip)) {
589 bsize = sdp->sd_jbsize; 595 bsize = sdp->sd_jbsize;
590 arr = sdp->sd_jheightsize; 596 arr = sdp->sd_jheightsize;
@@ -619,6 +625,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
619 ret = 0; 625 ret = 0;
620out: 626out:
621 release_metapath(&mp); 627 release_metapath(&mp);
628 trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
622 bmap_unlock(ip, create); 629 bmap_unlock(ip, create);
623 return ret; 630 return ret;
624 631
@@ -1008,7 +1015,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
1008 gfs2_trans_add_bh(ip->i_gl, bh, 0); 1015 gfs2_trans_add_bh(ip->i_gl, bh, 0);
1009 1016
1010 zero_user(page, offset, length); 1017 zero_user(page, offset, length);
1011 1018 mark_buffer_dirty(bh);
1012unlock: 1019unlock:
1013 unlock_page(page); 1020 unlock_page(page);
1014 page_cache_release(page); 1021 page_cache_release(page);
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/dentry.c
index 022c66cd5606..022c66cd5606 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/dentry.c
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index aef4d0c06748..297d7e5cebad 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -803,13 +803,20 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
803{ 803{
804 struct gfs2_inode *ip = GFS2_I(inode); 804 struct gfs2_inode *ip = GFS2_I(inode);
805 unsigned int n = 1; 805 unsigned int n = 1;
806 u64 bn = gfs2_alloc_block(ip, &n); 806 u64 bn;
807 struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn); 807 int error;
808 struct buffer_head *bh;
808 struct gfs2_leaf *leaf; 809 struct gfs2_leaf *leaf;
809 struct gfs2_dirent *dent; 810 struct gfs2_dirent *dent;
810 struct qstr name = { .name = "", .len = 0, .hash = 0 }; 811 struct qstr name = { .name = "", .len = 0, .hash = 0 };
812
813 error = gfs2_alloc_block(ip, &bn, &n);
814 if (error)
815 return NULL;
816 bh = gfs2_meta_new(ip->i_gl, bn);
811 if (!bh) 817 if (!bh)
812 return NULL; 818 return NULL;
819
813 gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1); 820 gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
814 gfs2_trans_add_bh(ip->i_gl, bh, 1); 821 gfs2_trans_add_bh(ip->i_gl, bh, 1);
815 gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); 822 gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 899763aed217..07ea9529adda 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -582,8 +582,11 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
582 struct gfs2_ea_header *ea; 582 struct gfs2_ea_header *ea;
583 unsigned int n = 1; 583 unsigned int n = 1;
584 u64 block; 584 u64 block;
585 int error;
585 586
586 block = gfs2_alloc_block(ip, &n); 587 error = gfs2_alloc_block(ip, &block, &n);
588 if (error)
589 return error;
587 gfs2_trans_add_unrevoke(sdp, block, 1); 590 gfs2_trans_add_unrevoke(sdp, block, 1);
588 *bhp = gfs2_meta_new(ip->i_gl, block); 591 *bhp = gfs2_meta_new(ip->i_gl, block);
589 gfs2_trans_add_bh(ip->i_gl, *bhp, 1); 592 gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
@@ -617,6 +620,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
617 struct gfs2_ea_request *er) 620 struct gfs2_ea_request *er)
618{ 621{
619 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 622 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
623 int error;
620 624
621 ea->ea_data_len = cpu_to_be32(er->er_data_len); 625 ea->ea_data_len = cpu_to_be32(er->er_data_len);
622 ea->ea_name_len = er->er_name_len; 626 ea->ea_name_len = er->er_name_len;
@@ -642,7 +646,9 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
642 int mh_size = sizeof(struct gfs2_meta_header); 646 int mh_size = sizeof(struct gfs2_meta_header);
643 unsigned int n = 1; 647 unsigned int n = 1;
644 648
645 block = gfs2_alloc_block(ip, &n); 649 error = gfs2_alloc_block(ip, &block, &n);
650 if (error)
651 return error;
646 gfs2_trans_add_unrevoke(sdp, block, 1); 652 gfs2_trans_add_unrevoke(sdp, block, 1);
647 bh = gfs2_meta_new(ip->i_gl, block); 653 bh = gfs2_meta_new(ip->i_gl, block);
648 gfs2_trans_add_bh(ip->i_gl, bh, 1); 654 gfs2_trans_add_bh(ip->i_gl, bh, 1);
@@ -963,7 +969,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
963 } else { 969 } else {
964 u64 blk; 970 u64 blk;
965 unsigned int n = 1; 971 unsigned int n = 1;
966 blk = gfs2_alloc_block(ip, &n); 972 error = gfs2_alloc_block(ip, &blk, &n);
973 if (error)
974 return error;
967 gfs2_trans_add_unrevoke(sdp, blk, 1); 975 gfs2_trans_add_unrevoke(sdp, blk, 1);
968 indbh = gfs2_meta_new(ip->i_gl, blk); 976 indbh = gfs2_meta_new(ip->i_gl, blk);
969 gfs2_trans_add_bh(ip->i_gl, indbh, 1); 977 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/export.c
index 9200ef221716..9200ef221716 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/export.c
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/file.c
index 5d82e91887e3..73318a3ce6f1 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/file.c
@@ -39,7 +39,6 @@
39#include "trans.h" 39#include "trans.h"
40#include "util.h" 40#include "util.h"
41#include "eaops.h" 41#include "eaops.h"
42#include "ops_address.h"
43 42
44/** 43/**
45 * gfs2_llseek - seek to a location in a file 44 * gfs2_llseek - seek to a location in a file
@@ -425,33 +424,36 @@ static struct vm_operations_struct gfs2_vm_ops = {
425 .page_mkwrite = gfs2_page_mkwrite, 424 .page_mkwrite = gfs2_page_mkwrite,
426}; 425};
427 426
428
429/** 427/**
430 * gfs2_mmap - 428 * gfs2_mmap -
431 * @file: The file to map 429 * @file: The file to map
432 * @vma: The VMA which described the mapping 430 * @vma: The VMA which described the mapping
433 * 431 *
434 * Returns: 0 or error code 432 * There is no need to get a lock here unless we should be updating
433 * atime. We ignore any locking errors since the only consequence is
434 * a missed atime update (which will just be deferred until later).
435 *
436 * Returns: 0
435 */ 437 */
436 438
437static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) 439static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
438{ 440{
439 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 441 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
440 struct gfs2_holder i_gh;
441 int error;
442 442
443 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); 443 if (!(file->f_flags & O_NOATIME)) {
444 error = gfs2_glock_nq(&i_gh); 444 struct gfs2_holder i_gh;
445 if (error) { 445 int error;
446 gfs2_holder_uninit(&i_gh);
447 return error;
448 }
449 446
447 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
448 error = gfs2_glock_nq(&i_gh);
449 file_accessed(file);
450 if (error == 0)
451 gfs2_glock_dq_uninit(&i_gh);
452 }
450 vma->vm_ops = &gfs2_vm_ops; 453 vma->vm_ops = &gfs2_vm_ops;
454 vma->vm_flags |= VM_CAN_NONLINEAR;
451 455
452 gfs2_glock_dq_uninit(&i_gh); 456 return 0;
453
454 return error;
455} 457}
456 458
457/** 459/**
@@ -692,12 +694,10 @@ static void do_unflock(struct file *file, struct file_lock *fl)
692 694
693static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) 695static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
694{ 696{
695 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
696
697 if (!(fl->fl_flags & FL_FLOCK)) 697 if (!(fl->fl_flags & FL_FLOCK))
698 return -ENOLCK; 698 return -ENOLCK;
699 if (__mandatory_lock(&ip->i_inode)) 699 if (fl->fl_type & LOCK_MAND)
700 return -ENOLCK; 700 return -EOPNOTSUPP;
701 701
702 if (fl->fl_type == F_UNLCK) { 702 if (fl->fl_type == F_UNLCK) {
703 do_unflock(file, fl); 703 do_unflock(file, fl);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ff4981090489..297421c0427a 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -39,6 +39,8 @@
39#include "super.h" 39#include "super.h"
40#include "util.h" 40#include "util.h"
41#include "bmap.h" 41#include "bmap.h"
42#define CREATE_TRACE_POINTS
43#include "trace_gfs2.h"
42 44
43struct gfs2_gl_hash_bucket { 45struct gfs2_gl_hash_bucket {
44 struct hlist_head hb_list; 46 struct hlist_head hb_list;
@@ -155,7 +157,7 @@ static void glock_free(struct gfs2_glock *gl)
155 157
156 if (aspace) 158 if (aspace)
157 gfs2_aspace_put(aspace); 159 gfs2_aspace_put(aspace);
158 160 trace_gfs2_glock_put(gl);
159 sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl); 161 sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl);
160} 162}
161 163
@@ -317,14 +319,17 @@ restart:
317 return 2; 319 return 2;
318 gh->gh_error = ret; 320 gh->gh_error = ret;
319 list_del_init(&gh->gh_list); 321 list_del_init(&gh->gh_list);
322 trace_gfs2_glock_queue(gh, 0);
320 gfs2_holder_wake(gh); 323 gfs2_holder_wake(gh);
321 goto restart; 324 goto restart;
322 } 325 }
323 set_bit(HIF_HOLDER, &gh->gh_iflags); 326 set_bit(HIF_HOLDER, &gh->gh_iflags);
327 trace_gfs2_promote(gh, 1);
324 gfs2_holder_wake(gh); 328 gfs2_holder_wake(gh);
325 goto restart; 329 goto restart;
326 } 330 }
327 set_bit(HIF_HOLDER, &gh->gh_iflags); 331 set_bit(HIF_HOLDER, &gh->gh_iflags);
332 trace_gfs2_promote(gh, 0);
328 gfs2_holder_wake(gh); 333 gfs2_holder_wake(gh);
329 continue; 334 continue;
330 } 335 }
@@ -354,6 +359,7 @@ static inline void do_error(struct gfs2_glock *gl, const int ret)
354 else 359 else
355 continue; 360 continue;
356 list_del_init(&gh->gh_list); 361 list_del_init(&gh->gh_list);
362 trace_gfs2_glock_queue(gh, 0);
357 gfs2_holder_wake(gh); 363 gfs2_holder_wake(gh);
358 } 364 }
359} 365}
@@ -422,6 +428,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
422 int rv; 428 int rv;
423 429
424 spin_lock(&gl->gl_spin); 430 spin_lock(&gl->gl_spin);
431 trace_gfs2_glock_state_change(gl, state);
425 state_change(gl, state); 432 state_change(gl, state);
426 gh = find_first_waiter(gl); 433 gh = find_first_waiter(gl);
427 434
@@ -796,22 +803,37 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
796 gh->gh_ip = 0; 803 gh->gh_ip = 0;
797} 804}
798 805
799static int just_schedule(void *word) 806/**
807 * gfs2_glock_holder_wait
808 * @word: unused
809 *
810 * This function and gfs2_glock_demote_wait both show up in the WCHAN
811 * field. Thus I've separated these otherwise identical functions in
812 * order to be more informative to the user.
813 */
814
815static int gfs2_glock_holder_wait(void *word)
800{ 816{
801 schedule(); 817 schedule();
802 return 0; 818 return 0;
803} 819}
804 820
821static int gfs2_glock_demote_wait(void *word)
822{
823 schedule();
824 return 0;
825}
826
805static void wait_on_holder(struct gfs2_holder *gh) 827static void wait_on_holder(struct gfs2_holder *gh)
806{ 828{
807 might_sleep(); 829 might_sleep();
808 wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE); 830 wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE);
809} 831}
810 832
811static void wait_on_demote(struct gfs2_glock *gl) 833static void wait_on_demote(struct gfs2_glock *gl)
812{ 834{
813 might_sleep(); 835 might_sleep();
814 wait_on_bit(&gl->gl_flags, GLF_DEMOTE, just_schedule, TASK_UNINTERRUPTIBLE); 836 wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE);
815} 837}
816 838
817/** 839/**
@@ -836,6 +858,7 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
836 gl->gl_demote_state != state) { 858 gl->gl_demote_state != state) {
837 gl->gl_demote_state = LM_ST_UNLOCKED; 859 gl->gl_demote_state = LM_ST_UNLOCKED;
838 } 860 }
861 trace_gfs2_demote_rq(gl);
839} 862}
840 863
841/** 864/**
@@ -921,6 +944,7 @@ fail:
921 goto do_cancel; 944 goto do_cancel;
922 return; 945 return;
923 } 946 }
947 trace_gfs2_glock_queue(gh, 1);
924 list_add_tail(&gh->gh_list, insert_pt); 948 list_add_tail(&gh->gh_list, insert_pt);
925do_cancel: 949do_cancel:
926 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); 950 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
@@ -1017,6 +1041,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1017 !test_bit(GLF_DEMOTE, &gl->gl_flags)) 1041 !test_bit(GLF_DEMOTE, &gl->gl_flags))
1018 fast_path = 1; 1042 fast_path = 1;
1019 } 1043 }
1044 trace_gfs2_glock_queue(gh, 0);
1020 spin_unlock(&gl->gl_spin); 1045 spin_unlock(&gl->gl_spin);
1021 if (likely(fast_path)) 1046 if (likely(fast_path))
1022 return; 1047 return;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 70f87f43afa2..d5e4ab155ca0 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -310,24 +310,6 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
310} 310}
311 311
312/** 312/**
313 * rgrp_go_dump - print out an rgrp
314 * @seq: The iterator
315 * @gl: The glock in question
316 *
317 */
318
319static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
320{
321 const struct gfs2_rgrpd *rgd = gl->gl_object;
322 if (rgd == NULL)
323 return 0;
324 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
325 (unsigned long long)rgd->rd_addr, rgd->rd_flags,
326 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
327 return 0;
328}
329
330/**
331 * trans_go_sync - promote/demote the transaction glock 313 * trans_go_sync - promote/demote the transaction glock
332 * @gl: the glock 314 * @gl: the glock
333 * @state: the requested state 315 * @state: the requested state
@@ -410,7 +392,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
410 .go_demote_ok = rgrp_go_demote_ok, 392 .go_demote_ok = rgrp_go_demote_ok,
411 .go_lock = rgrp_go_lock, 393 .go_lock = rgrp_go_lock,
412 .go_unlock = rgrp_go_unlock, 394 .go_unlock = rgrp_go_unlock,
413 .go_dump = rgrp_go_dump, 395 .go_dump = gfs2_rgrp_dump,
414 .go_type = LM_TYPE_RGRP, 396 .go_type = LM_TYPE_RGRP,
415 .go_min_hold_time = HZ / 5, 397 .go_min_hold_time = HZ / 5,
416}; 398};
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 399d1b978049..225347fbff3c 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -12,6 +12,7 @@
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/workqueue.h> 14#include <linux/workqueue.h>
15#include <linux/slow-work.h>
15#include <linux/dlm.h> 16#include <linux/dlm.h>
16#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
17 18
@@ -63,9 +64,12 @@ struct gfs2_log_element {
63 const struct gfs2_log_operations *le_ops; 64 const struct gfs2_log_operations *le_ops;
64}; 65};
65 66
67#define GBF_FULL 1
68
66struct gfs2_bitmap { 69struct gfs2_bitmap {
67 struct buffer_head *bi_bh; 70 struct buffer_head *bi_bh;
68 char *bi_clone; 71 char *bi_clone;
72 unsigned long bi_flags;
69 u32 bi_offset; 73 u32 bi_offset;
70 u32 bi_start; 74 u32 bi_start;
71 u32 bi_len; 75 u32 bi_len;
@@ -90,10 +94,11 @@ struct gfs2_rgrpd {
90 struct gfs2_sbd *rd_sbd; 94 struct gfs2_sbd *rd_sbd;
91 unsigned int rd_bh_count; 95 unsigned int rd_bh_count;
92 u32 rd_last_alloc; 96 u32 rd_last_alloc;
93 unsigned char rd_flags; 97 u32 rd_flags;
94#define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */ 98#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */
95#define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */ 99#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */
96#define GFS2_RDF_UPTODATE 0x04 /* rg is up to date */ 100#define GFS2_RDF_ERROR 0x40000000 /* error in rg */
101#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */
97}; 102};
98 103
99enum gfs2_state_bits { 104enum gfs2_state_bits {
@@ -376,11 +381,11 @@ struct gfs2_journal_extent {
376struct gfs2_jdesc { 381struct gfs2_jdesc {
377 struct list_head jd_list; 382 struct list_head jd_list;
378 struct list_head extent_list; 383 struct list_head extent_list;
379 384 struct slow_work jd_work;
380 struct inode *jd_inode; 385 struct inode *jd_inode;
386 unsigned long jd_flags;
387#define JDF_RECOVERY 1
381 unsigned int jd_jid; 388 unsigned int jd_jid;
382 int jd_dirty;
383
384 unsigned int jd_blocks; 389 unsigned int jd_blocks;
385}; 390};
386 391
@@ -390,9 +395,6 @@ struct gfs2_statfs_change_host {
390 s64 sc_dinodes; 395 s64 sc_dinodes;
391}; 396};
392 397
393#define GFS2_GLOCKD_DEFAULT 1
394#define GFS2_GLOCKD_MAX 16
395
396#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF 398#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF
397#define GFS2_QUOTA_OFF 0 399#define GFS2_QUOTA_OFF 0
398#define GFS2_QUOTA_ACCOUNT 1 400#define GFS2_QUOTA_ACCOUNT 1
@@ -418,6 +420,7 @@ struct gfs2_args {
418 unsigned int ar_data:2; /* ordered/writeback */ 420 unsigned int ar_data:2; /* ordered/writeback */
419 unsigned int ar_meta:1; /* mount metafs */ 421 unsigned int ar_meta:1; /* mount metafs */
420 unsigned int ar_discard:1; /* discard requests */ 422 unsigned int ar_discard:1; /* discard requests */
423 int ar_commit; /* Commit interval */
421}; 424};
422 425
423struct gfs2_tune { 426struct gfs2_tune {
@@ -426,7 +429,6 @@ struct gfs2_tune {
426 unsigned int gt_incore_log_blocks; 429 unsigned int gt_incore_log_blocks;
427 unsigned int gt_log_flush_secs; 430 unsigned int gt_log_flush_secs;
428 431
429 unsigned int gt_recoverd_secs;
430 unsigned int gt_logd_secs; 432 unsigned int gt_logd_secs;
431 433
432 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ 434 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
@@ -447,6 +449,7 @@ enum {
447 SDF_JOURNAL_LIVE = 1, 449 SDF_JOURNAL_LIVE = 1,
448 SDF_SHUTDOWN = 2, 450 SDF_SHUTDOWN = 2,
449 SDF_NOBARRIERS = 3, 451 SDF_NOBARRIERS = 3,
452 SDF_NORECOVERY = 4,
450}; 453};
451 454
452#define GFS2_FSNAME_LEN 256 455#define GFS2_FSNAME_LEN 256
@@ -493,7 +496,6 @@ struct lm_lockstruct {
493 unsigned long ls_flags; 496 unsigned long ls_flags;
494 dlm_lockspace_t *ls_dlm; 497 dlm_lockspace_t *ls_dlm;
495 498
496 int ls_recover_jid;
497 int ls_recover_jid_done; 499 int ls_recover_jid_done;
498 int ls_recover_jid_status; 500 int ls_recover_jid_status;
499}; 501};
@@ -582,7 +584,6 @@ struct gfs2_sbd {
582 584
583 /* Daemon stuff */ 585 /* Daemon stuff */
584 586
585 struct task_struct *sd_recoverd_process;
586 struct task_struct *sd_logd_process; 587 struct task_struct *sd_logd_process;
587 struct task_struct *sd_quotad_process; 588 struct task_struct *sd_quotad_process;
588 589
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 5a31d426116f..2f94bd723698 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -30,7 +30,6 @@
30#include "inode.h" 30#include "inode.h"
31#include "log.h" 31#include "log.h"
32#include "meta_io.h" 32#include "meta_io.h"
33#include "ops_address.h"
34#include "quota.h" 33#include "quota.h"
35#include "rgrp.h" 34#include "rgrp.h"
36#include "trans.h" 35#include "trans.h"
@@ -1047,154 +1046,7 @@ fail:
1047 return ERR_PTR(error); 1046 return ERR_PTR(error);
1048} 1047}
1049 1048
1050/** 1049static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1051 * gfs2_rmdiri - Remove a directory
1052 * @dip: The parent directory of the directory to be removed
1053 * @name: The name of the directory to be removed
1054 * @ip: The GFS2 inode of the directory to be removed
1055 *
1056 * Assumes Glocks on dip and ip are held
1057 *
1058 * Returns: errno
1059 */
1060
1061int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
1062 struct gfs2_inode *ip)
1063{
1064 struct qstr dotname;
1065 int error;
1066
1067 if (ip->i_entries != 2) {
1068 if (gfs2_consist_inode(ip))
1069 gfs2_dinode_print(ip);
1070 return -EIO;
1071 }
1072
1073 error = gfs2_dir_del(dip, name);
1074 if (error)
1075 return error;
1076
1077 error = gfs2_change_nlink(dip, -1);
1078 if (error)
1079 return error;
1080
1081 gfs2_str2qstr(&dotname, ".");
1082 error = gfs2_dir_del(ip, &dotname);
1083 if (error)
1084 return error;
1085
1086 gfs2_str2qstr(&dotname, "..");
1087 error = gfs2_dir_del(ip, &dotname);
1088 if (error)
1089 return error;
1090
1091 /* It looks odd, but it really should be done twice */
1092 error = gfs2_change_nlink(ip, -1);
1093 if (error)
1094 return error;
1095
1096 error = gfs2_change_nlink(ip, -1);
1097 if (error)
1098 return error;
1099
1100 return error;
1101}
1102
1103/*
1104 * gfs2_unlink_ok - check to see that a inode is still in a directory
1105 * @dip: the directory
1106 * @name: the name of the file
1107 * @ip: the inode
1108 *
1109 * Assumes that the lock on (at least) @dip is held.
1110 *
1111 * Returns: 0 if the parent/child relationship is correct, errno if it isn't
1112 */
1113
1114int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
1115 const struct gfs2_inode *ip)
1116{
1117 int error;
1118
1119 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
1120 return -EPERM;
1121
1122 if ((dip->i_inode.i_mode & S_ISVTX) &&
1123 dip->i_inode.i_uid != current_fsuid() &&
1124 ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
1125 return -EPERM;
1126
1127 if (IS_APPEND(&dip->i_inode))
1128 return -EPERM;
1129
1130 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
1131 if (error)
1132 return error;
1133
1134 error = gfs2_dir_check(&dip->i_inode, name, ip);
1135 if (error)
1136 return error;
1137
1138 return 0;
1139}
1140
1141/**
1142 * gfs2_readlinki - return the contents of a symlink
1143 * @ip: the symlink's inode
1144 * @buf: a pointer to the buffer to be filled
1145 * @len: a pointer to the length of @buf
1146 *
1147 * If @buf is too small, a piece of memory is kmalloc()ed and needs
1148 * to be freed by the caller.
1149 *
1150 * Returns: errno
1151 */
1152
1153int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1154{
1155 struct gfs2_holder i_gh;
1156 struct buffer_head *dibh;
1157 unsigned int x;
1158 int error;
1159
1160 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
1161 error = gfs2_glock_nq(&i_gh);
1162 if (error) {
1163 gfs2_holder_uninit(&i_gh);
1164 return error;
1165 }
1166
1167 if (!ip->i_disksize) {
1168 gfs2_consist_inode(ip);
1169 error = -EIO;
1170 goto out;
1171 }
1172
1173 error = gfs2_meta_inode_buffer(ip, &dibh);
1174 if (error)
1175 goto out;
1176
1177 x = ip->i_disksize + 1;
1178 if (x > *len) {
1179 *buf = kmalloc(x, GFP_NOFS);
1180 if (!*buf) {
1181 error = -ENOMEM;
1182 goto out_brelse;
1183 }
1184 }
1185
1186 memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
1187 *len = x;
1188
1189out_brelse:
1190 brelse(dibh);
1191out:
1192 gfs2_glock_dq_uninit(&i_gh);
1193 return error;
1194}
1195
1196static int
1197__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1198{ 1050{
1199 struct buffer_head *dibh; 1051 struct buffer_head *dibh;
1200 int error; 1052 int error;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c30be2b66580..c341aaf67adb 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -11,8 +11,16 @@
11#define __INODE_DOT_H__ 11#define __INODE_DOT_H__
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
14#include "util.h" 16#include "util.h"
15 17
18extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
19extern int gfs2_internal_read(struct gfs2_inode *ip,
20 struct file_ra_state *ra_state,
21 char *buf, loff_t *pos, unsigned size);
22extern void gfs2_set_aops(struct inode *inode);
23
16static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) 24static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
17{ 25{
18 return !ip->i_height; 26 return !ip->i_height;
@@ -73,30 +81,26 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
73} 81}
74 82
75 83
76void gfs2_set_iop(struct inode *inode); 84extern void gfs2_set_iop(struct inode *inode);
77struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
78 u64 no_addr, u64 no_formal_ino, 86 u64 no_addr, u64 no_formal_ino,
79 int skip_freeing); 87 int skip_freeing);
80struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); 88extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
81 89
82int gfs2_inode_refresh(struct gfs2_inode *ip); 90extern int gfs2_inode_refresh(struct gfs2_inode *ip);
83 91
84int gfs2_dinode_dealloc(struct gfs2_inode *inode); 92extern int gfs2_dinode_dealloc(struct gfs2_inode *inode);
85int gfs2_change_nlink(struct gfs2_inode *ip, int diff); 93extern int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
86struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, 94extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
87 int is_root); 95 int is_root);
88struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, 96extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
89 unsigned int mode, dev_t dev); 97 const struct qstr *name,
90int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, 98 unsigned int mode, dev_t dev);
91 struct gfs2_inode *ip); 99extern int gfs2_permission(struct inode *inode, int mask);
92int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, 100extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
93 const struct gfs2_inode *ip); 101extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
94int gfs2_permission(struct inode *inode, int mask); 102extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
95int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); 103extern void gfs2_dinode_print(const struct gfs2_inode *ip);
96int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
97struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
98void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
99void gfs2_dinode_print(const struct gfs2_inode *ip);
100 104
101extern const struct inode_operations gfs2_file_iops; 105extern const struct inode_operations gfs2_file_iops;
102extern const struct inode_operations gfs2_dir_iops; 106extern const struct inode_operations gfs2_dir_iops;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 98918a756410..13c6237c5f67 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -28,6 +28,7 @@
28#include "meta_io.h" 28#include "meta_io.h"
29#include "util.h" 29#include "util.h"
30#include "dir.h" 30#include "dir.h"
31#include "trace_gfs2.h"
31 32
32#define PULL 1 33#define PULL 1
33 34
@@ -120,7 +121,7 @@ __acquires(&sdp->sd_log_lock)
120 lock_buffer(bh); 121 lock_buffer(bh);
121 if (test_clear_buffer_dirty(bh)) { 122 if (test_clear_buffer_dirty(bh)) {
122 bh->b_end_io = end_buffer_write_sync; 123 bh->b_end_io = end_buffer_write_sync;
123 submit_bh(WRITE, bh); 124 submit_bh(WRITE_SYNC_PLUG, bh);
124 } else { 125 } else {
125 unlock_buffer(bh); 126 unlock_buffer(bh);
126 brelse(bh); 127 brelse(bh);
@@ -313,6 +314,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
313 gfs2_log_lock(sdp); 314 gfs2_log_lock(sdp);
314 } 315 }
315 atomic_sub(blks, &sdp->sd_log_blks_free); 316 atomic_sub(blks, &sdp->sd_log_blks_free);
317 trace_gfs2_log_blocks(sdp, -blks);
316 gfs2_log_unlock(sdp); 318 gfs2_log_unlock(sdp);
317 mutex_unlock(&sdp->sd_log_reserve_mutex); 319 mutex_unlock(&sdp->sd_log_reserve_mutex);
318 320
@@ -333,6 +335,7 @@ void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
333 335
334 gfs2_log_lock(sdp); 336 gfs2_log_lock(sdp);
335 atomic_add(blks, &sdp->sd_log_blks_free); 337 atomic_add(blks, &sdp->sd_log_blks_free);
338 trace_gfs2_log_blocks(sdp, blks);
336 gfs2_assert_withdraw(sdp, 339 gfs2_assert_withdraw(sdp,
337 atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); 340 atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
338 gfs2_log_unlock(sdp); 341 gfs2_log_unlock(sdp);
@@ -558,6 +561,7 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
558 561
559 gfs2_log_lock(sdp); 562 gfs2_log_lock(sdp);
560 atomic_add(dist, &sdp->sd_log_blks_free); 563 atomic_add(dist, &sdp->sd_log_blks_free);
564 trace_gfs2_log_blocks(sdp, dist);
561 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); 565 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
562 gfs2_log_unlock(sdp); 566 gfs2_log_unlock(sdp);
563 567
@@ -604,7 +608,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
604 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) 608 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
605 goto skip_barrier; 609 goto skip_barrier;
606 get_bh(bh); 610 get_bh(bh);
607 submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh); 611 submit_bh(WRITE_SYNC | (1 << BIO_RW_BARRIER) | (1 << BIO_RW_META), bh);
608 wait_on_buffer(bh); 612 wait_on_buffer(bh);
609 if (buffer_eopnotsupp(bh)) { 613 if (buffer_eopnotsupp(bh)) {
610 clear_buffer_eopnotsupp(bh); 614 clear_buffer_eopnotsupp(bh);
@@ -664,7 +668,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
664 lock_buffer(bh); 668 lock_buffer(bh);
665 if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { 669 if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
666 bh->b_end_io = end_buffer_write_sync; 670 bh->b_end_io = end_buffer_write_sync;
667 submit_bh(WRITE, bh); 671 submit_bh(WRITE_SYNC_PLUG, bh);
668 } else { 672 } else {
669 unlock_buffer(bh); 673 unlock_buffer(bh);
670 brelse(bh); 674 brelse(bh);
@@ -715,6 +719,7 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
715 up_write(&sdp->sd_log_flush_lock); 719 up_write(&sdp->sd_log_flush_lock);
716 return; 720 return;
717 } 721 }
722 trace_gfs2_log_flush(sdp, 1);
718 723
719 ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL); 724 ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
720 INIT_LIST_HEAD(&ai->ai_ail1_list); 725 INIT_LIST_HEAD(&ai->ai_ail1_list);
@@ -746,6 +751,7 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
746 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ 751 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
747 gfs2_log_lock(sdp); 752 gfs2_log_lock(sdp);
748 atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ 753 atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
754 trace_gfs2_log_blocks(sdp, -1);
749 gfs2_log_unlock(sdp); 755 gfs2_log_unlock(sdp);
750 log_write_header(sdp, 0, PULL); 756 log_write_header(sdp, 0, PULL);
751 } 757 }
@@ -763,8 +769,7 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
763 ai = NULL; 769 ai = NULL;
764 } 770 }
765 gfs2_log_unlock(sdp); 771 gfs2_log_unlock(sdp);
766 772 trace_gfs2_log_flush(sdp, 0);
767 sdp->sd_vfs->s_dirt = 0;
768 up_write(&sdp->sd_log_flush_lock); 773 up_write(&sdp->sd_log_flush_lock);
769 774
770 kfree(ai); 775 kfree(ai);
@@ -788,6 +793,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
788 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); 793 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved);
789 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; 794 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
790 atomic_add(unused, &sdp->sd_log_blks_free); 795 atomic_add(unused, &sdp->sd_log_blks_free);
796 trace_gfs2_log_blocks(sdp, unused);
791 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= 797 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
792 sdp->sd_jdesc->jd_blocks); 798 sdp->sd_jdesc->jd_blocks);
793 sdp->sd_log_blks_reserved = reserved; 799 sdp->sd_log_blks_reserved = reserved;
@@ -823,7 +829,6 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
823 log_refund(sdp, tr); 829 log_refund(sdp, tr);
824 buf_lo_incore_commit(sdp, tr); 830 buf_lo_incore_commit(sdp, tr);
825 831
826 sdp->sd_vfs->s_dirt = 1;
827 up_read(&sdp->sd_log_flush_lock); 832 up_read(&sdp->sd_log_flush_lock);
828 833
829 gfs2_log_lock(sdp); 834 gfs2_log_lock(sdp);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 80e4f5f898bb..9969ff062c5b 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -13,6 +13,8 @@
13#include <linux/completion.h> 13#include <linux/completion.h>
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
16#include <linux/bio.h>
17#include <linux/fs.h>
16 18
17#include "gfs2.h" 19#include "gfs2.h"
18#include "incore.h" 20#include "incore.h"
@@ -25,6 +27,7 @@
25#include "rgrp.h" 27#include "rgrp.h"
26#include "trans.h" 28#include "trans.h"
27#include "util.h" 29#include "util.h"
30#include "trace_gfs2.h"
28 31
29/** 32/**
30 * gfs2_pin - Pin a buffer in memory 33 * gfs2_pin - Pin a buffer in memory
@@ -51,6 +54,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
51 if (bd->bd_ail) 54 if (bd->bd_ail)
52 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); 55 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
53 get_bh(bh); 56 get_bh(bh);
57 trace_gfs2_pin(bd, 1);
54} 58}
55 59
56/** 60/**
@@ -87,6 +91,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
87 bd->bd_ail = ai; 91 bd->bd_ail = ai;
88 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); 92 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
89 clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); 93 clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
94 trace_gfs2_pin(bd, 0);
90 gfs2_log_unlock(sdp); 95 gfs2_log_unlock(sdp);
91 unlock_buffer(bh); 96 unlock_buffer(bh);
92} 97}
@@ -189,7 +194,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
189 } 194 }
190 195
191 gfs2_log_unlock(sdp); 196 gfs2_log_unlock(sdp);
192 submit_bh(WRITE, bh); 197 submit_bh(WRITE_SYNC_PLUG, bh);
193 gfs2_log_lock(sdp); 198 gfs2_log_lock(sdp);
194 199
195 n = 0; 200 n = 0;
@@ -199,7 +204,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
199 gfs2_log_unlock(sdp); 204 gfs2_log_unlock(sdp);
200 lock_buffer(bd2->bd_bh); 205 lock_buffer(bd2->bd_bh);
201 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); 206 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
202 submit_bh(WRITE, bh); 207 submit_bh(WRITE_SYNC_PLUG, bh);
203 gfs2_log_lock(sdp); 208 gfs2_log_lock(sdp);
204 if (++n >= num) 209 if (++n >= num)
205 break; 210 break;
@@ -341,7 +346,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
341 sdp->sd_log_num_revoke--; 346 sdp->sd_log_num_revoke--;
342 347
343 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { 348 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
344 submit_bh(WRITE, bh); 349 submit_bh(WRITE_SYNC_PLUG, bh);
345 350
346 bh = gfs2_log_get_buf(sdp); 351 bh = gfs2_log_get_buf(sdp);
347 mh = (struct gfs2_meta_header *)bh->b_data; 352 mh = (struct gfs2_meta_header *)bh->b_data;
@@ -358,7 +363,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
358 } 363 }
359 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); 364 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
360 365
361 submit_bh(WRITE, bh); 366 submit_bh(WRITE_SYNC_PLUG, bh);
362} 367}
363 368
364static void revoke_lo_before_scan(struct gfs2_jdesc *jd, 369static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
@@ -560,7 +565,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
560 ptr = bh_log_ptr(bh); 565 ptr = bh_log_ptr(bh);
561 566
562 get_bh(bh); 567 get_bh(bh);
563 submit_bh(WRITE, bh); 568 submit_bh(WRITE_SYNC_PLUG, bh);
564 gfs2_log_lock(sdp); 569 gfs2_log_lock(sdp);
565 while(!list_empty(list)) { 570 while(!list_empty(list)) {
566 bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list); 571 bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list);
@@ -586,7 +591,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
586 } else { 591 } else {
587 bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh); 592 bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh);
588 } 593 }
589 submit_bh(WRITE, bh1); 594 submit_bh(WRITE_SYNC_PLUG, bh1);
590 gfs2_log_lock(sdp); 595 gfs2_log_lock(sdp);
591 ptr += 2; 596 ptr += 2;
592 } 597 }
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index a6892ed0840a..eacd78a5d082 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -15,6 +15,7 @@
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <asm/atomic.h> 17#include <asm/atomic.h>
18#include <linux/slow-work.h>
18 19
19#include "gfs2.h" 20#include "gfs2.h"
20#include "incore.h" 21#include "incore.h"
@@ -113,12 +114,18 @@ static int __init init_gfs2_fs(void)
113 if (error) 114 if (error)
114 goto fail_unregister; 115 goto fail_unregister;
115 116
117 error = slow_work_register_user();
118 if (error)
119 goto fail_slow;
120
116 gfs2_register_debugfs(); 121 gfs2_register_debugfs();
117 122
118 printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__); 123 printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
119 124
120 return 0; 125 return 0;
121 126
127fail_slow:
128 unregister_filesystem(&gfs2meta_fs_type);
122fail_unregister: 129fail_unregister:
123 unregister_filesystem(&gfs2_fs_type); 130 unregister_filesystem(&gfs2_fs_type);
124fail: 131fail:
@@ -156,6 +163,7 @@ static void __exit exit_gfs2_fs(void)
156 gfs2_unregister_debugfs(); 163 gfs2_unregister_debugfs();
157 unregister_filesystem(&gfs2_fs_type); 164 unregister_filesystem(&gfs2_fs_type);
158 unregister_filesystem(&gfs2meta_fs_type); 165 unregister_filesystem(&gfs2meta_fs_type);
166 slow_work_unregister_user();
159 167
160 kmem_cache_destroy(gfs2_quotad_cachep); 168 kmem_cache_destroy(gfs2_quotad_cachep);
161 kmem_cache_destroy(gfs2_rgrpd_cachep); 169 kmem_cache_destroy(gfs2_rgrpd_cachep);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 8d6f13256b26..cb8d7a93d5ec 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -31,19 +31,66 @@
31#include "rgrp.h" 31#include "rgrp.h"
32#include "trans.h" 32#include "trans.h"
33#include "util.h" 33#include "util.h"
34#include "ops_address.h"
35 34
36static int aspace_get_block(struct inode *inode, sector_t lblock, 35static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
37 struct buffer_head *bh_result, int create)
38{ 36{
39 gfs2_assert_warn(inode->i_sb->s_fs_info, 0); 37 int err;
40 return -EOPNOTSUPP; 38 struct buffer_head *bh, *head;
41} 39 int nr_underway = 0;
40 int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ?
41 WRITE_SYNC_PLUG : WRITE));
42
43 BUG_ON(!PageLocked(page));
44 BUG_ON(!page_has_buffers(page));
45
46 head = page_buffers(page);
47 bh = head;
48
49 do {
50 if (!buffer_mapped(bh))
51 continue;
52 /*
53 * If it's a fully non-blocking write attempt and we cannot
54 * lock the buffer then redirty the page. Note that this can
55 * potentially cause a busy-wait loop from pdflush and kswapd
56 * activity, but those code paths have their own higher-level
57 * throttling.
58 */
59 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
60 lock_buffer(bh);
61 } else if (!trylock_buffer(bh)) {
62 redirty_page_for_writepage(wbc, page);
63 continue;
64 }
65 if (test_clear_buffer_dirty(bh)) {
66 mark_buffer_async_write(bh);
67 } else {
68 unlock_buffer(bh);
69 }
70 } while ((bh = bh->b_this_page) != head);
71
72 /*
73 * The page and its buffers are protected by PageWriteback(), so we can
74 * drop the bh refcounts early.
75 */
76 BUG_ON(PageWriteback(page));
77 set_page_writeback(page);
78
79 do {
80 struct buffer_head *next = bh->b_this_page;
81 if (buffer_async_write(bh)) {
82 submit_bh(write_op, bh);
83 nr_underway++;
84 }
85 bh = next;
86 } while (bh != head);
87 unlock_page(page);
42 88
43static int gfs2_aspace_writepage(struct page *page, 89 err = 0;
44 struct writeback_control *wbc) 90 if (nr_underway == 0)
45{ 91 end_page_writeback(page);
46 return block_write_full_page(page, aspace_get_block, wbc); 92
93 return err;
47} 94}
48 95
49static const struct address_space_operations aspace_aops = { 96static const struct address_space_operations aspace_aops = {
@@ -201,16 +248,32 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
201int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, 248int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
202 struct buffer_head **bhp) 249 struct buffer_head **bhp)
203{ 250{
204 *bhp = gfs2_getbuf(gl, blkno, CREATE); 251 struct gfs2_sbd *sdp = gl->gl_sbd;
205 if (!buffer_uptodate(*bhp)) { 252 struct buffer_head *bh;
206 ll_rw_block(READ_META, 1, bhp); 253
207 if (flags & DIO_WAIT) { 254 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
208 int error = gfs2_meta_wait(gl->gl_sbd, *bhp); 255 return -EIO;
209 if (error) { 256
210 brelse(*bhp); 257 *bhp = bh = gfs2_getbuf(gl, blkno, CREATE);
211 return error; 258
212 } 259 lock_buffer(bh);
213 } 260 if (buffer_uptodate(bh)) {
261 unlock_buffer(bh);
262 return 0;
263 }
264 bh->b_end_io = end_buffer_read_sync;
265 get_bh(bh);
266 submit_bh(READ_SYNC | (1 << BIO_RW_META), bh);
267 if (!(flags & DIO_WAIT))
268 return 0;
269
270 wait_on_buffer(bh);
271 if (unlikely(!buffer_uptodate(bh))) {
272 struct gfs2_trans *tr = current->journal_info;
273 if (tr && tr->tr_touched)
274 gfs2_io_error_bh(sdp, bh);
275 brelse(bh);
276 return -EIO;
214 } 277 }
215 278
216 return 0; 279 return 0;
@@ -404,7 +467,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
404 if (buffer_uptodate(first_bh)) 467 if (buffer_uptodate(first_bh))
405 goto out; 468 goto out;
406 if (!buffer_locked(first_bh)) 469 if (!buffer_locked(first_bh))
407 ll_rw_block(READ_META, 1, &first_bh); 470 ll_rw_block(READ_SYNC | (1 << BIO_RW_META), 1, &first_bh);
408 471
409 dblock++; 472 dblock++;
410 extlen--; 473 extlen--;
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
deleted file mode 100644
index f7e8527a21e0..000000000000
--- a/fs/gfs2/mount.c
+++ /dev/null
@@ -1,185 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/slab.h>
11#include <linux/spinlock.h>
12#include <linux/completion.h>
13#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h>
15#include <linux/parser.h>
16
17#include "gfs2.h"
18#include "incore.h"
19#include "super.h"
20#include "sys.h"
21#include "util.h"
22
23enum {
24 Opt_lockproto,
25 Opt_locktable,
26 Opt_hostdata,
27 Opt_spectator,
28 Opt_ignore_local_fs,
29 Opt_localflocks,
30 Opt_localcaching,
31 Opt_debug,
32 Opt_nodebug,
33 Opt_upgrade,
34 Opt_acl,
35 Opt_noacl,
36 Opt_quota_off,
37 Opt_quota_account,
38 Opt_quota_on,
39 Opt_quota,
40 Opt_noquota,
41 Opt_suiddir,
42 Opt_nosuiddir,
43 Opt_data_writeback,
44 Opt_data_ordered,
45 Opt_meta,
46 Opt_discard,
47 Opt_nodiscard,
48 Opt_err,
49};
50
51static const match_table_t tokens = {
52 {Opt_lockproto, "lockproto=%s"},
53 {Opt_locktable, "locktable=%s"},
54 {Opt_hostdata, "hostdata=%s"},
55 {Opt_spectator, "spectator"},
56 {Opt_ignore_local_fs, "ignore_local_fs"},
57 {Opt_localflocks, "localflocks"},
58 {Opt_localcaching, "localcaching"},
59 {Opt_debug, "debug"},
60 {Opt_nodebug, "nodebug"},
61 {Opt_upgrade, "upgrade"},
62 {Opt_acl, "acl"},
63 {Opt_noacl, "noacl"},
64 {Opt_quota_off, "quota=off"},
65 {Opt_quota_account, "quota=account"},
66 {Opt_quota_on, "quota=on"},
67 {Opt_quota, "quota"},
68 {Opt_noquota, "noquota"},
69 {Opt_suiddir, "suiddir"},
70 {Opt_nosuiddir, "nosuiddir"},
71 {Opt_data_writeback, "data=writeback"},
72 {Opt_data_ordered, "data=ordered"},
73 {Opt_meta, "meta"},
74 {Opt_discard, "discard"},
75 {Opt_nodiscard, "nodiscard"},
76 {Opt_err, NULL}
77};
78
79/**
80 * gfs2_mount_args - Parse mount options
81 * @sdp:
82 * @data:
83 *
84 * Return: errno
85 */
86
87int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
88{
89 char *o;
90 int token;
91 substring_t tmp[MAX_OPT_ARGS];
92
93 /* Split the options into tokens with the "," character and
94 process them */
95
96 while (1) {
97 o = strsep(&options, ",");
98 if (o == NULL)
99 break;
100 if (*o == '\0')
101 continue;
102
103 token = match_token(o, tokens, tmp);
104 switch (token) {
105 case Opt_lockproto:
106 match_strlcpy(args->ar_lockproto, &tmp[0],
107 GFS2_LOCKNAME_LEN);
108 break;
109 case Opt_locktable:
110 match_strlcpy(args->ar_locktable, &tmp[0],
111 GFS2_LOCKNAME_LEN);
112 break;
113 case Opt_hostdata:
114 match_strlcpy(args->ar_hostdata, &tmp[0],
115 GFS2_LOCKNAME_LEN);
116 break;
117 case Opt_spectator:
118 args->ar_spectator = 1;
119 break;
120 case Opt_ignore_local_fs:
121 args->ar_ignore_local_fs = 1;
122 break;
123 case Opt_localflocks:
124 args->ar_localflocks = 1;
125 break;
126 case Opt_localcaching:
127 args->ar_localcaching = 1;
128 break;
129 case Opt_debug:
130 args->ar_debug = 1;
131 break;
132 case Opt_nodebug:
133 args->ar_debug = 0;
134 break;
135 case Opt_upgrade:
136 args->ar_upgrade = 1;
137 break;
138 case Opt_acl:
139 args->ar_posix_acl = 1;
140 break;
141 case Opt_noacl:
142 args->ar_posix_acl = 0;
143 break;
144 case Opt_quota_off:
145 case Opt_noquota:
146 args->ar_quota = GFS2_QUOTA_OFF;
147 break;
148 case Opt_quota_account:
149 args->ar_quota = GFS2_QUOTA_ACCOUNT;
150 break;
151 case Opt_quota_on:
152 case Opt_quota:
153 args->ar_quota = GFS2_QUOTA_ON;
154 break;
155 case Opt_suiddir:
156 args->ar_suiddir = 1;
157 break;
158 case Opt_nosuiddir:
159 args->ar_suiddir = 0;
160 break;
161 case Opt_data_writeback:
162 args->ar_data = GFS2_DATA_WRITEBACK;
163 break;
164 case Opt_data_ordered:
165 args->ar_data = GFS2_DATA_ORDERED;
166 break;
167 case Opt_meta:
168 args->ar_meta = 1;
169 break;
170 case Opt_discard:
171 args->ar_discard = 1;
172 break;
173 case Opt_nodiscard:
174 args->ar_discard = 0;
175 break;
176 case Opt_err:
177 default:
178 fs_info(sdp, "invalid mount option: %s\n", o);
179 return -EINVAL;
180 }
181 }
182
183 return 0;
184}
185
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
deleted file mode 100644
index 5da21285bba4..000000000000
--- a/fs/gfs2/ops_address.h
+++ /dev/null
@@ -1,23 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_ADDRESS_DOT_H__
11#define __OPS_ADDRESS_DOT_H__
12
13#include <linux/fs.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16
17extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
18extern int gfs2_internal_read(struct gfs2_inode *ip,
19 struct file_ra_state *ra_state,
20 char *buf, loff_t *pos, unsigned size);
21extern void gfs2_set_aops(struct inode *inode);
22
23#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1ff9473ea753..7bc3c45cd676 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -17,6 +17,7 @@
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/mount.h> 18#include <linux/mount.h>
19#include <linux/gfs2_ondisk.h> 19#include <linux/gfs2_ondisk.h>
20#include <linux/slow-work.h>
20 21
21#include "gfs2.h" 22#include "gfs2.h"
22#include "incore.h" 23#include "incore.h"
@@ -32,6 +33,7 @@
32#include "log.h" 33#include "log.h"
33#include "quota.h" 34#include "quota.h"
34#include "dir.h" 35#include "dir.h"
36#include "trace_gfs2.h"
35 37
36#define DO 0 38#define DO 0
37#define UNDO 1 39#define UNDO 1
@@ -55,8 +57,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
55 spin_lock_init(&gt->gt_spin); 57 spin_lock_init(&gt->gt_spin);
56 58
57 gt->gt_incore_log_blocks = 1024; 59 gt->gt_incore_log_blocks = 1024;
58 gt->gt_log_flush_secs = 60;
59 gt->gt_recoverd_secs = 60;
60 gt->gt_logd_secs = 1; 60 gt->gt_logd_secs = 1;
61 gt->gt_quota_simul_sync = 64; 61 gt->gt_quota_simul_sync = 64;
62 gt->gt_quota_warn_period = 10; 62 gt->gt_quota_warn_period = 10;
@@ -526,11 +526,11 @@ static int init_sb(struct gfs2_sbd *sdp, int silent)
526 } 526 }
527 527
528 /* Set up the buffer cache and SB for real */ 528 /* Set up the buffer cache and SB for real */
529 if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) { 529 if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) {
530 ret = -EINVAL; 530 ret = -EINVAL;
531 fs_err(sdp, "FS block size (%u) is too small for device " 531 fs_err(sdp, "FS block size (%u) is too small for device "
532 "block size (%u)\n", 532 "block size (%u)\n",
533 sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev)); 533 sdp->sd_sb.sb_bsize, bdev_logical_block_size(sb->s_bdev));
534 goto out; 534 goto out;
535 } 535 }
536 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { 536 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
@@ -676,6 +676,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
676 break; 676 break;
677 677
678 INIT_LIST_HEAD(&jd->extent_list); 678 INIT_LIST_HEAD(&jd->extent_list);
679 slow_work_init(&jd->jd_work, &gfs2_recover_ops);
679 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); 680 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
680 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { 681 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
681 if (!jd->jd_inode) 682 if (!jd->jd_inode)
@@ -701,14 +702,13 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
701{ 702{
702 struct inode *master = sdp->sd_master_dir->d_inode; 703 struct inode *master = sdp->sd_master_dir->d_inode;
703 struct gfs2_holder ji_gh; 704 struct gfs2_holder ji_gh;
704 struct task_struct *p;
705 struct gfs2_inode *ip; 705 struct gfs2_inode *ip;
706 int jindex = 1; 706 int jindex = 1;
707 int error = 0; 707 int error = 0;
708 708
709 if (undo) { 709 if (undo) {
710 jindex = 0; 710 jindex = 0;
711 goto fail_recoverd; 711 goto fail_jinode_gh;
712 } 712 }
713 713
714 sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); 714 sdp->sd_jindex = gfs2_lookup_simple(master, "jindex");
@@ -776,6 +776,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
776 /* Map the extents for this journal's blocks */ 776 /* Map the extents for this journal's blocks */
777 map_journal_extents(sdp); 777 map_journal_extents(sdp);
778 } 778 }
779 trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free));
779 780
780 if (sdp->sd_lockstruct.ls_first) { 781 if (sdp->sd_lockstruct.ls_first) {
781 unsigned int x; 782 unsigned int x;
@@ -801,18 +802,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
801 gfs2_glock_dq_uninit(&ji_gh); 802 gfs2_glock_dq_uninit(&ji_gh);
802 jindex = 0; 803 jindex = 0;
803 804
804 p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd");
805 error = IS_ERR(p);
806 if (error) {
807 fs_err(sdp, "can't start recoverd thread: %d\n", error);
808 goto fail_jinode_gh;
809 }
810 sdp->sd_recoverd_process = p;
811
812 return 0; 805 return 0;
813 806
814fail_recoverd:
815 kthread_stop(sdp->sd_recoverd_process);
816fail_jinode_gh: 807fail_jinode_gh:
817 if (!sdp->sd_args.ar_spectator) 808 if (!sdp->sd_args.ar_spectator)
818 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); 809 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
@@ -1165,6 +1156,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1165 1156
1166 sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT; 1157 sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
1167 sdp->sd_args.ar_data = GFS2_DATA_DEFAULT; 1158 sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
1159 sdp->sd_args.ar_commit = 60;
1168 1160
1169 error = gfs2_mount_args(sdp, &sdp->sd_args, data); 1161 error = gfs2_mount_args(sdp, &sdp->sd_args, data);
1170 if (error) { 1162 if (error) {
@@ -1172,8 +1164,10 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1172 goto fail; 1164 goto fail;
1173 } 1165 }
1174 1166
1175 if (sdp->sd_args.ar_spectator) 1167 if (sdp->sd_args.ar_spectator) {
1176 sb->s_flags |= MS_RDONLY; 1168 sb->s_flags |= MS_RDONLY;
1169 set_bit(SDF_NORECOVERY, &sdp->sd_flags);
1170 }
1177 if (sdp->sd_args.ar_posix_acl) 1171 if (sdp->sd_args.ar_posix_acl)
1178 sb->s_flags |= MS_POSIXACL; 1172 sb->s_flags |= MS_POSIXACL;
1179 1173
@@ -1191,6 +1185,8 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1191 GFS2_BASIC_BLOCK_SHIFT; 1185 GFS2_BASIC_BLOCK_SHIFT;
1192 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; 1186 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
1193 1187
1188 sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit;
1189
1194 error = init_names(sdp, silent); 1190 error = init_names(sdp, silent);
1195 if (error) 1191 if (error)
1196 goto fail; 1192 goto fail;
@@ -1279,9 +1275,22 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1279 return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); 1275 return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
1280} 1276}
1281 1277
1282static struct super_block *get_gfs2_sb(const char *dev_name) 1278static int test_meta_super(struct super_block *s, void *ptr)
1279{
1280 struct block_device *bdev = ptr;
1281 return (bdev == s->s_bdev);
1282}
1283
1284static int set_meta_super(struct super_block *s, void *ptr)
1283{ 1285{
1284 struct super_block *sb; 1286 return -EINVAL;
1287}
1288
1289static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
1290 const char *dev_name, void *data, struct vfsmount *mnt)
1291{
1292 struct super_block *s;
1293 struct gfs2_sbd *sdp;
1285 struct path path; 1294 struct path path;
1286 int error; 1295 int error;
1287 1296
@@ -1289,30 +1298,17 @@ static struct super_block *get_gfs2_sb(const char *dev_name)
1289 if (error) { 1298 if (error) {
1290 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", 1299 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
1291 dev_name, error); 1300 dev_name, error);
1292 return NULL; 1301 return error;
1293 } 1302 }
1294 sb = path.dentry->d_inode->i_sb; 1303 s = sget(&gfs2_fs_type, test_meta_super, set_meta_super,
1295 if (sb && (sb->s_type == &gfs2_fs_type)) 1304 path.dentry->d_inode->i_sb->s_bdev);
1296 atomic_inc(&sb->s_active);
1297 else
1298 sb = NULL;
1299 path_put(&path); 1305 path_put(&path);
1300 return sb; 1306 if (IS_ERR(s)) {
1301}
1302
1303static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
1304 const char *dev_name, void *data, struct vfsmount *mnt)
1305{
1306 struct super_block *sb = NULL;
1307 struct gfs2_sbd *sdp;
1308
1309 sb = get_gfs2_sb(dev_name);
1310 if (!sb) {
1311 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); 1307 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
1312 return -ENOENT; 1308 return PTR_ERR(s);
1313 } 1309 }
1314 sdp = sb->s_fs_info; 1310 sdp = s->s_fs_info;
1315 mnt->mnt_sb = sb; 1311 mnt->mnt_sb = s;
1316 mnt->mnt_root = dget(sdp->sd_master_dir); 1312 mnt->mnt_root = dget(sdp->sd_master_dir);
1317 return 0; 1313 return 0;
1318} 1314}
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1c70fa5168d6..f8bd20baf99c 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -262,6 +262,44 @@ out_parent:
262 return error; 262 return error;
263} 263}
264 264
265/*
266 * gfs2_unlink_ok - check to see that a inode is still in a directory
267 * @dip: the directory
268 * @name: the name of the file
269 * @ip: the inode
270 *
271 * Assumes that the lock on (at least) @dip is held.
272 *
273 * Returns: 0 if the parent/child relationship is correct, errno if it isn't
274 */
275
276static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
277 const struct gfs2_inode *ip)
278{
279 int error;
280
281 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
282 return -EPERM;
283
284 if ((dip->i_inode.i_mode & S_ISVTX) &&
285 dip->i_inode.i_uid != current_fsuid() &&
286 ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
287 return -EPERM;
288
289 if (IS_APPEND(&dip->i_inode))
290 return -EPERM;
291
292 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
293 if (error)
294 return error;
295
296 error = gfs2_dir_check(&dip->i_inode, name, ip);
297 if (error)
298 return error;
299
300 return 0;
301}
302
265/** 303/**
266 * gfs2_unlink - Unlink a file 304 * gfs2_unlink - Unlink a file
267 * @dir: The inode of the directory containing the file to unlink 305 * @dir: The inode of the directory containing the file to unlink
@@ -473,6 +511,59 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
473} 511}
474 512
475/** 513/**
514 * gfs2_rmdiri - Remove a directory
515 * @dip: The parent directory of the directory to be removed
516 * @name: The name of the directory to be removed
517 * @ip: The GFS2 inode of the directory to be removed
518 *
519 * Assumes Glocks on dip and ip are held
520 *
521 * Returns: errno
522 */
523
524static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
525 struct gfs2_inode *ip)
526{
527 struct qstr dotname;
528 int error;
529
530 if (ip->i_entries != 2) {
531 if (gfs2_consist_inode(ip))
532 gfs2_dinode_print(ip);
533 return -EIO;
534 }
535
536 error = gfs2_dir_del(dip, name);
537 if (error)
538 return error;
539
540 error = gfs2_change_nlink(dip, -1);
541 if (error)
542 return error;
543
544 gfs2_str2qstr(&dotname, ".");
545 error = gfs2_dir_del(ip, &dotname);
546 if (error)
547 return error;
548
549 gfs2_str2qstr(&dotname, "..");
550 error = gfs2_dir_del(ip, &dotname);
551 if (error)
552 return error;
553
554 /* It looks odd, but it really should be done twice */
555 error = gfs2_change_nlink(ip, -1);
556 if (error)
557 return error;
558
559 error = gfs2_change_nlink(ip, -1);
560 if (error)
561 return error;
562
563 return error;
564}
565
566/**
476 * gfs2_rmdir - Remove a directory 567 * gfs2_rmdir - Remove a directory
477 * @dir: The parent directory of the directory to be removed 568 * @dir: The parent directory of the directory to be removed
478 * @dentry: The dentry of the directory to remove 569 * @dentry: The dentry of the directory to remove
@@ -885,6 +976,61 @@ out:
885} 976}
886 977
887/** 978/**
979 * gfs2_readlinki - return the contents of a symlink
980 * @ip: the symlink's inode
981 * @buf: a pointer to the buffer to be filled
982 * @len: a pointer to the length of @buf
983 *
984 * If @buf is too small, a piece of memory is kmalloc()ed and needs
985 * to be freed by the caller.
986 *
987 * Returns: errno
988 */
989
990static int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
991{
992 struct gfs2_holder i_gh;
993 struct buffer_head *dibh;
994 unsigned int x;
995 int error;
996
997 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
998 error = gfs2_glock_nq(&i_gh);
999 if (error) {
1000 gfs2_holder_uninit(&i_gh);
1001 return error;
1002 }
1003
1004 if (!ip->i_disksize) {
1005 gfs2_consist_inode(ip);
1006 error = -EIO;
1007 goto out;
1008 }
1009
1010 error = gfs2_meta_inode_buffer(ip, &dibh);
1011 if (error)
1012 goto out;
1013
1014 x = ip->i_disksize + 1;
1015 if (x > *len) {
1016 *buf = kmalloc(x, GFP_NOFS);
1017 if (!*buf) {
1018 error = -ENOMEM;
1019 goto out_brelse;
1020 }
1021 }
1022
1023 memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
1024 *len = x;
1025
1026out_brelse:
1027 brelse(dibh);
1028out:
1029 gfs2_glock_dq_uninit(&i_gh);
1030 return error;
1031}
1032
1033/**
888 * gfs2_readlink - Read the value of a symlink 1034 * gfs2_readlink - Read the value of a symlink
889 * @dentry: the symlink 1035 * @dentry: the symlink
890 * @buf: the buffer to read the symlink data into 1036 * @buf: the buffer to read the symlink data into
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
deleted file mode 100644
index 458019569dcb..000000000000
--- a/fs/gfs2/ops_super.c
+++ /dev/null
@@ -1,723 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/statfs.h>
16#include <linux/seq_file.h>
17#include <linux/mount.h>
18#include <linux/kthread.h>
19#include <linux/delay.h>
20#include <linux/gfs2_ondisk.h>
21#include <linux/crc32.h>
22#include <linux/time.h>
23
24#include "gfs2.h"
25#include "incore.h"
26#include "glock.h"
27#include "inode.h"
28#include "log.h"
29#include "quota.h"
30#include "recovery.h"
31#include "rgrp.h"
32#include "super.h"
33#include "sys.h"
34#include "util.h"
35#include "trans.h"
36#include "dir.h"
37#include "eattr.h"
38#include "bmap.h"
39#include "meta_io.h"
40
41#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
42
43/**
44 * gfs2_write_inode - Make sure the inode is stable on the disk
45 * @inode: The inode
46 * @sync: synchronous write flag
47 *
48 * Returns: errno
49 */
50
51static int gfs2_write_inode(struct inode *inode, int sync)
52{
53 struct gfs2_inode *ip = GFS2_I(inode);
54 struct gfs2_sbd *sdp = GFS2_SB(inode);
55 struct gfs2_holder gh;
56 struct buffer_head *bh;
57 struct timespec atime;
58 struct gfs2_dinode *di;
59 int ret = 0;
60
61 /* Check this is a "normal" inode, etc */
62 if (!test_bit(GIF_USER, &ip->i_flags) ||
63 (current->flags & PF_MEMALLOC))
64 return 0;
65 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
66 if (ret)
67 goto do_flush;
68 ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
69 if (ret)
70 goto do_unlock;
71 ret = gfs2_meta_inode_buffer(ip, &bh);
72 if (ret == 0) {
73 di = (struct gfs2_dinode *)bh->b_data;
74 atime.tv_sec = be64_to_cpu(di->di_atime);
75 atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
76 if (timespec_compare(&inode->i_atime, &atime) > 0) {
77 gfs2_trans_add_bh(ip->i_gl, bh, 1);
78 gfs2_dinode_out(ip, bh->b_data);
79 }
80 brelse(bh);
81 }
82 gfs2_trans_end(sdp);
83do_unlock:
84 gfs2_glock_dq_uninit(&gh);
85do_flush:
86 if (sync != 0)
87 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
88 return ret;
89}
90
91/**
92 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
93 * @sdp: the filesystem
94 *
95 * Returns: errno
96 */
97
98static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
99{
100 struct gfs2_holder t_gh;
101 int error;
102
103 gfs2_quota_sync(sdp);
104 gfs2_statfs_sync(sdp);
105
106 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
107 &t_gh);
108 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
109 return error;
110
111 gfs2_meta_syncfs(sdp);
112 gfs2_log_shutdown(sdp);
113
114 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
115
116 if (t_gh.gh_gl)
117 gfs2_glock_dq_uninit(&t_gh);
118
119 gfs2_quota_cleanup(sdp);
120
121 return error;
122}
123
124/**
125 * gfs2_put_super - Unmount the filesystem
126 * @sb: The VFS superblock
127 *
128 */
129
130static void gfs2_put_super(struct super_block *sb)
131{
132 struct gfs2_sbd *sdp = sb->s_fs_info;
133 int error;
134
135 /* Unfreeze the filesystem, if we need to */
136
137 mutex_lock(&sdp->sd_freeze_lock);
138 if (sdp->sd_freeze_count)
139 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
140 mutex_unlock(&sdp->sd_freeze_lock);
141
142 kthread_stop(sdp->sd_quotad_process);
143 kthread_stop(sdp->sd_logd_process);
144 kthread_stop(sdp->sd_recoverd_process);
145
146 if (!(sb->s_flags & MS_RDONLY)) {
147 error = gfs2_make_fs_ro(sdp);
148 if (error)
149 gfs2_io_error(sdp);
150 }
151 /* At this point, we're through modifying the disk */
152
153 /* Release stuff */
154
155 iput(sdp->sd_jindex);
156 iput(sdp->sd_inum_inode);
157 iput(sdp->sd_statfs_inode);
158 iput(sdp->sd_rindex);
159 iput(sdp->sd_quota_inode);
160
161 gfs2_glock_put(sdp->sd_rename_gl);
162 gfs2_glock_put(sdp->sd_trans_gl);
163
164 if (!sdp->sd_args.ar_spectator) {
165 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
166 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
167 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
168 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
169 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
170 iput(sdp->sd_ir_inode);
171 iput(sdp->sd_sc_inode);
172 iput(sdp->sd_qc_inode);
173 }
174
175 gfs2_glock_dq_uninit(&sdp->sd_live_gh);
176 gfs2_clear_rgrpd(sdp);
177 gfs2_jindex_free(sdp);
178 /* Take apart glock structures and buffer lists */
179 gfs2_gl_hash_clear(sdp);
180 /* Unmount the locking protocol */
181 gfs2_lm_unmount(sdp);
182
183 /* At this point, we're through participating in the lockspace */
184 gfs2_sys_fs_del(sdp);
185}
186
187/**
188 * gfs2_write_super
189 * @sb: the superblock
190 *
191 */
192
193static void gfs2_write_super(struct super_block *sb)
194{
195 sb->s_dirt = 0;
196}
197
198/**
199 * gfs2_sync_fs - sync the filesystem
200 * @sb: the superblock
201 *
202 * Flushes the log to disk.
203 */
204
205static int gfs2_sync_fs(struct super_block *sb, int wait)
206{
207 sb->s_dirt = 0;
208 if (wait && sb->s_fs_info)
209 gfs2_log_flush(sb->s_fs_info, NULL);
210 return 0;
211}
212
213/**
214 * gfs2_freeze - prevent further writes to the filesystem
215 * @sb: the VFS structure for the filesystem
216 *
217 */
218
219static int gfs2_freeze(struct super_block *sb)
220{
221 struct gfs2_sbd *sdp = sb->s_fs_info;
222 int error;
223
224 if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
225 return -EINVAL;
226
227 for (;;) {
228 error = gfs2_freeze_fs(sdp);
229 if (!error)
230 break;
231
232 switch (error) {
233 case -EBUSY:
234 fs_err(sdp, "waiting for recovery before freeze\n");
235 break;
236
237 default:
238 fs_err(sdp, "error freezing FS: %d\n", error);
239 break;
240 }
241
242 fs_err(sdp, "retrying...\n");
243 msleep(1000);
244 }
245 return 0;
246}
247
248/**
249 * gfs2_unfreeze - reallow writes to the filesystem
250 * @sb: the VFS structure for the filesystem
251 *
252 */
253
254static int gfs2_unfreeze(struct super_block *sb)
255{
256 gfs2_unfreeze_fs(sb->s_fs_info);
257 return 0;
258}
259
260/**
261 * statfs_fill - fill in the sg for a given RG
262 * @rgd: the RG
263 * @sc: the sc structure
264 *
265 * Returns: 0 on success, -ESTALE if the LVB is invalid
266 */
267
268static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
269 struct gfs2_statfs_change_host *sc)
270{
271 gfs2_rgrp_verify(rgd);
272 sc->sc_total += rgd->rd_data;
273 sc->sc_free += rgd->rd_free;
274 sc->sc_dinodes += rgd->rd_dinodes;
275 return 0;
276}
277
278/**
279 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
280 * @sdp: the filesystem
281 * @sc: the sc info that will be returned
282 *
283 * Any error (other than a signal) will cause this routine to fall back
284 * to the synchronous version.
285 *
286 * FIXME: This really shouldn't busy wait like this.
287 *
288 * Returns: errno
289 */
290
291static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
292{
293 struct gfs2_holder ri_gh;
294 struct gfs2_rgrpd *rgd_next;
295 struct gfs2_holder *gha, *gh;
296 unsigned int slots = 64;
297 unsigned int x;
298 int done;
299 int error = 0, err;
300
301 memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
302 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
303 if (!gha)
304 return -ENOMEM;
305
306 error = gfs2_rindex_hold(sdp, &ri_gh);
307 if (error)
308 goto out;
309
310 rgd_next = gfs2_rgrpd_get_first(sdp);
311
312 for (;;) {
313 done = 1;
314
315 for (x = 0; x < slots; x++) {
316 gh = gha + x;
317
318 if (gh->gh_gl && gfs2_glock_poll(gh)) {
319 err = gfs2_glock_wait(gh);
320 if (err) {
321 gfs2_holder_uninit(gh);
322 error = err;
323 } else {
324 if (!error)
325 error = statfs_slow_fill(
326 gh->gh_gl->gl_object, sc);
327 gfs2_glock_dq_uninit(gh);
328 }
329 }
330
331 if (gh->gh_gl)
332 done = 0;
333 else if (rgd_next && !error) {
334 error = gfs2_glock_nq_init(rgd_next->rd_gl,
335 LM_ST_SHARED,
336 GL_ASYNC,
337 gh);
338 rgd_next = gfs2_rgrpd_get_next(rgd_next);
339 done = 0;
340 }
341
342 if (signal_pending(current))
343 error = -ERESTARTSYS;
344 }
345
346 if (done)
347 break;
348
349 yield();
350 }
351
352 gfs2_glock_dq_uninit(&ri_gh);
353
354out:
355 kfree(gha);
356 return error;
357}
358
359/**
360 * gfs2_statfs_i - Do a statfs
361 * @sdp: the filesystem
362 * @sg: the sg structure
363 *
364 * Returns: errno
365 */
366
367static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
368{
369 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
370 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
371
372 spin_lock(&sdp->sd_statfs_spin);
373
374 *sc = *m_sc;
375 sc->sc_total += l_sc->sc_total;
376 sc->sc_free += l_sc->sc_free;
377 sc->sc_dinodes += l_sc->sc_dinodes;
378
379 spin_unlock(&sdp->sd_statfs_spin);
380
381 if (sc->sc_free < 0)
382 sc->sc_free = 0;
383 if (sc->sc_free > sc->sc_total)
384 sc->sc_free = sc->sc_total;
385 if (sc->sc_dinodes < 0)
386 sc->sc_dinodes = 0;
387
388 return 0;
389}
390
391/**
392 * gfs2_statfs - Gather and return stats about the filesystem
393 * @sb: The superblock
394 * @statfsbuf: The buffer
395 *
396 * Returns: 0 on success or error code
397 */
398
399static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
400{
401 struct super_block *sb = dentry->d_inode->i_sb;
402 struct gfs2_sbd *sdp = sb->s_fs_info;
403 struct gfs2_statfs_change_host sc;
404 int error;
405
406 if (gfs2_tune_get(sdp, gt_statfs_slow))
407 error = gfs2_statfs_slow(sdp, &sc);
408 else
409 error = gfs2_statfs_i(sdp, &sc);
410
411 if (error)
412 return error;
413
414 buf->f_type = GFS2_MAGIC;
415 buf->f_bsize = sdp->sd_sb.sb_bsize;
416 buf->f_blocks = sc.sc_total;
417 buf->f_bfree = sc.sc_free;
418 buf->f_bavail = sc.sc_free;
419 buf->f_files = sc.sc_dinodes + sc.sc_free;
420 buf->f_ffree = sc.sc_free;
421 buf->f_namelen = GFS2_FNAMESIZE;
422
423 return 0;
424}
425
426/**
427 * gfs2_remount_fs - called when the FS is remounted
428 * @sb: the filesystem
429 * @flags: the remount flags
430 * @data: extra data passed in (not used right now)
431 *
432 * Returns: errno
433 */
434
435static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
436{
437 struct gfs2_sbd *sdp = sb->s_fs_info;
438 struct gfs2_args args = sdp->sd_args; /* Default to current settings */
439 int error;
440
441 error = gfs2_mount_args(sdp, &args, data);
442 if (error)
443 return error;
444
445 /* Not allowed to change locking details */
446 if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) ||
447 strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) ||
448 strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata))
449 return -EINVAL;
450
451 /* Some flags must not be changed */
452 if (args_neq(&args, &sdp->sd_args, spectator) ||
453 args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
454 args_neq(&args, &sdp->sd_args, localflocks) ||
455 args_neq(&args, &sdp->sd_args, localcaching) ||
456 args_neq(&args, &sdp->sd_args, meta))
457 return -EINVAL;
458
459 if (sdp->sd_args.ar_spectator)
460 *flags |= MS_RDONLY;
461
462 if ((sb->s_flags ^ *flags) & MS_RDONLY) {
463 if (*flags & MS_RDONLY)
464 error = gfs2_make_fs_ro(sdp);
465 else
466 error = gfs2_make_fs_rw(sdp);
467 if (error)
468 return error;
469 }
470
471 sdp->sd_args = args;
472 if (sdp->sd_args.ar_posix_acl)
473 sb->s_flags |= MS_POSIXACL;
474 else
475 sb->s_flags &= ~MS_POSIXACL;
476 return 0;
477}
478
479/**
480 * gfs2_drop_inode - Drop an inode (test for remote unlink)
481 * @inode: The inode to drop
482 *
483 * If we've received a callback on an iopen lock then its because a
484 * remote node tried to deallocate the inode but failed due to this node
485 * still having the inode open. Here we mark the link count zero
486 * since we know that it must have reached zero if the GLF_DEMOTE flag
487 * is set on the iopen glock. If we didn't do a disk read since the
488 * remote node removed the final link then we might otherwise miss
489 * this event. This check ensures that this node will deallocate the
490 * inode's blocks, or alternatively pass the baton on to another
491 * node for later deallocation.
492 */
493
494static void gfs2_drop_inode(struct inode *inode)
495{
496 struct gfs2_inode *ip = GFS2_I(inode);
497
498 if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) {
499 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
500 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
501 clear_nlink(inode);
502 }
503 generic_drop_inode(inode);
504}
505
506/**
507 * gfs2_clear_inode - Deallocate an inode when VFS is done with it
508 * @inode: The VFS inode
509 *
510 */
511
512static void gfs2_clear_inode(struct inode *inode)
513{
514 struct gfs2_inode *ip = GFS2_I(inode);
515
516 /* This tells us its a "real" inode and not one which only
517 * serves to contain an address space (see rgrp.c, meta_io.c)
518 * which therefore doesn't have its own glocks.
519 */
520 if (test_bit(GIF_USER, &ip->i_flags)) {
521 ip->i_gl->gl_object = NULL;
522 gfs2_glock_put(ip->i_gl);
523 ip->i_gl = NULL;
524 if (ip->i_iopen_gh.gh_gl) {
525 ip->i_iopen_gh.gh_gl->gl_object = NULL;
526 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
527 }
528 }
529}
530
531static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
532{
533 do {
534 if (d1 == d2)
535 return 1;
536 d1 = d1->d_parent;
537 } while (!IS_ROOT(d1));
538 return 0;
539}
540
541/**
542 * gfs2_show_options - Show mount options for /proc/mounts
543 * @s: seq_file structure
544 * @mnt: vfsmount
545 *
546 * Returns: 0 on success or error code
547 */
548
549static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
550{
551 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
552 struct gfs2_args *args = &sdp->sd_args;
553
554 if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
555 seq_printf(s, ",meta");
556 if (args->ar_lockproto[0])
557 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
558 if (args->ar_locktable[0])
559 seq_printf(s, ",locktable=%s", args->ar_locktable);
560 if (args->ar_hostdata[0])
561 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
562 if (args->ar_spectator)
563 seq_printf(s, ",spectator");
564 if (args->ar_ignore_local_fs)
565 seq_printf(s, ",ignore_local_fs");
566 if (args->ar_localflocks)
567 seq_printf(s, ",localflocks");
568 if (args->ar_localcaching)
569 seq_printf(s, ",localcaching");
570 if (args->ar_debug)
571 seq_printf(s, ",debug");
572 if (args->ar_upgrade)
573 seq_printf(s, ",upgrade");
574 if (args->ar_posix_acl)
575 seq_printf(s, ",acl");
576 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
577 char *state;
578 switch (args->ar_quota) {
579 case GFS2_QUOTA_OFF:
580 state = "off";
581 break;
582 case GFS2_QUOTA_ACCOUNT:
583 state = "account";
584 break;
585 case GFS2_QUOTA_ON:
586 state = "on";
587 break;
588 default:
589 state = "unknown";
590 break;
591 }
592 seq_printf(s, ",quota=%s", state);
593 }
594 if (args->ar_suiddir)
595 seq_printf(s, ",suiddir");
596 if (args->ar_data != GFS2_DATA_DEFAULT) {
597 char *state;
598 switch (args->ar_data) {
599 case GFS2_DATA_WRITEBACK:
600 state = "writeback";
601 break;
602 case GFS2_DATA_ORDERED:
603 state = "ordered";
604 break;
605 default:
606 state = "unknown";
607 break;
608 }
609 seq_printf(s, ",data=%s", state);
610 }
611 if (args->ar_discard)
612 seq_printf(s, ",discard");
613
614 return 0;
615}
616
617/*
618 * We have to (at the moment) hold the inodes main lock to cover
619 * the gap between unlocking the shared lock on the iopen lock and
620 * taking the exclusive lock. I'd rather do a shared -> exclusive
621 * conversion on the iopen lock, but we can change that later. This
622 * is safe, just less efficient.
623 */
624
625static void gfs2_delete_inode(struct inode *inode)
626{
627 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
628 struct gfs2_inode *ip = GFS2_I(inode);
629 struct gfs2_holder gh;
630 int error;
631
632 if (!test_bit(GIF_USER, &ip->i_flags))
633 goto out;
634
635 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
636 if (unlikely(error)) {
637 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
638 goto out;
639 }
640
641 gfs2_glock_dq_wait(&ip->i_iopen_gh);
642 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
643 error = gfs2_glock_nq(&ip->i_iopen_gh);
644 if (error)
645 goto out_truncate;
646
647 if (S_ISDIR(inode->i_mode) &&
648 (ip->i_diskflags & GFS2_DIF_EXHASH)) {
649 error = gfs2_dir_exhash_dealloc(ip);
650 if (error)
651 goto out_unlock;
652 }
653
654 if (ip->i_eattr) {
655 error = gfs2_ea_dealloc(ip);
656 if (error)
657 goto out_unlock;
658 }
659
660 if (!gfs2_is_stuffed(ip)) {
661 error = gfs2_file_dealloc(ip);
662 if (error)
663 goto out_unlock;
664 }
665
666 error = gfs2_dinode_dealloc(ip);
667 if (error)
668 goto out_unlock;
669
670out_truncate:
671 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
672 if (error)
673 goto out_unlock;
674 /* Needs to be done before glock release & also in a transaction */
675 truncate_inode_pages(&inode->i_data, 0);
676 gfs2_trans_end(sdp);
677
678out_unlock:
679 if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
680 gfs2_glock_dq(&ip->i_iopen_gh);
681 gfs2_holder_uninit(&ip->i_iopen_gh);
682 gfs2_glock_dq_uninit(&gh);
683 if (error && error != GLR_TRYFAILED)
684 fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
685out:
686 truncate_inode_pages(&inode->i_data, 0);
687 clear_inode(inode);
688}
689
690static struct inode *gfs2_alloc_inode(struct super_block *sb)
691{
692 struct gfs2_inode *ip;
693
694 ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
695 if (ip) {
696 ip->i_flags = 0;
697 ip->i_gl = NULL;
698 }
699 return &ip->i_inode;
700}
701
702static void gfs2_destroy_inode(struct inode *inode)
703{
704 kmem_cache_free(gfs2_inode_cachep, inode);
705}
706
707const struct super_operations gfs2_super_ops = {
708 .alloc_inode = gfs2_alloc_inode,
709 .destroy_inode = gfs2_destroy_inode,
710 .write_inode = gfs2_write_inode,
711 .delete_inode = gfs2_delete_inode,
712 .put_super = gfs2_put_super,
713 .write_super = gfs2_write_super,
714 .sync_fs = gfs2_sync_fs,
715 .freeze_fs = gfs2_freeze,
716 .unfreeze_fs = gfs2_unfreeze,
717 .statfs = gfs2_statfs,
718 .remount_fs = gfs2_remount_fs,
719 .clear_inode = gfs2_clear_inode,
720 .drop_inode = gfs2_drop_inode,
721 .show_options = gfs2_show_options,
722};
723
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 152e6c4a0dca..2e9b9326bfc9 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -60,7 +60,6 @@
60#include "super.h" 60#include "super.h"
61#include "trans.h" 61#include "trans.h"
62#include "inode.h" 62#include "inode.h"
63#include "ops_address.h"
64#include "util.h" 63#include "util.h"
65 64
66#define QUOTA_USER 1 65#define QUOTA_USER 1
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 247e8f7d6b3d..59d2695509d3 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -13,8 +13,7 @@
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/kthread.h> 16#include <linux/slow-work.h>
17#include <linux/freezer.h>
18 17
19#include "gfs2.h" 18#include "gfs2.h"
20#include "incore.h" 19#include "incore.h"
@@ -441,18 +440,25 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
441 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); 440 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
442} 441}
443 442
444/** 443static int gfs2_recover_get_ref(struct slow_work *work)
445 * gfs2_recover_journal - recover a given journal 444{
446 * @jd: the struct gfs2_jdesc describing the journal 445 struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
447 * 446 if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
448 * Acquire the journal's lock, check to see if the journal is clean, and 447 return -EBUSY;
449 * do recovery if necessary. 448 return 0;
450 * 449}
451 * Returns: errno
452 */
453 450
454int gfs2_recover_journal(struct gfs2_jdesc *jd) 451static void gfs2_recover_put_ref(struct slow_work *work)
452{
453 struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
454 clear_bit(JDF_RECOVERY, &jd->jd_flags);
455 smp_mb__after_clear_bit();
456 wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
457}
458
459static void gfs2_recover_work(struct slow_work *work)
455{ 460{
461 struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
456 struct gfs2_inode *ip = GFS2_I(jd->jd_inode); 462 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
457 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); 463 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
458 struct gfs2_log_header_host head; 464 struct gfs2_log_header_host head;
@@ -569,7 +575,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
569 gfs2_glock_dq_uninit(&j_gh); 575 gfs2_glock_dq_uninit(&j_gh);
570 576
571 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); 577 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
572 return 0; 578 return;
573 579
574fail_gunlock_tr: 580fail_gunlock_tr:
575 gfs2_glock_dq_uninit(&t_gh); 581 gfs2_glock_dq_uninit(&t_gh);
@@ -584,70 +590,28 @@ fail_gunlock_j:
584 590
585fail: 591fail:
586 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); 592 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
587 return error;
588} 593}
589 594
590static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp) 595struct slow_work_ops gfs2_recover_ops = {
591{ 596 .get_ref = gfs2_recover_get_ref,
592 struct gfs2_jdesc *jd; 597 .put_ref = gfs2_recover_put_ref,
593 int found = 0; 598 .execute = gfs2_recover_work,
594 599};
595 spin_lock(&sdp->sd_jindex_spin);
596 600
597 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
598 if (jd->jd_dirty) {
599 jd->jd_dirty = 0;
600 found = 1;
601 break;
602 }
603 }
604 spin_unlock(&sdp->sd_jindex_spin);
605
606 if (!found)
607 jd = NULL;
608 601
609 return jd; 602static int gfs2_recovery_wait(void *word)
610}
611
612/**
613 * gfs2_check_journals - Recover any dirty journals
614 * @sdp: the filesystem
615 *
616 */
617
618static void gfs2_check_journals(struct gfs2_sbd *sdp)
619{ 603{
620 struct gfs2_jdesc *jd; 604 schedule();
621 605 return 0;
622 for (;;) {
623 jd = gfs2_jdesc_find_dirty(sdp);
624 if (!jd)
625 break;
626
627 if (jd != sdp->sd_jdesc)
628 gfs2_recover_journal(jd);
629 }
630} 606}
631 607
632/** 608int gfs2_recover_journal(struct gfs2_jdesc *jd)
633 * gfs2_recoverd - Recover dead machine's journals
634 * @sdp: Pointer to GFS2 superblock
635 *
636 */
637
638int gfs2_recoverd(void *data)
639{ 609{
640 struct gfs2_sbd *sdp = data; 610 int rv;
641 unsigned long t; 611 rv = slow_work_enqueue(&jd->jd_work);
642 612 if (rv)
643 while (!kthread_should_stop()) { 613 return rv;
644 gfs2_check_journals(sdp); 614 wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE);
645 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
646 if (freezing(current))
647 refrigerator();
648 schedule_timeout_interruptible(t);
649 }
650
651 return 0; 615 return 0;
652} 616}
653 617
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index a8218ea15b57..1616ac22569a 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -28,7 +28,7 @@ extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
28extern int gfs2_find_jhead(struct gfs2_jdesc *jd, 28extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
29 struct gfs2_log_header_host *head); 29 struct gfs2_log_header_host *head);
30extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); 30extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
31extern int gfs2_recoverd(void *data); 31extern struct slow_work_ops gfs2_recover_ops;
32 32
33#endif /* __RECOVERY_DOT_H__ */ 33#endif /* __RECOVERY_DOT_H__ */
34 34
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 565038243fa2..daa4ae341a29 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -29,7 +29,7 @@
29#include "util.h" 29#include "util.h"
30#include "log.h" 30#include "log.h"
31#include "inode.h" 31#include "inode.h"
32#include "ops_address.h" 32#include "trace_gfs2.h"
33 33
34#define BFITNOENT ((u32)~0) 34#define BFITNOENT ((u32)~0)
35#define NO_BLOCK ((u64)~0) 35#define NO_BLOCK ((u64)~0)
@@ -442,6 +442,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
442 for (x = 0; x < length; x++) { 442 for (x = 0; x < length; x++) {
443 bi = rgd->rd_bits + x; 443 bi = rgd->rd_bits + x;
444 444
445 bi->bi_flags = 0;
445 /* small rgrp; bitmap stored completely in header block */ 446 /* small rgrp; bitmap stored completely in header block */
446 if (length == 1) { 447 if (length == 1) {
447 bytes = bytes_left; 448 bytes = bytes_left;
@@ -580,7 +581,6 @@ static int read_rindex_entry(struct gfs2_inode *ip,
580 581
581 rgd->rd_gl->gl_object = rgd; 582 rgd->rd_gl->gl_object = rgd;
582 rgd->rd_flags &= ~GFS2_RDF_UPTODATE; 583 rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
583 rgd->rd_flags |= GFS2_RDF_CHECK;
584 return error; 584 return error;
585} 585}
586 586
@@ -701,10 +701,9 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
701 u32 rg_flags; 701 u32 rg_flags;
702 702
703 rg_flags = be32_to_cpu(str->rg_flags); 703 rg_flags = be32_to_cpu(str->rg_flags);
704 if (rg_flags & GFS2_RGF_NOALLOC) 704 rg_flags &= ~GFS2_RDF_MASK;
705 rgd->rd_flags |= GFS2_RDF_NOALLOC; 705 rgd->rd_flags &= GFS2_RDF_MASK;
706 else 706 rgd->rd_flags |= rg_flags;
707 rgd->rd_flags &= ~GFS2_RDF_NOALLOC;
708 rgd->rd_free = be32_to_cpu(str->rg_free); 707 rgd->rd_free = be32_to_cpu(str->rg_free);
709 rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes); 708 rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
710 rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration); 709 rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
@@ -713,11 +712,8 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
713static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) 712static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
714{ 713{
715 struct gfs2_rgrp *str = buf; 714 struct gfs2_rgrp *str = buf;
716 u32 rg_flags = 0;
717 715
718 if (rgd->rd_flags & GFS2_RDF_NOALLOC) 716 str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
719 rg_flags |= GFS2_RGF_NOALLOC;
720 str->rg_flags = cpu_to_be32(rg_flags);
721 str->rg_free = cpu_to_be32(rgd->rd_free); 717 str->rg_free = cpu_to_be32(rgd->rd_free);
722 str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes); 718 str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
723 str->__pad = cpu_to_be32(0); 719 str->__pad = cpu_to_be32(0);
@@ -775,8 +771,10 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
775 } 771 }
776 772
777 if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) { 773 if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
774 for (x = 0; x < length; x++)
775 clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
778 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); 776 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
779 rgd->rd_flags |= GFS2_RDF_UPTODATE; 777 rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
780 } 778 }
781 779
782 spin_lock(&sdp->sd_rindex_spin); 780 spin_lock(&sdp->sd_rindex_spin);
@@ -845,7 +843,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
845 struct super_block *sb = sdp->sd_vfs; 843 struct super_block *sb = sdp->sd_vfs;
846 struct block_device *bdev = sb->s_bdev; 844 struct block_device *bdev = sb->s_bdev;
847 const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize / 845 const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize /
848 bdev_hardsect_size(sb->s_bdev); 846 bdev_logical_block_size(sb->s_bdev);
849 u64 blk; 847 u64 blk;
850 sector_t start = 0; 848 sector_t start = 0;
851 sector_t nr_sects = 0; 849 sector_t nr_sects = 0;
@@ -903,6 +901,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
903 continue; 901 continue;
904 if (sdp->sd_args.ar_discard) 902 if (sdp->sd_args.ar_discard)
905 gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi); 903 gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi);
904 clear_bit(GBF_FULL, &bi->bi_flags);
906 memcpy(bi->bi_clone + bi->bi_offset, 905 memcpy(bi->bi_clone + bi->bi_offset,
907 bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); 906 bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
908 } 907 }
@@ -942,7 +941,7 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
942 struct gfs2_sbd *sdp = rgd->rd_sbd; 941 struct gfs2_sbd *sdp = rgd->rd_sbd;
943 int ret = 0; 942 int ret = 0;
944 943
945 if (rgd->rd_flags & GFS2_RDF_NOALLOC) 944 if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
946 return 0; 945 return 0;
947 946
948 spin_lock(&sdp->sd_rindex_spin); 947 spin_lock(&sdp->sd_rindex_spin);
@@ -1315,30 +1314,37 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
1315{ 1314{
1316 struct gfs2_bitmap *bi = NULL; 1315 struct gfs2_bitmap *bi = NULL;
1317 const u32 length = rgd->rd_length; 1316 const u32 length = rgd->rd_length;
1318 u32 blk = 0; 1317 u32 blk = BFITNOENT;
1319 unsigned int buf, x; 1318 unsigned int buf, x;
1320 const unsigned int elen = *n; 1319 const unsigned int elen = *n;
1321 const u8 *buffer; 1320 const u8 *buffer = NULL;
1322 1321
1323 *n = 0; 1322 *n = 0;
1324 /* Find bitmap block that contains bits for goal block */ 1323 /* Find bitmap block that contains bits for goal block */
1325 for (buf = 0; buf < length; buf++) { 1324 for (buf = 0; buf < length; buf++) {
1326 bi = rgd->rd_bits + buf; 1325 bi = rgd->rd_bits + buf;
1327 if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) 1326 /* Convert scope of "goal" from rgrp-wide to within found bit block */
1328 break; 1327 if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) {
1328 goal -= bi->bi_start * GFS2_NBBY;
1329 goto do_search;
1330 }
1329 } 1331 }
1332 buf = 0;
1333 goal = 0;
1330 1334
1331 gfs2_assert(rgd->rd_sbd, buf < length); 1335do_search:
1332
1333 /* Convert scope of "goal" from rgrp-wide to within found bit block */
1334 goal -= bi->bi_start * GFS2_NBBY;
1335
1336 /* Search (up to entire) bitmap in this rgrp for allocatable block. 1336 /* Search (up to entire) bitmap in this rgrp for allocatable block.
1337 "x <= length", instead of "x < length", because we typically start 1337 "x <= length", instead of "x < length", because we typically start
1338 the search in the middle of a bit block, but if we can't find an 1338 the search in the middle of a bit block, but if we can't find an
1339 allocatable block anywhere else, we want to be able wrap around and 1339 allocatable block anywhere else, we want to be able wrap around and
1340 search in the first part of our first-searched bit block. */ 1340 search in the first part of our first-searched bit block. */
1341 for (x = 0; x <= length; x++) { 1341 for (x = 0; x <= length; x++) {
1342 bi = rgd->rd_bits + buf;
1343
1344 if (test_bit(GBF_FULL, &bi->bi_flags) &&
1345 (old_state == GFS2_BLKST_FREE))
1346 goto skip;
1347
1342 /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone 1348 /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
1343 bitmaps, so we must search the originals for that. */ 1349 bitmaps, so we must search the originals for that. */
1344 buffer = bi->bi_bh->b_data + bi->bi_offset; 1350 buffer = bi->bi_bh->b_data + bi->bi_offset;
@@ -1349,33 +1355,39 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
1349 if (blk != BFITNOENT) 1355 if (blk != BFITNOENT)
1350 break; 1356 break;
1351 1357
1358 if ((goal == 0) && (old_state == GFS2_BLKST_FREE))
1359 set_bit(GBF_FULL, &bi->bi_flags);
1360
1352 /* Try next bitmap block (wrap back to rgrp header if at end) */ 1361 /* Try next bitmap block (wrap back to rgrp header if at end) */
1353 buf = (buf + 1) % length; 1362skip:
1354 bi = rgd->rd_bits + buf; 1363 buf++;
1364 buf %= length;
1355 goal = 0; 1365 goal = 0;
1356 } 1366 }
1357 1367
1358 if (blk != BFITNOENT && old_state != new_state) { 1368 if (blk == BFITNOENT)
1359 *n = 1; 1369 return blk;
1360 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); 1370 *n = 1;
1371 if (old_state == new_state)
1372 goto out;
1373
1374 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1375 gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
1376 bi->bi_len, blk, new_state);
1377 goal = blk;
1378 while (*n < elen) {
1379 goal++;
1380 if (goal >= (bi->bi_len * GFS2_NBBY))
1381 break;
1382 if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
1383 GFS2_BLKST_FREE)
1384 break;
1361 gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, 1385 gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
1362 bi->bi_len, blk, new_state); 1386 bi->bi_len, goal, new_state);
1363 goal = blk; 1387 (*n)++;
1364 while (*n < elen) {
1365 goal++;
1366 if (goal >= (bi->bi_len * GFS2_NBBY))
1367 break;
1368 if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
1369 GFS2_BLKST_FREE)
1370 break;
1371 gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone,
1372 bi->bi_offset, bi->bi_len, goal,
1373 new_state);
1374 (*n)++;
1375 }
1376 } 1388 }
1377 1389out:
1378 return (blk == BFITNOENT) ? blk : (bi->bi_start * GFS2_NBBY) + blk; 1390 return (bi->bi_start * GFS2_NBBY) + blk;
1379} 1391}
1380 1392
1381/** 1393/**
@@ -1435,13 +1447,33 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
1435} 1447}
1436 1448
1437/** 1449/**
1438 * gfs2_alloc_block - Allocate a block 1450 * gfs2_rgrp_dump - print out an rgrp
1451 * @seq: The iterator
1452 * @gl: The glock in question
1453 *
1454 */
1455
1456int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
1457{
1458 const struct gfs2_rgrpd *rgd = gl->gl_object;
1459 if (rgd == NULL)
1460 return 0;
1461 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
1462 (unsigned long long)rgd->rd_addr, rgd->rd_flags,
1463 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
1464 return 0;
1465}
1466
1467/**
1468 * gfs2_alloc_block - Allocate one or more blocks
1439 * @ip: the inode to allocate the block for 1469 * @ip: the inode to allocate the block for
1470 * @bn: Used to return the starting block number
1471 * @n: requested number of blocks/extent length (value/result)
1440 * 1472 *
1441 * Returns: the allocated block 1473 * Returns: 0 or error
1442 */ 1474 */
1443 1475
1444u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) 1476int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
1445{ 1477{
1446 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1478 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1447 struct buffer_head *dibh; 1479 struct buffer_head *dibh;
@@ -1457,7 +1489,10 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
1457 goal = rgd->rd_last_alloc; 1489 goal = rgd->rd_last_alloc;
1458 1490
1459 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n); 1491 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n);
1460 BUG_ON(blk == BFITNOENT); 1492
1493 /* Since all blocks are reserved in advance, this shouldn't happen */
1494 if (blk == BFITNOENT)
1495 goto rgrp_error;
1461 1496
1462 rgd->rd_last_alloc = blk; 1497 rgd->rd_last_alloc = blk;
1463 block = rgd->rd_data0 + blk; 1498 block = rgd->rd_data0 + blk;
@@ -1469,7 +1504,9 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
1469 di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal); 1504 di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal);
1470 brelse(dibh); 1505 brelse(dibh);
1471 } 1506 }
1472 gfs2_assert_withdraw(sdp, rgd->rd_free >= *n); 1507 if (rgd->rd_free < *n)
1508 goto rgrp_error;
1509
1473 rgd->rd_free -= *n; 1510 rgd->rd_free -= *n;
1474 1511
1475 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1512 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
@@ -1483,8 +1520,17 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
1483 spin_lock(&sdp->sd_rindex_spin); 1520 spin_lock(&sdp->sd_rindex_spin);
1484 rgd->rd_free_clone -= *n; 1521 rgd->rd_free_clone -= *n;
1485 spin_unlock(&sdp->sd_rindex_spin); 1522 spin_unlock(&sdp->sd_rindex_spin);
1523 trace_gfs2_block_alloc(ip, block, *n, GFS2_BLKST_USED);
1524 *bn = block;
1525 return 0;
1486 1526
1487 return block; 1527rgrp_error:
1528 fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
1529 (unsigned long long)rgd->rd_addr);
1530 fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
1531 gfs2_rgrp_dump(NULL, rgd->rd_gl);
1532 rgd->rd_flags |= GFS2_RDF_ERROR;
1533 return -EIO;
1488} 1534}
1489 1535
1490/** 1536/**
@@ -1526,7 +1572,7 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1526 spin_lock(&sdp->sd_rindex_spin); 1572 spin_lock(&sdp->sd_rindex_spin);
1527 rgd->rd_free_clone--; 1573 rgd->rd_free_clone--;
1528 spin_unlock(&sdp->sd_rindex_spin); 1574 spin_unlock(&sdp->sd_rindex_spin);
1529 1575 trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE);
1530 return block; 1576 return block;
1531} 1577}
1532 1578
@@ -1546,7 +1592,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1546 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); 1592 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1547 if (!rgd) 1593 if (!rgd)
1548 return; 1594 return;
1549 1595 trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE);
1550 rgd->rd_free += blen; 1596 rgd->rd_free += blen;
1551 1597
1552 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1598 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
@@ -1574,7 +1620,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1574 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); 1620 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1575 if (!rgd) 1621 if (!rgd)
1576 return; 1622 return;
1577 1623 trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE);
1578 rgd->rd_free += blen; 1624 rgd->rd_free += blen;
1579 1625
1580 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1626 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
@@ -1597,6 +1643,7 @@ void gfs2_unlink_di(struct inode *inode)
1597 rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED); 1643 rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
1598 if (!rgd) 1644 if (!rgd)
1599 return; 1645 return;
1646 trace_gfs2_block_alloc(ip, blkno, 1, GFS2_BLKST_UNLINKED);
1600 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1647 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1601 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1648 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1602 gfs2_trans_add_rg(rgd); 1649 gfs2_trans_add_rg(rgd);
@@ -1628,6 +1675,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
1628void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) 1675void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
1629{ 1676{
1630 gfs2_free_uninit_di(rgd, ip->i_no_addr); 1677 gfs2_free_uninit_di(rgd, ip->i_no_addr);
1678 trace_gfs2_block_alloc(ip, ip->i_no_addr, 1, GFS2_BLKST_FREE);
1631 gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid); 1679 gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
1632 gfs2_meta_wipe(ip, ip->i_no_addr, 1); 1680 gfs2_meta_wipe(ip, ip->i_no_addr, 1);
1633} 1681}
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 3181c7e624bf..1e76ff0f3e00 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -14,22 +14,22 @@ struct gfs2_rgrpd;
14struct gfs2_sbd; 14struct gfs2_sbd;
15struct gfs2_holder; 15struct gfs2_holder;
16 16
17void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); 17extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
18 18
19struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); 19struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk);
20struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); 20struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
21struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); 21struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
22 22
23void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); 23extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
24int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); 24extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
25 25
26int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); 26extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
27void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); 27extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
28void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); 28extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
29 29
30void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); 30extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
31 31
32struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); 32extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
33static inline void gfs2_alloc_put(struct gfs2_inode *ip) 33static inline void gfs2_alloc_put(struct gfs2_inode *ip)
34{ 34{
35 BUG_ON(ip->i_alloc == NULL); 35 BUG_ON(ip->i_alloc == NULL);
@@ -37,22 +37,22 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
37 ip->i_alloc = NULL; 37 ip->i_alloc = NULL;
38} 38}
39 39
40int gfs2_inplace_reserve_i(struct gfs2_inode *ip, 40extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file,
41 char *file, unsigned int line); 41 unsigned int line);
42#define gfs2_inplace_reserve(ip) \ 42#define gfs2_inplace_reserve(ip) \
43gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) 43gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
44 44
45void gfs2_inplace_release(struct gfs2_inode *ip); 45extern void gfs2_inplace_release(struct gfs2_inode *ip);
46 46
47unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); 47extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
48 48
49u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n); 49extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
50u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); 50extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
51 51
52void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); 52extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
53void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); 53extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
54void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); 54extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
55void gfs2_unlink_di(struct inode *inode); 55extern void gfs2_unlink_di(struct inode *inode);
56 56
57struct gfs2_rgrp_list { 57struct gfs2_rgrp_list {
58 unsigned int rl_rgrps; 58 unsigned int rl_rgrps;
@@ -61,10 +61,11 @@ struct gfs2_rgrp_list {
61 struct gfs2_holder *rl_ghs; 61 struct gfs2_holder *rl_ghs;
62}; 62};
63 63
64void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, 64extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
65 u64 block); 65 u64 block);
66void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); 66extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
67void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); 67extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
68u64 gfs2_ri_total(struct gfs2_sbd *sdp); 68extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
69extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
69 70
70#endif /* __RGRP_DOT_H__ */ 71#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 601913e0a482..0a6801336470 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -7,14 +7,20 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/bio.h>
10#include <linux/sched.h> 11#include <linux/sched.h>
11#include <linux/slab.h> 12#include <linux/slab.h>
12#include <linux/spinlock.h> 13#include <linux/spinlock.h>
13#include <linux/completion.h> 14#include <linux/completion.h>
14#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
15#include <linux/crc32.h> 16#include <linux/statfs.h>
17#include <linux/seq_file.h>
18#include <linux/mount.h>
19#include <linux/kthread.h>
20#include <linux/delay.h>
16#include <linux/gfs2_ondisk.h> 21#include <linux/gfs2_ondisk.h>
17#include <linux/bio.h> 22#include <linux/crc32.h>
23#include <linux/time.h>
18 24
19#include "gfs2.h" 25#include "gfs2.h"
20#include "incore.h" 26#include "incore.h"
@@ -31,6 +37,183 @@
31#include "super.h" 37#include "super.h"
32#include "trans.h" 38#include "trans.h"
33#include "util.h" 39#include "util.h"
40#include "sys.h"
41#include "eattr.h"
42
43#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
44
45enum {
46 Opt_lockproto,
47 Opt_locktable,
48 Opt_hostdata,
49 Opt_spectator,
50 Opt_ignore_local_fs,
51 Opt_localflocks,
52 Opt_localcaching,
53 Opt_debug,
54 Opt_nodebug,
55 Opt_upgrade,
56 Opt_acl,
57 Opt_noacl,
58 Opt_quota_off,
59 Opt_quota_account,
60 Opt_quota_on,
61 Opt_quota,
62 Opt_noquota,
63 Opt_suiddir,
64 Opt_nosuiddir,
65 Opt_data_writeback,
66 Opt_data_ordered,
67 Opt_meta,
68 Opt_discard,
69 Opt_nodiscard,
70 Opt_commit,
71 Opt_error,
72};
73
74static const match_table_t tokens = {
75 {Opt_lockproto, "lockproto=%s"},
76 {Opt_locktable, "locktable=%s"},
77 {Opt_hostdata, "hostdata=%s"},
78 {Opt_spectator, "spectator"},
79 {Opt_ignore_local_fs, "ignore_local_fs"},
80 {Opt_localflocks, "localflocks"},
81 {Opt_localcaching, "localcaching"},
82 {Opt_debug, "debug"},
83 {Opt_nodebug, "nodebug"},
84 {Opt_upgrade, "upgrade"},
85 {Opt_acl, "acl"},
86 {Opt_noacl, "noacl"},
87 {Opt_quota_off, "quota=off"},
88 {Opt_quota_account, "quota=account"},
89 {Opt_quota_on, "quota=on"},
90 {Opt_quota, "quota"},
91 {Opt_noquota, "noquota"},
92 {Opt_suiddir, "suiddir"},
93 {Opt_nosuiddir, "nosuiddir"},
94 {Opt_data_writeback, "data=writeback"},
95 {Opt_data_ordered, "data=ordered"},
96 {Opt_meta, "meta"},
97 {Opt_discard, "discard"},
98 {Opt_nodiscard, "nodiscard"},
99 {Opt_commit, "commit=%d"},
100 {Opt_error, NULL}
101};
102
103/**
104 * gfs2_mount_args - Parse mount options
105 * @sdp:
106 * @data:
107 *
108 * Return: errno
109 */
110
111int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
112{
113 char *o;
114 int token;
115 substring_t tmp[MAX_OPT_ARGS];
116 int rv;
117
118 /* Split the options into tokens with the "," character and
119 process them */
120
121 while (1) {
122 o = strsep(&options, ",");
123 if (o == NULL)
124 break;
125 if (*o == '\0')
126 continue;
127
128 token = match_token(o, tokens, tmp);
129 switch (token) {
130 case Opt_lockproto:
131 match_strlcpy(args->ar_lockproto, &tmp[0],
132 GFS2_LOCKNAME_LEN);
133 break;
134 case Opt_locktable:
135 match_strlcpy(args->ar_locktable, &tmp[0],
136 GFS2_LOCKNAME_LEN);
137 break;
138 case Opt_hostdata:
139 match_strlcpy(args->ar_hostdata, &tmp[0],
140 GFS2_LOCKNAME_LEN);
141 break;
142 case Opt_spectator:
143 args->ar_spectator = 1;
144 break;
145 case Opt_ignore_local_fs:
146 args->ar_ignore_local_fs = 1;
147 break;
148 case Opt_localflocks:
149 args->ar_localflocks = 1;
150 break;
151 case Opt_localcaching:
152 args->ar_localcaching = 1;
153 break;
154 case Opt_debug:
155 args->ar_debug = 1;
156 break;
157 case Opt_nodebug:
158 args->ar_debug = 0;
159 break;
160 case Opt_upgrade:
161 args->ar_upgrade = 1;
162 break;
163 case Opt_acl:
164 args->ar_posix_acl = 1;
165 break;
166 case Opt_noacl:
167 args->ar_posix_acl = 0;
168 break;
169 case Opt_quota_off:
170 case Opt_noquota:
171 args->ar_quota = GFS2_QUOTA_OFF;
172 break;
173 case Opt_quota_account:
174 args->ar_quota = GFS2_QUOTA_ACCOUNT;
175 break;
176 case Opt_quota_on:
177 case Opt_quota:
178 args->ar_quota = GFS2_QUOTA_ON;
179 break;
180 case Opt_suiddir:
181 args->ar_suiddir = 1;
182 break;
183 case Opt_nosuiddir:
184 args->ar_suiddir = 0;
185 break;
186 case Opt_data_writeback:
187 args->ar_data = GFS2_DATA_WRITEBACK;
188 break;
189 case Opt_data_ordered:
190 args->ar_data = GFS2_DATA_ORDERED;
191 break;
192 case Opt_meta:
193 args->ar_meta = 1;
194 break;
195 case Opt_discard:
196 args->ar_discard = 1;
197 break;
198 case Opt_nodiscard:
199 args->ar_discard = 0;
200 break;
201 case Opt_commit:
202 rv = match_int(&tmp[0], &args->ar_commit);
203 if (rv || args->ar_commit <= 0) {
204 fs_info(sdp, "commit mount option requires a positive numeric argument\n");
205 return rv ? rv : -EINVAL;
206 }
207 break;
208 case Opt_error:
209 default:
210 fs_info(sdp, "invalid mount option: %s\n", o);
211 return -EINVAL;
212 }
213 }
214
215 return 0;
216}
34 217
35/** 218/**
36 * gfs2_jindex_free - Clear all the journal index information 219 * gfs2_jindex_free - Clear all the journal index information
@@ -436,3 +619,706 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
436 mutex_unlock(&sdp->sd_freeze_lock); 619 mutex_unlock(&sdp->sd_freeze_lock);
437} 620}
438 621
622
623/**
624 * gfs2_write_inode - Make sure the inode is stable on the disk
625 * @inode: The inode
626 * @sync: synchronous write flag
627 *
628 * Returns: errno
629 */
630
631static int gfs2_write_inode(struct inode *inode, int sync)
632{
633 struct gfs2_inode *ip = GFS2_I(inode);
634 struct gfs2_sbd *sdp = GFS2_SB(inode);
635 struct gfs2_holder gh;
636 struct buffer_head *bh;
637 struct timespec atime;
638 struct gfs2_dinode *di;
639 int ret = 0;
640
641 /* Check this is a "normal" inode, etc */
642 if (!test_bit(GIF_USER, &ip->i_flags) ||
643 (current->flags & PF_MEMALLOC))
644 return 0;
645 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
646 if (ret)
647 goto do_flush;
648 ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
649 if (ret)
650 goto do_unlock;
651 ret = gfs2_meta_inode_buffer(ip, &bh);
652 if (ret == 0) {
653 di = (struct gfs2_dinode *)bh->b_data;
654 atime.tv_sec = be64_to_cpu(di->di_atime);
655 atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
656 if (timespec_compare(&inode->i_atime, &atime) > 0) {
657 gfs2_trans_add_bh(ip->i_gl, bh, 1);
658 gfs2_dinode_out(ip, bh->b_data);
659 }
660 brelse(bh);
661 }
662 gfs2_trans_end(sdp);
663do_unlock:
664 gfs2_glock_dq_uninit(&gh);
665do_flush:
666 if (sync != 0)
667 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
668 return ret;
669}
670
671/**
672 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
673 * @sdp: the filesystem
674 *
675 * Returns: errno
676 */
677
678static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
679{
680 struct gfs2_holder t_gh;
681 int error;
682
683 gfs2_quota_sync(sdp);
684 gfs2_statfs_sync(sdp);
685
686 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
687 &t_gh);
688 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
689 return error;
690
691 gfs2_meta_syncfs(sdp);
692 gfs2_log_shutdown(sdp);
693
694 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
695
696 if (t_gh.gh_gl)
697 gfs2_glock_dq_uninit(&t_gh);
698
699 gfs2_quota_cleanup(sdp);
700
701 return error;
702}
703
704static int gfs2_umount_recovery_wait(void *word)
705{
706 schedule();
707 return 0;
708}
709
710/**
711 * gfs2_put_super - Unmount the filesystem
712 * @sb: The VFS superblock
713 *
714 */
715
716static void gfs2_put_super(struct super_block *sb)
717{
718 struct gfs2_sbd *sdp = sb->s_fs_info;
719 int error;
720 struct gfs2_jdesc *jd;
721
722 /* Unfreeze the filesystem, if we need to */
723
724 mutex_lock(&sdp->sd_freeze_lock);
725 if (sdp->sd_freeze_count)
726 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
727 mutex_unlock(&sdp->sd_freeze_lock);
728
729 /* No more recovery requests */
730 set_bit(SDF_NORECOVERY, &sdp->sd_flags);
731 smp_mb();
732
733 /* Wait on outstanding recovery */
734restart:
735 spin_lock(&sdp->sd_jindex_spin);
736 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
737 if (!test_bit(JDF_RECOVERY, &jd->jd_flags))
738 continue;
739 spin_unlock(&sdp->sd_jindex_spin);
740 wait_on_bit(&jd->jd_flags, JDF_RECOVERY,
741 gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE);
742 goto restart;
743 }
744 spin_unlock(&sdp->sd_jindex_spin);
745
746 kthread_stop(sdp->sd_quotad_process);
747 kthread_stop(sdp->sd_logd_process);
748
749 if (!(sb->s_flags & MS_RDONLY)) {
750 error = gfs2_make_fs_ro(sdp);
751 if (error)
752 gfs2_io_error(sdp);
753 }
754 /* At this point, we're through modifying the disk */
755
756 /* Release stuff */
757
758 iput(sdp->sd_jindex);
759 iput(sdp->sd_inum_inode);
760 iput(sdp->sd_statfs_inode);
761 iput(sdp->sd_rindex);
762 iput(sdp->sd_quota_inode);
763
764 gfs2_glock_put(sdp->sd_rename_gl);
765 gfs2_glock_put(sdp->sd_trans_gl);
766
767 if (!sdp->sd_args.ar_spectator) {
768 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
769 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
770 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
771 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
772 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
773 iput(sdp->sd_ir_inode);
774 iput(sdp->sd_sc_inode);
775 iput(sdp->sd_qc_inode);
776 }
777
778 gfs2_glock_dq_uninit(&sdp->sd_live_gh);
779 gfs2_clear_rgrpd(sdp);
780 gfs2_jindex_free(sdp);
781 /* Take apart glock structures and buffer lists */
782 gfs2_gl_hash_clear(sdp);
783 /* Unmount the locking protocol */
784 gfs2_lm_unmount(sdp);
785
786 /* At this point, we're through participating in the lockspace */
787 gfs2_sys_fs_del(sdp);
788}
789
790/**
791 * gfs2_sync_fs - sync the filesystem
792 * @sb: the superblock
793 *
794 * Flushes the log to disk.
795 */
796
797static int gfs2_sync_fs(struct super_block *sb, int wait)
798{
799 if (wait && sb->s_fs_info)
800 gfs2_log_flush(sb->s_fs_info, NULL);
801 return 0;
802}
803
804/**
805 * gfs2_freeze - prevent further writes to the filesystem
806 * @sb: the VFS structure for the filesystem
807 *
808 */
809
810static int gfs2_freeze(struct super_block *sb)
811{
812 struct gfs2_sbd *sdp = sb->s_fs_info;
813 int error;
814
815 if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
816 return -EINVAL;
817
818 for (;;) {
819 error = gfs2_freeze_fs(sdp);
820 if (!error)
821 break;
822
823 switch (error) {
824 case -EBUSY:
825 fs_err(sdp, "waiting for recovery before freeze\n");
826 break;
827
828 default:
829 fs_err(sdp, "error freezing FS: %d\n", error);
830 break;
831 }
832
833 fs_err(sdp, "retrying...\n");
834 msleep(1000);
835 }
836 return 0;
837}
838
839/**
840 * gfs2_unfreeze - reallow writes to the filesystem
841 * @sb: the VFS structure for the filesystem
842 *
843 */
844
845static int gfs2_unfreeze(struct super_block *sb)
846{
847 gfs2_unfreeze_fs(sb->s_fs_info);
848 return 0;
849}
850
851/**
852 * statfs_fill - fill in the sg for a given RG
853 * @rgd: the RG
854 * @sc: the sc structure
855 *
856 * Returns: 0 on success, -ESTALE if the LVB is invalid
857 */
858
859static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
860 struct gfs2_statfs_change_host *sc)
861{
862 gfs2_rgrp_verify(rgd);
863 sc->sc_total += rgd->rd_data;
864 sc->sc_free += rgd->rd_free;
865 sc->sc_dinodes += rgd->rd_dinodes;
866 return 0;
867}
868
869/**
870 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
871 * @sdp: the filesystem
872 * @sc: the sc info that will be returned
873 *
874 * Any error (other than a signal) will cause this routine to fall back
875 * to the synchronous version.
876 *
877 * FIXME: This really shouldn't busy wait like this.
878 *
879 * Returns: errno
880 */
881
882static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
883{
884 struct gfs2_holder ri_gh;
885 struct gfs2_rgrpd *rgd_next;
886 struct gfs2_holder *gha, *gh;
887 unsigned int slots = 64;
888 unsigned int x;
889 int done;
890 int error = 0, err;
891
892 memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
893 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
894 if (!gha)
895 return -ENOMEM;
896
897 error = gfs2_rindex_hold(sdp, &ri_gh);
898 if (error)
899 goto out;
900
901 rgd_next = gfs2_rgrpd_get_first(sdp);
902
903 for (;;) {
904 done = 1;
905
906 for (x = 0; x < slots; x++) {
907 gh = gha + x;
908
909 if (gh->gh_gl && gfs2_glock_poll(gh)) {
910 err = gfs2_glock_wait(gh);
911 if (err) {
912 gfs2_holder_uninit(gh);
913 error = err;
914 } else {
915 if (!error)
916 error = statfs_slow_fill(
917 gh->gh_gl->gl_object, sc);
918 gfs2_glock_dq_uninit(gh);
919 }
920 }
921
922 if (gh->gh_gl)
923 done = 0;
924 else if (rgd_next && !error) {
925 error = gfs2_glock_nq_init(rgd_next->rd_gl,
926 LM_ST_SHARED,
927 GL_ASYNC,
928 gh);
929 rgd_next = gfs2_rgrpd_get_next(rgd_next);
930 done = 0;
931 }
932
933 if (signal_pending(current))
934 error = -ERESTARTSYS;
935 }
936
937 if (done)
938 break;
939
940 yield();
941 }
942
943 gfs2_glock_dq_uninit(&ri_gh);
944
945out:
946 kfree(gha);
947 return error;
948}
949
950/**
951 * gfs2_statfs_i - Do a statfs
952 * @sdp: the filesystem
953 * @sg: the sg structure
954 *
955 * Returns: errno
956 */
957
958static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
959{
960 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
961 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
962
963 spin_lock(&sdp->sd_statfs_spin);
964
965 *sc = *m_sc;
966 sc->sc_total += l_sc->sc_total;
967 sc->sc_free += l_sc->sc_free;
968 sc->sc_dinodes += l_sc->sc_dinodes;
969
970 spin_unlock(&sdp->sd_statfs_spin);
971
972 if (sc->sc_free < 0)
973 sc->sc_free = 0;
974 if (sc->sc_free > sc->sc_total)
975 sc->sc_free = sc->sc_total;
976 if (sc->sc_dinodes < 0)
977 sc->sc_dinodes = 0;
978
979 return 0;
980}
981
982/**
983 * gfs2_statfs - Gather and return stats about the filesystem
984 * @sb: The superblock
985 * @statfsbuf: The buffer
986 *
987 * Returns: 0 on success or error code
988 */
989
990static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
991{
992 struct super_block *sb = dentry->d_inode->i_sb;
993 struct gfs2_sbd *sdp = sb->s_fs_info;
994 struct gfs2_statfs_change_host sc;
995 int error;
996
997 if (gfs2_tune_get(sdp, gt_statfs_slow))
998 error = gfs2_statfs_slow(sdp, &sc);
999 else
1000 error = gfs2_statfs_i(sdp, &sc);
1001
1002 if (error)
1003 return error;
1004
1005 buf->f_type = GFS2_MAGIC;
1006 buf->f_bsize = sdp->sd_sb.sb_bsize;
1007 buf->f_blocks = sc.sc_total;
1008 buf->f_bfree = sc.sc_free;
1009 buf->f_bavail = sc.sc_free;
1010 buf->f_files = sc.sc_dinodes + sc.sc_free;
1011 buf->f_ffree = sc.sc_free;
1012 buf->f_namelen = GFS2_FNAMESIZE;
1013
1014 return 0;
1015}
1016
1017/**
1018 * gfs2_remount_fs - called when the FS is remounted
1019 * @sb: the filesystem
1020 * @flags: the remount flags
1021 * @data: extra data passed in (not used right now)
1022 *
1023 * Returns: errno
1024 */
1025
1026static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1027{
1028 struct gfs2_sbd *sdp = sb->s_fs_info;
1029 struct gfs2_args args = sdp->sd_args; /* Default to current settings */
1030 struct gfs2_tune *gt = &sdp->sd_tune;
1031 int error;
1032
1033 spin_lock(&gt->gt_spin);
1034 args.ar_commit = gt->gt_log_flush_secs;
1035 spin_unlock(&gt->gt_spin);
1036 error = gfs2_mount_args(sdp, &args, data);
1037 if (error)
1038 return error;
1039
1040 /* Not allowed to change locking details */
1041 if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) ||
1042 strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) ||
1043 strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata))
1044 return -EINVAL;
1045
1046 /* Some flags must not be changed */
1047 if (args_neq(&args, &sdp->sd_args, spectator) ||
1048 args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
1049 args_neq(&args, &sdp->sd_args, localflocks) ||
1050 args_neq(&args, &sdp->sd_args, localcaching) ||
1051 args_neq(&args, &sdp->sd_args, meta))
1052 return -EINVAL;
1053
1054 if (sdp->sd_args.ar_spectator)
1055 *flags |= MS_RDONLY;
1056
1057 if ((sb->s_flags ^ *flags) & MS_RDONLY) {
1058 if (*flags & MS_RDONLY)
1059 error = gfs2_make_fs_ro(sdp);
1060 else
1061 error = gfs2_make_fs_rw(sdp);
1062 if (error)
1063 return error;
1064 }
1065
1066 sdp->sd_args = args;
1067 if (sdp->sd_args.ar_posix_acl)
1068 sb->s_flags |= MS_POSIXACL;
1069 else
1070 sb->s_flags &= ~MS_POSIXACL;
1071 spin_lock(&gt->gt_spin);
1072 gt->gt_log_flush_secs = args.ar_commit;
1073 spin_unlock(&gt->gt_spin);
1074
1075 return 0;
1076}
1077
1078/**
1079 * gfs2_drop_inode - Drop an inode (test for remote unlink)
1080 * @inode: The inode to drop
1081 *
1082 * If we've received a callback on an iopen lock then its because a
1083 * remote node tried to deallocate the inode but failed due to this node
1084 * still having the inode open. Here we mark the link count zero
1085 * since we know that it must have reached zero if the GLF_DEMOTE flag
1086 * is set on the iopen glock. If we didn't do a disk read since the
1087 * remote node removed the final link then we might otherwise miss
1088 * this event. This check ensures that this node will deallocate the
1089 * inode's blocks, or alternatively pass the baton on to another
1090 * node for later deallocation.
1091 */
1092
1093static void gfs2_drop_inode(struct inode *inode)
1094{
1095 struct gfs2_inode *ip = GFS2_I(inode);
1096
1097 if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) {
1098 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
1099 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
1100 clear_nlink(inode);
1101 }
1102 generic_drop_inode(inode);
1103}
1104
1105/**
1106 * gfs2_clear_inode - Deallocate an inode when VFS is done with it
1107 * @inode: The VFS inode
1108 *
1109 */
1110
1111static void gfs2_clear_inode(struct inode *inode)
1112{
1113 struct gfs2_inode *ip = GFS2_I(inode);
1114
1115 /* This tells us its a "real" inode and not one which only
1116 * serves to contain an address space (see rgrp.c, meta_io.c)
1117 * which therefore doesn't have its own glocks.
1118 */
1119 if (test_bit(GIF_USER, &ip->i_flags)) {
1120 ip->i_gl->gl_object = NULL;
1121 gfs2_glock_put(ip->i_gl);
1122 ip->i_gl = NULL;
1123 if (ip->i_iopen_gh.gh_gl) {
1124 ip->i_iopen_gh.gh_gl->gl_object = NULL;
1125 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
1126 }
1127 }
1128}
1129
1130static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
1131{
1132 do {
1133 if (d1 == d2)
1134 return 1;
1135 d1 = d1->d_parent;
1136 } while (!IS_ROOT(d1));
1137 return 0;
1138}
1139
1140/**
1141 * gfs2_show_options - Show mount options for /proc/mounts
1142 * @s: seq_file structure
1143 * @mnt: vfsmount
1144 *
1145 * Returns: 0 on success or error code
1146 */
1147
1148static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1149{
1150 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
1151 struct gfs2_args *args = &sdp->sd_args;
1152 int lfsecs;
1153
1154 if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
1155 seq_printf(s, ",meta");
1156 if (args->ar_lockproto[0])
1157 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
1158 if (args->ar_locktable[0])
1159 seq_printf(s, ",locktable=%s", args->ar_locktable);
1160 if (args->ar_hostdata[0])
1161 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
1162 if (args->ar_spectator)
1163 seq_printf(s, ",spectator");
1164 if (args->ar_ignore_local_fs)
1165 seq_printf(s, ",ignore_local_fs");
1166 if (args->ar_localflocks)
1167 seq_printf(s, ",localflocks");
1168 if (args->ar_localcaching)
1169 seq_printf(s, ",localcaching");
1170 if (args->ar_debug)
1171 seq_printf(s, ",debug");
1172 if (args->ar_upgrade)
1173 seq_printf(s, ",upgrade");
1174 if (args->ar_posix_acl)
1175 seq_printf(s, ",acl");
1176 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
1177 char *state;
1178 switch (args->ar_quota) {
1179 case GFS2_QUOTA_OFF:
1180 state = "off";
1181 break;
1182 case GFS2_QUOTA_ACCOUNT:
1183 state = "account";
1184 break;
1185 case GFS2_QUOTA_ON:
1186 state = "on";
1187 break;
1188 default:
1189 state = "unknown";
1190 break;
1191 }
1192 seq_printf(s, ",quota=%s", state);
1193 }
1194 if (args->ar_suiddir)
1195 seq_printf(s, ",suiddir");
1196 if (args->ar_data != GFS2_DATA_DEFAULT) {
1197 char *state;
1198 switch (args->ar_data) {
1199 case GFS2_DATA_WRITEBACK:
1200 state = "writeback";
1201 break;
1202 case GFS2_DATA_ORDERED:
1203 state = "ordered";
1204 break;
1205 default:
1206 state = "unknown";
1207 break;
1208 }
1209 seq_printf(s, ",data=%s", state);
1210 }
1211 if (args->ar_discard)
1212 seq_printf(s, ",discard");
1213 lfsecs = sdp->sd_tune.gt_log_flush_secs;
1214 if (lfsecs != 60)
1215 seq_printf(s, ",commit=%d", lfsecs);
1216 return 0;
1217}
1218
1219/*
1220 * We have to (at the moment) hold the inodes main lock to cover
1221 * the gap between unlocking the shared lock on the iopen lock and
1222 * taking the exclusive lock. I'd rather do a shared -> exclusive
1223 * conversion on the iopen lock, but we can change that later. This
1224 * is safe, just less efficient.
1225 */
1226
1227static void gfs2_delete_inode(struct inode *inode)
1228{
1229 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
1230 struct gfs2_inode *ip = GFS2_I(inode);
1231 struct gfs2_holder gh;
1232 int error;
1233
1234 if (!test_bit(GIF_USER, &ip->i_flags))
1235 goto out;
1236
1237 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1238 if (unlikely(error)) {
1239 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
1240 goto out;
1241 }
1242
1243 gfs2_glock_dq_wait(&ip->i_iopen_gh);
1244 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
1245 error = gfs2_glock_nq(&ip->i_iopen_gh);
1246 if (error)
1247 goto out_truncate;
1248
1249 if (S_ISDIR(inode->i_mode) &&
1250 (ip->i_diskflags & GFS2_DIF_EXHASH)) {
1251 error = gfs2_dir_exhash_dealloc(ip);
1252 if (error)
1253 goto out_unlock;
1254 }
1255
1256 if (ip->i_eattr) {
1257 error = gfs2_ea_dealloc(ip);
1258 if (error)
1259 goto out_unlock;
1260 }
1261
1262 if (!gfs2_is_stuffed(ip)) {
1263 error = gfs2_file_dealloc(ip);
1264 if (error)
1265 goto out_unlock;
1266 }
1267
1268 error = gfs2_dinode_dealloc(ip);
1269 if (error)
1270 goto out_unlock;
1271
1272out_truncate:
1273 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
1274 if (error)
1275 goto out_unlock;
1276 /* Needs to be done before glock release & also in a transaction */
1277 truncate_inode_pages(&inode->i_data, 0);
1278 gfs2_trans_end(sdp);
1279
1280out_unlock:
1281 if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
1282 gfs2_glock_dq(&ip->i_iopen_gh);
1283 gfs2_holder_uninit(&ip->i_iopen_gh);
1284 gfs2_glock_dq_uninit(&gh);
1285 if (error && error != GLR_TRYFAILED && error != -EROFS)
1286 fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
1287out:
1288 truncate_inode_pages(&inode->i_data, 0);
1289 clear_inode(inode);
1290}
1291
1292static struct inode *gfs2_alloc_inode(struct super_block *sb)
1293{
1294 struct gfs2_inode *ip;
1295
1296 ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
1297 if (ip) {
1298 ip->i_flags = 0;
1299 ip->i_gl = NULL;
1300 }
1301 return &ip->i_inode;
1302}
1303
1304static void gfs2_destroy_inode(struct inode *inode)
1305{
1306 kmem_cache_free(gfs2_inode_cachep, inode);
1307}
1308
1309const struct super_operations gfs2_super_ops = {
1310 .alloc_inode = gfs2_alloc_inode,
1311 .destroy_inode = gfs2_destroy_inode,
1312 .write_inode = gfs2_write_inode,
1313 .delete_inode = gfs2_delete_inode,
1314 .put_super = gfs2_put_super,
1315 .sync_fs = gfs2_sync_fs,
1316 .freeze_fs = gfs2_freeze,
1317 .unfreeze_fs = gfs2_unfreeze,
1318 .statfs = gfs2_statfs,
1319 .remount_fs = gfs2_remount_fs,
1320 .clear_inode = gfs2_clear_inode,
1321 .drop_inode = gfs2_drop_inode,
1322 .show_options = gfs2_show_options,
1323};
1324
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 7655f5025fec..23419dc3027b 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -26,6 +26,36 @@
26#include "util.h" 26#include "util.h"
27#include "glops.h" 27#include "glops.h"
28 28
29struct gfs2_attr {
30 struct attribute attr;
31 ssize_t (*show)(struct gfs2_sbd *, char *);
32 ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
33};
34
35static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
36 char *buf)
37{
38 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
39 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
40 return a->show ? a->show(sdp, buf) : 0;
41}
42
43static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
44 const char *buf, size_t len)
45{
46 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
47 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
48 return a->store ? a->store(sdp, buf, len) : len;
49}
50
51static struct sysfs_ops gfs2_attr_ops = {
52 .show = gfs2_attr_show,
53 .store = gfs2_attr_store,
54};
55
56
57static struct kset *gfs2_kset;
58
29static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) 59static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
30{ 60{
31 return snprintf(buf, PAGE_SIZE, "%u:%u\n", 61 return snprintf(buf, PAGE_SIZE, "%u:%u\n",
@@ -212,11 +242,6 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
212 return len; 242 return len;
213} 243}
214 244
215struct gfs2_attr {
216 struct attribute attr;
217 ssize_t (*show)(struct gfs2_sbd *, char *);
218 ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
219};
220 245
221#define GFS2_ATTR(name, mode, show, store) \ 246#define GFS2_ATTR(name, mode, show, store) \
222static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store) 247static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
@@ -246,58 +271,11 @@ static struct attribute *gfs2_attrs[] = {
246 NULL, 271 NULL,
247}; 272};
248 273
249static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
250 char *buf)
251{
252 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
253 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
254 return a->show ? a->show(sdp, buf) : 0;
255}
256
257static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
258 const char *buf, size_t len)
259{
260 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
261 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
262 return a->store ? a->store(sdp, buf, len) : len;
263}
264
265static struct sysfs_ops gfs2_attr_ops = {
266 .show = gfs2_attr_show,
267 .store = gfs2_attr_store,
268};
269
270static struct kobj_type gfs2_ktype = { 274static struct kobj_type gfs2_ktype = {
271 .default_attrs = gfs2_attrs, 275 .default_attrs = gfs2_attrs,
272 .sysfs_ops = &gfs2_attr_ops, 276 .sysfs_ops = &gfs2_attr_ops,
273}; 277};
274 278
275static struct kset *gfs2_kset;
276
277/*
278 * display struct lm_lockstruct fields
279 */
280
281struct lockstruct_attr {
282 struct attribute attr;
283 ssize_t (*show)(struct gfs2_sbd *, char *);
284};
285
286#define LOCKSTRUCT_ATTR(name, fmt) \
287static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
288{ \
289 return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \
290} \
291static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
292
293LOCKSTRUCT_ATTR(jid, "%u\n");
294LOCKSTRUCT_ATTR(first, "%u\n");
295
296static struct attribute *lockstruct_attrs[] = {
297 &lockstruct_attr_jid.attr,
298 &lockstruct_attr_first.attr,
299 NULL,
300};
301 279
302/* 280/*
303 * lock_module. Originally from lock_dlm 281 * lock_module. Originally from lock_dlm
@@ -359,34 +337,33 @@ static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
359 return sprintf(buf, "%d\n", ls->ls_first_done); 337 return sprintf(buf, "%d\n", ls->ls_first_done);
360} 338}
361 339
362static ssize_t recover_show(struct gfs2_sbd *sdp, char *buf) 340static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
363{
364 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
365 return sprintf(buf, "%d\n", ls->ls_recover_jid);
366}
367
368static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
369{ 341{
342 unsigned jid;
370 struct gfs2_jdesc *jd; 343 struct gfs2_jdesc *jd;
344 int rv;
345
346 rv = sscanf(buf, "%u", &jid);
347 if (rv != 1)
348 return -EINVAL;
371 349
350 rv = -ESHUTDOWN;
372 spin_lock(&sdp->sd_jindex_spin); 351 spin_lock(&sdp->sd_jindex_spin);
352 if (test_bit(SDF_NORECOVERY, &sdp->sd_flags))
353 goto out;
354 rv = -EBUSY;
355 if (sdp->sd_jdesc->jd_jid == jid)
356 goto out;
357 rv = -ENOENT;
373 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { 358 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
374 if (jd->jd_jid != jid) 359 if (jd->jd_jid != jid)
375 continue; 360 continue;
376 jd->jd_dirty = 1; 361 rv = slow_work_enqueue(&jd->jd_work);
377 break; 362 break;
378 } 363 }
364out:
379 spin_unlock(&sdp->sd_jindex_spin); 365 spin_unlock(&sdp->sd_jindex_spin);
380} 366 return rv ? rv : len;
381
382static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
383{
384 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
385 ls->ls_recover_jid = simple_strtol(buf, NULL, 0);
386 gfs2_jdesc_make_dirty(sdp, ls->ls_recover_jid);
387 if (sdp->sd_recoverd_process)
388 wake_up_process(sdp->sd_recoverd_process);
389 return len;
390} 367}
391 368
392static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf) 369static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf)
@@ -401,31 +378,31 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
401 return sprintf(buf, "%d\n", ls->ls_recover_jid_status); 378 return sprintf(buf, "%d\n", ls->ls_recover_jid_status);
402} 379}
403 380
404struct gdlm_attr { 381static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
405 struct attribute attr; 382{
406 ssize_t (*show)(struct gfs2_sbd *sdp, char *); 383 return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid);
407 ssize_t (*store)(struct gfs2_sbd *sdp, const char *, size_t); 384}
408};
409 385
410#define GDLM_ATTR(_name,_mode,_show,_store) \ 386#define GDLM_ATTR(_name,_mode,_show,_store) \
411static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) 387static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
412 388
413GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); 389GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
414GDLM_ATTR(block, 0644, block_show, block_store); 390GDLM_ATTR(block, 0644, block_show, block_store);
415GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); 391GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
416GDLM_ATTR(id, 0444, lkid_show, NULL); 392GDLM_ATTR(id, 0444, lkid_show, NULL);
417GDLM_ATTR(first, 0444, lkfirst_show, NULL); 393GDLM_ATTR(jid, 0444, jid_show, NULL);
418GDLM_ATTR(first_done, 0444, first_done_show, NULL); 394GDLM_ATTR(first, 0444, lkfirst_show, NULL);
419GDLM_ATTR(recover, 0644, recover_show, recover_store); 395GDLM_ATTR(first_done, 0444, first_done_show, NULL);
420GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); 396GDLM_ATTR(recover, 0200, NULL, recover_store);
421GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); 397GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
398GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
422 399
423static struct attribute *lock_module_attrs[] = { 400static struct attribute *lock_module_attrs[] = {
424 &gdlm_attr_proto_name.attr, 401 &gdlm_attr_proto_name.attr,
425 &gdlm_attr_block.attr, 402 &gdlm_attr_block.attr,
426 &gdlm_attr_withdraw.attr, 403 &gdlm_attr_withdraw.attr,
427 &gdlm_attr_id.attr, 404 &gdlm_attr_id.attr,
428 &lockstruct_attr_jid.attr, 405 &gdlm_attr_jid.attr,
429 &gdlm_attr_first.attr, 406 &gdlm_attr_first.attr,
430 &gdlm_attr_first_done.attr, 407 &gdlm_attr_first_done.attr,
431 &gdlm_attr_recover.attr, 408 &gdlm_attr_recover.attr,
@@ -435,53 +412,6 @@ static struct attribute *lock_module_attrs[] = {
435}; 412};
436 413
437/* 414/*
438 * display struct gfs2_args fields
439 */
440
441struct args_attr {
442 struct attribute attr;
443 ssize_t (*show)(struct gfs2_sbd *, char *);
444};
445
446#define ARGS_ATTR(name, fmt) \
447static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
448{ \
449 return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name); \
450} \
451static struct args_attr args_attr_##name = __ATTR_RO(name)
452
453ARGS_ATTR(lockproto, "%s\n");
454ARGS_ATTR(locktable, "%s\n");
455ARGS_ATTR(hostdata, "%s\n");
456ARGS_ATTR(spectator, "%d\n");
457ARGS_ATTR(ignore_local_fs, "%d\n");
458ARGS_ATTR(localcaching, "%d\n");
459ARGS_ATTR(localflocks, "%d\n");
460ARGS_ATTR(debug, "%d\n");
461ARGS_ATTR(upgrade, "%d\n");
462ARGS_ATTR(posix_acl, "%d\n");
463ARGS_ATTR(quota, "%u\n");
464ARGS_ATTR(suiddir, "%d\n");
465ARGS_ATTR(data, "%d\n");
466
467static struct attribute *args_attrs[] = {
468 &args_attr_lockproto.attr,
469 &args_attr_locktable.attr,
470 &args_attr_hostdata.attr,
471 &args_attr_spectator.attr,
472 &args_attr_ignore_local_fs.attr,
473 &args_attr_localcaching.attr,
474 &args_attr_localflocks.attr,
475 &args_attr_debug.attr,
476 &args_attr_upgrade.attr,
477 &args_attr_posix_acl.attr,
478 &args_attr_quota.attr,
479 &args_attr_suiddir.attr,
480 &args_attr_data.attr,
481 NULL,
482};
483
484/*
485 * get and set struct gfs2_tune fields 415 * get and set struct gfs2_tune fields
486 */ 416 */
487 417
@@ -531,14 +461,8 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
531 return len; 461 return len;
532} 462}
533 463
534struct tune_attr {
535 struct attribute attr;
536 ssize_t (*show)(struct gfs2_sbd *, char *);
537 ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
538};
539
540#define TUNE_ATTR_3(name, show, store) \ 464#define TUNE_ATTR_3(name, show, store) \
541static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store) 465static struct gfs2_attr tune_attr_##name = __ATTR(name, 0644, show, store)
542 466
543#define TUNE_ATTR_2(name, store) \ 467#define TUNE_ATTR_2(name, store) \
544static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ 468static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
@@ -554,15 +478,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
554} \ 478} \
555TUNE_ATTR_2(name, name##_store) 479TUNE_ATTR_2(name, name##_store)
556 480
557#define TUNE_ATTR_DAEMON(name, process) \
558static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
559{ \
560 ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len); \
561 wake_up_process(sdp->sd_##process); \
562 return r; \
563} \
564TUNE_ATTR_2(name, name##_store)
565
566TUNE_ATTR(incore_log_blocks, 0); 481TUNE_ATTR(incore_log_blocks, 0);
567TUNE_ATTR(log_flush_secs, 0); 482TUNE_ATTR(log_flush_secs, 0);
568TUNE_ATTR(quota_warn_period, 0); 483TUNE_ATTR(quota_warn_period, 0);
@@ -574,8 +489,6 @@ TUNE_ATTR(new_files_jdata, 0);
574TUNE_ATTR(quota_simul_sync, 1); 489TUNE_ATTR(quota_simul_sync, 1);
575TUNE_ATTR(stall_secs, 1); 490TUNE_ATTR(stall_secs, 1);
576TUNE_ATTR(statfs_quantum, 1); 491TUNE_ATTR(statfs_quantum, 1);
577TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
578TUNE_ATTR_DAEMON(logd_secs, logd_process);
579TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); 492TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
580 493
581static struct attribute *tune_attrs[] = { 494static struct attribute *tune_attrs[] = {
@@ -589,23 +502,11 @@ static struct attribute *tune_attrs[] = {
589 &tune_attr_quota_simul_sync.attr, 502 &tune_attr_quota_simul_sync.attr,
590 &tune_attr_stall_secs.attr, 503 &tune_attr_stall_secs.attr,
591 &tune_attr_statfs_quantum.attr, 504 &tune_attr_statfs_quantum.attr,
592 &tune_attr_recoverd_secs.attr,
593 &tune_attr_logd_secs.attr,
594 &tune_attr_quota_scale.attr, 505 &tune_attr_quota_scale.attr,
595 &tune_attr_new_files_jdata.attr, 506 &tune_attr_new_files_jdata.attr,
596 NULL, 507 NULL,
597}; 508};
598 509
599static struct attribute_group lockstruct_group = {
600 .name = "lockstruct",
601 .attrs = lockstruct_attrs,
602};
603
604static struct attribute_group args_group = {
605 .name = "args",
606 .attrs = args_attrs,
607};
608
609static struct attribute_group tune_group = { 510static struct attribute_group tune_group = {
610 .name = "tune", 511 .name = "tune",
611 .attrs = tune_attrs, 512 .attrs = tune_attrs,
@@ -626,17 +527,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
626 if (error) 527 if (error)
627 goto fail; 528 goto fail;
628 529
629 error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group);
630 if (error)
631 goto fail_reg;
632
633 error = sysfs_create_group(&sdp->sd_kobj, &args_group);
634 if (error)
635 goto fail_lockstruct;
636
637 error = sysfs_create_group(&sdp->sd_kobj, &tune_group); 530 error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
638 if (error) 531 if (error)
639 goto fail_args; 532 goto fail_reg;
640 533
641 error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group); 534 error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group);
642 if (error) 535 if (error)
@@ -647,10 +540,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
647 540
648fail_tune: 541fail_tune:
649 sysfs_remove_group(&sdp->sd_kobj, &tune_group); 542 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
650fail_args:
651 sysfs_remove_group(&sdp->sd_kobj, &args_group);
652fail_lockstruct:
653 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
654fail_reg: 543fail_reg:
655 kobject_put(&sdp->sd_kobj); 544 kobject_put(&sdp->sd_kobj);
656fail: 545fail:
@@ -661,8 +550,6 @@ fail:
661void gfs2_sys_fs_del(struct gfs2_sbd *sdp) 550void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
662{ 551{
663 sysfs_remove_group(&sdp->sd_kobj, &tune_group); 552 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
664 sysfs_remove_group(&sdp->sd_kobj, &args_group);
665 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
666 sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); 553 sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
667 kobject_put(&sdp->sd_kobj); 554 kobject_put(&sdp->sd_kobj);
668} 555}
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
new file mode 100644
index 000000000000..98d6ef1c1dc0
--- /dev/null
+++ b/fs/gfs2/trace_gfs2.h
@@ -0,0 +1,407 @@
1#if !defined(_TRACE_GFS2_H) || defined(TRACE_HEADER_MULTI_READ)
2#define _TRACE_GFS2_H
3
4#include <linux/tracepoint.h>
5
6#undef TRACE_SYSTEM
7#define TRACE_SYSTEM gfs2
8#define TRACE_INCLUDE_FILE trace_gfs2
9
10#include <linux/fs.h>
11#include <linux/buffer_head.h>
12#include <linux/dlmconstants.h>
13#include <linux/gfs2_ondisk.h>
14#include "incore.h"
15#include "glock.h"
16
17#define dlm_state_name(nn) { DLM_LOCK_##nn, #nn }
18#define glock_trace_name(x) __print_symbolic(x, \
19 dlm_state_name(IV), \
20 dlm_state_name(NL), \
21 dlm_state_name(CR), \
22 dlm_state_name(CW), \
23 dlm_state_name(PR), \
24 dlm_state_name(PW), \
25 dlm_state_name(EX))
26
27#define block_state_name(x) __print_symbolic(x, \
28 { GFS2_BLKST_FREE, "free" }, \
29 { GFS2_BLKST_USED, "used" }, \
30 { GFS2_BLKST_DINODE, "dinode" }, \
31 { GFS2_BLKST_UNLINKED, "unlinked" })
32
33#define show_glock_flags(flags) __print_flags(flags, "", \
34 {(1UL << GLF_LOCK), "l" }, \
35 {(1UL << GLF_DEMOTE), "D" }, \
36 {(1UL << GLF_PENDING_DEMOTE), "d" }, \
37 {(1UL << GLF_DEMOTE_IN_PROGRESS), "p" }, \
38 {(1UL << GLF_DIRTY), "y" }, \
39 {(1UL << GLF_LFLUSH), "f" }, \
40 {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \
41 {(1UL << GLF_REPLY_PENDING), "r" }, \
42 {(1UL << GLF_INITIAL), "I" }, \
43 {(1UL << GLF_FROZEN), "F" })
44
45#ifndef NUMPTY
46#define NUMPTY
47static inline u8 glock_trace_state(unsigned int state)
48{
49 switch(state) {
50 case LM_ST_SHARED:
51 return DLM_LOCK_PR;
52 case LM_ST_DEFERRED:
53 return DLM_LOCK_CW;
54 case LM_ST_EXCLUSIVE:
55 return DLM_LOCK_EX;
56 }
57 return DLM_LOCK_NL;
58}
59#endif
60
61/* Section 1 - Locking
62 *
63 * Objectives:
64 * Latency: Remote demote request to state change
65 * Latency: Local lock request to state change
66 * Latency: State change to lock grant
67 * Correctness: Ordering of local lock state vs. I/O requests
68 * Correctness: Responses to remote demote requests
69 */
70
71/* General glock state change (DLM lock request completes) */
72TRACE_EVENT(gfs2_glock_state_change,
73
74 TP_PROTO(const struct gfs2_glock *gl, unsigned int new_state),
75
76 TP_ARGS(gl, new_state),
77
78 TP_STRUCT__entry(
79 __field( dev_t, dev )
80 __field( u64, glnum )
81 __field( u32, gltype )
82 __field( u8, cur_state )
83 __field( u8, new_state )
84 __field( u8, dmt_state )
85 __field( u8, tgt_state )
86 __field( unsigned long, flags )
87 ),
88
89 TP_fast_assign(
90 __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
91 __entry->glnum = gl->gl_name.ln_number;
92 __entry->gltype = gl->gl_name.ln_type;
93 __entry->cur_state = glock_trace_state(gl->gl_state);
94 __entry->new_state = glock_trace_state(new_state);
95 __entry->tgt_state = glock_trace_state(gl->gl_target);
96 __entry->dmt_state = glock_trace_state(gl->gl_demote_state);
97 __entry->flags = gl->gl_flags;
98 ),
99
100 TP_printk("%u,%u glock %d:%lld state %s to %s tgt:%s dmt:%s flags:%s",
101 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
102 (unsigned long long)__entry->glnum,
103 glock_trace_name(__entry->cur_state),
104 glock_trace_name(__entry->new_state),
105 glock_trace_name(__entry->tgt_state),
106 glock_trace_name(__entry->dmt_state),
107 show_glock_flags(__entry->flags))
108);
109
110/* State change -> unlocked, glock is being deallocated */
111TRACE_EVENT(gfs2_glock_put,
112
113 TP_PROTO(const struct gfs2_glock *gl),
114
115 TP_ARGS(gl),
116
117 TP_STRUCT__entry(
118 __field( dev_t, dev )
119 __field( u64, glnum )
120 __field( u32, gltype )
121 __field( u8, cur_state )
122 __field( unsigned long, flags )
123 ),
124
125 TP_fast_assign(
126 __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
127 __entry->gltype = gl->gl_name.ln_type;
128 __entry->glnum = gl->gl_name.ln_number;
129 __entry->cur_state = glock_trace_state(gl->gl_state);
130 __entry->flags = gl->gl_flags;
131 ),
132
133 TP_printk("%u,%u glock %d:%lld state %s => %s flags:%s",
134 MAJOR(__entry->dev), MINOR(__entry->dev),
135 __entry->gltype, (unsigned long long)__entry->glnum,
136 glock_trace_name(__entry->cur_state),
137 glock_trace_name(DLM_LOCK_IV),
138 show_glock_flags(__entry->flags))
139
140);
141
142/* Callback (local or remote) requesting lock demotion */
143TRACE_EVENT(gfs2_demote_rq,
144
145 TP_PROTO(const struct gfs2_glock *gl),
146
147 TP_ARGS(gl),
148
149 TP_STRUCT__entry(
150 __field( dev_t, dev )
151 __field( u64, glnum )
152 __field( u32, gltype )
153 __field( u8, cur_state )
154 __field( u8, dmt_state )
155 __field( unsigned long, flags )
156 ),
157
158 TP_fast_assign(
159 __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
160 __entry->gltype = gl->gl_name.ln_type;
161 __entry->glnum = gl->gl_name.ln_number;
162 __entry->cur_state = glock_trace_state(gl->gl_state);
163 __entry->dmt_state = glock_trace_state(gl->gl_demote_state);
164 __entry->flags = gl->gl_flags;
165 ),
166
167 TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s",
168 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
169 (unsigned long long)__entry->glnum,
170 glock_trace_name(__entry->cur_state),
171 glock_trace_name(__entry->dmt_state),
172 show_glock_flags(__entry->flags))
173
174);
175
176/* Promotion/grant of a glock */
177TRACE_EVENT(gfs2_promote,
178
179 TP_PROTO(const struct gfs2_holder *gh, int first),
180
181 TP_ARGS(gh, first),
182
183 TP_STRUCT__entry(
184 __field( dev_t, dev )
185 __field( u64, glnum )
186 __field( u32, gltype )
187 __field( int, first )
188 __field( u8, state )
189 ),
190
191 TP_fast_assign(
192 __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev;
193 __entry->glnum = gh->gh_gl->gl_name.ln_number;
194 __entry->gltype = gh->gh_gl->gl_name.ln_type;
195 __entry->first = first;
196 __entry->state = glock_trace_state(gh->gh_state);
197 ),
198
199 TP_printk("%u,%u glock %u:%llu promote %s %s",
200 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
201 (unsigned long long)__entry->glnum,
202 __entry->first ? "first": "other",
203 glock_trace_name(__entry->state))
204);
205
206/* Queue/dequeue a lock request */
207TRACE_EVENT(gfs2_glock_queue,
208
209 TP_PROTO(const struct gfs2_holder *gh, int queue),
210
211 TP_ARGS(gh, queue),
212
213 TP_STRUCT__entry(
214 __field( dev_t, dev )
215 __field( u64, glnum )
216 __field( u32, gltype )
217 __field( int, queue )
218 __field( u8, state )
219 ),
220
221 TP_fast_assign(
222 __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev;
223 __entry->glnum = gh->gh_gl->gl_name.ln_number;
224 __entry->gltype = gh->gh_gl->gl_name.ln_type;
225 __entry->queue = queue;
226 __entry->state = glock_trace_state(gh->gh_state);
227 ),
228
229 TP_printk("%u,%u glock %u:%llu %squeue %s",
230 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
231 (unsigned long long)__entry->glnum,
232 __entry->queue ? "" : "de",
233 glock_trace_name(__entry->state))
234);
235
236/* Section 2 - Log/journal
237 *
238 * Objectives:
239 * Latency: Log flush time
240 * Correctness: pin/unpin vs. disk I/O ordering
241 * Performance: Log usage stats
242 */
243
244/* Pin/unpin a block in the log */
245TRACE_EVENT(gfs2_pin,
246
247 TP_PROTO(const struct gfs2_bufdata *bd, int pin),
248
249 TP_ARGS(bd, pin),
250
251 TP_STRUCT__entry(
252 __field( dev_t, dev )
253 __field( int, pin )
254 __field( u32, len )
255 __field( sector_t, block )
256 __field( u64, ino )
257 ),
258
259 TP_fast_assign(
260 __entry->dev = bd->bd_gl->gl_sbd->sd_vfs->s_dev;
261 __entry->pin = pin;
262 __entry->len = bd->bd_bh->b_size;
263 __entry->block = bd->bd_bh->b_blocknr;
264 __entry->ino = bd->bd_gl->gl_name.ln_number;
265 ),
266
267 TP_printk("%u,%u log %s %llu/%lu inode %llu",
268 MAJOR(__entry->dev), MINOR(__entry->dev),
269 __entry->pin ? "pin" : "unpin",
270 (unsigned long long)__entry->block,
271 (unsigned long)__entry->len,
272 (unsigned long long)__entry->ino)
273);
274
275/* Flushing the log */
276TRACE_EVENT(gfs2_log_flush,
277
278 TP_PROTO(const struct gfs2_sbd *sdp, int start),
279
280 TP_ARGS(sdp, start),
281
282 TP_STRUCT__entry(
283 __field( dev_t, dev )
284 __field( int, start )
285 __field( u64, log_seq )
286 ),
287
288 TP_fast_assign(
289 __entry->dev = sdp->sd_vfs->s_dev;
290 __entry->start = start;
291 __entry->log_seq = sdp->sd_log_sequence;
292 ),
293
294 TP_printk("%u,%u log flush %s %llu",
295 MAJOR(__entry->dev), MINOR(__entry->dev),
296 __entry->start ? "start" : "end",
297 (unsigned long long)__entry->log_seq)
298);
299
300/* Reserving/releasing blocks in the log */
301TRACE_EVENT(gfs2_log_blocks,
302
303 TP_PROTO(const struct gfs2_sbd *sdp, int blocks),
304
305 TP_ARGS(sdp, blocks),
306
307 TP_STRUCT__entry(
308 __field( dev_t, dev )
309 __field( int, blocks )
310 ),
311
312 TP_fast_assign(
313 __entry->dev = sdp->sd_vfs->s_dev;
314 __entry->blocks = blocks;
315 ),
316
317 TP_printk("%u,%u log reserve %d", MAJOR(__entry->dev),
318 MINOR(__entry->dev), __entry->blocks)
319);
320
321/* Section 3 - bmap
322 *
323 * Objectives:
324 * Latency: Bmap request time
325 * Performance: Block allocator tracing
326 * Correctness: Test of disard generation vs. blocks allocated
327 */
328
329/* Map an extent of blocks, possibly a new allocation */
330TRACE_EVENT(gfs2_bmap,
331
332 TP_PROTO(const struct gfs2_inode *ip, const struct buffer_head *bh,
333 sector_t lblock, int create, int errno),
334
335 TP_ARGS(ip, bh, lblock, create, errno),
336
337 TP_STRUCT__entry(
338 __field( dev_t, dev )
339 __field( sector_t, lblock )
340 __field( sector_t, pblock )
341 __field( u64, inum )
342 __field( unsigned long, state )
343 __field( u32, len )
344 __field( int, create )
345 __field( int, errno )
346 ),
347
348 TP_fast_assign(
349 __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev;
350 __entry->lblock = lblock;
351 __entry->pblock = buffer_mapped(bh) ? bh->b_blocknr : 0;
352 __entry->inum = ip->i_no_addr;
353 __entry->state = bh->b_state;
354 __entry->len = bh->b_size;
355 __entry->create = create;
356 __entry->errno = errno;
357 ),
358
359 TP_printk("%u,%u bmap %llu map %llu/%lu to %llu flags:%08lx %s %d",
360 MAJOR(__entry->dev), MINOR(__entry->dev),
361 (unsigned long long)__entry->inum,
362 (unsigned long long)__entry->lblock,
363 (unsigned long)__entry->len,
364 (unsigned long long)__entry->pblock,
365 __entry->state, __entry->create ? "create " : "nocreate",
366 __entry->errno)
367);
368
369/* Keep track of blocks as they are allocated/freed */
370TRACE_EVENT(gfs2_block_alloc,
371
372 TP_PROTO(const struct gfs2_inode *ip, u64 block, unsigned len,
373 u8 block_state),
374
375 TP_ARGS(ip, block, len, block_state),
376
377 TP_STRUCT__entry(
378 __field( dev_t, dev )
379 __field( u64, start )
380 __field( u64, inum )
381 __field( u32, len )
382 __field( u8, block_state )
383 ),
384
385 TP_fast_assign(
386 __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev;
387 __entry->start = block;
388 __entry->inum = ip->i_no_addr;
389 __entry->len = len;
390 __entry->block_state = block_state;
391 ),
392
393 TP_printk("%u,%u bmap %llu alloc %llu/%lu %s",
394 MAJOR(__entry->dev), MINOR(__entry->dev),
395 (unsigned long long)__entry->inum,
396 (unsigned long long)__entry->start,
397 (unsigned long)__entry->len,
398 block_state_name(__entry->block_state))
399);
400
401#endif /* _TRACE_GFS2_H */
402
403/* This part must be outside protection */
404#undef TRACE_INCLUDE_PATH
405#define TRACE_INCLUDE_PATH .
406#include <trace/define_trace.h>
407
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 053752d4b27f..4ef0e9fa3549 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -33,6 +33,9 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
33 BUG_ON(current->journal_info); 33 BUG_ON(current->journal_info);
34 BUG_ON(blocks == 0 && revokes == 0); 34 BUG_ON(blocks == 0 && revokes == 0);
35 35
36 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
37 return -EROFS;
38
36 tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS); 39 tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
37 if (!tr) 40 if (!tr)
38 return -ENOMEM; 41 return -ENOMEM;
@@ -54,12 +57,6 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
54 if (error) 57 if (error)
55 goto fail_holder_uninit; 58 goto fail_holder_uninit;
56 59
57 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
58 tr->tr_t_gh.gh_flags |= GL_NOCACHE;
59 error = -EROFS;
60 goto fail_gunlock;
61 }
62
63 error = gfs2_log_reserve(sdp, tr->tr_reserved); 60 error = gfs2_log_reserve(sdp, tr->tr_reserved);
64 if (error) 61 if (error)
65 goto fail_gunlock; 62 goto fail_gunlock;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index a36bb749926d..6f833dc8e910 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -49,11 +49,23 @@ MODULE_LICENSE("GPL");
49 */ 49 */
50static void hfs_write_super(struct super_block *sb) 50static void hfs_write_super(struct super_block *sb)
51{ 51{
52 lock_super(sb);
52 sb->s_dirt = 0; 53 sb->s_dirt = 0;
53 if (sb->s_flags & MS_RDONLY) 54
54 return;
55 /* sync everything to the buffers */ 55 /* sync everything to the buffers */
56 if (!(sb->s_flags & MS_RDONLY))
57 hfs_mdb_commit(sb);
58 unlock_super(sb);
59}
60
61static int hfs_sync_fs(struct super_block *sb, int wait)
62{
63 lock_super(sb);
56 hfs_mdb_commit(sb); 64 hfs_mdb_commit(sb);
65 sb->s_dirt = 0;
66 unlock_super(sb);
67
68 return 0;
57} 69}
58 70
59/* 71/*
@@ -65,9 +77,15 @@ static void hfs_write_super(struct super_block *sb)
65 */ 77 */
66static void hfs_put_super(struct super_block *sb) 78static void hfs_put_super(struct super_block *sb)
67{ 79{
80 lock_kernel();
81
82 if (sb->s_dirt)
83 hfs_write_super(sb);
68 hfs_mdb_close(sb); 84 hfs_mdb_close(sb);
69 /* release the MDB's resources */ 85 /* release the MDB's resources */
70 hfs_mdb_put(sb); 86 hfs_mdb_put(sb);
87
88 unlock_kernel();
71} 89}
72 90
73/* 91/*
@@ -164,6 +182,7 @@ static const struct super_operations hfs_super_operations = {
164 .clear_inode = hfs_clear_inode, 182 .clear_inode = hfs_clear_inode,
165 .put_super = hfs_put_super, 183 .put_super = hfs_put_super,
166 .write_super = hfs_write_super, 184 .write_super = hfs_write_super,
185 .sync_fs = hfs_sync_fs,
167 .statfs = hfs_statfs, 186 .statfs = hfs_statfs,
168 .remount_fs = hfs_remount, 187 .remount_fs = hfs_remount,
169 .show_options = hfs_show_options, 188 .show_options = hfs_show_options,
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index f2a64020f42e..9fc3af0c0dab 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -152,15 +152,14 @@ static void hfsplus_clear_inode(struct inode *inode)
152 } 152 }
153} 153}
154 154
155static void hfsplus_write_super(struct super_block *sb) 155static int hfsplus_sync_fs(struct super_block *sb, int wait)
156{ 156{
157 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; 157 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
158 158
159 dprint(DBG_SUPER, "hfsplus_write_super\n"); 159 dprint(DBG_SUPER, "hfsplus_write_super\n");
160
161 lock_super(sb);
160 sb->s_dirt = 0; 162 sb->s_dirt = 0;
161 if (sb->s_flags & MS_RDONLY)
162 /* warn? */
163 return;
164 163
165 vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks); 164 vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks);
166 vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc); 165 vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc);
@@ -192,6 +191,16 @@ static void hfsplus_write_super(struct super_block *sb)
192 } 191 }
193 HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP; 192 HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
194 } 193 }
194 unlock_super(sb);
195 return 0;
196}
197
198static void hfsplus_write_super(struct super_block *sb)
199{
200 if (!(sb->s_flags & MS_RDONLY))
201 hfsplus_sync_fs(sb, 1);
202 else
203 sb->s_dirt = 0;
195} 204}
196 205
197static void hfsplus_put_super(struct super_block *sb) 206static void hfsplus_put_super(struct super_block *sb)
@@ -199,6 +208,11 @@ static void hfsplus_put_super(struct super_block *sb)
199 dprint(DBG_SUPER, "hfsplus_put_super\n"); 208 dprint(DBG_SUPER, "hfsplus_put_super\n");
200 if (!sb->s_fs_info) 209 if (!sb->s_fs_info)
201 return; 210 return;
211
212 lock_kernel();
213
214 if (sb->s_dirt)
215 hfsplus_write_super(sb);
202 if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) { 216 if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
203 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; 217 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
204 218
@@ -218,6 +232,8 @@ static void hfsplus_put_super(struct super_block *sb)
218 unload_nls(HFSPLUS_SB(sb).nls); 232 unload_nls(HFSPLUS_SB(sb).nls);
219 kfree(sb->s_fs_info); 233 kfree(sb->s_fs_info);
220 sb->s_fs_info = NULL; 234 sb->s_fs_info = NULL;
235
236 unlock_kernel();
221} 237}
222 238
223static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) 239static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -279,6 +295,7 @@ static const struct super_operations hfsplus_sops = {
279 .clear_inode = hfsplus_clear_inode, 295 .clear_inode = hfsplus_clear_inode,
280 .put_super = hfsplus_put_super, 296 .put_super = hfsplus_put_super,
281 .write_super = hfsplus_write_super, 297 .write_super = hfsplus_write_super,
298 .sync_fs = hfsplus_sync_fs,
282 .statfs = hfsplus_statfs, 299 .statfs = hfsplus_statfs,
283 .remount_fs = hfsplus_remount, 300 .remount_fs = hfsplus_remount,
284 .show_options = hfsplus_show_options, 301 .show_options = hfsplus_show_options,
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index fc77965be841..f2feaa06bf26 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -13,6 +13,7 @@
13#include <linux/statfs.h> 13#include <linux/statfs.h>
14#include <linux/magic.h> 14#include <linux/magic.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/smp_lock.h>
16 17
17/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ 18/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
18 19
@@ -99,11 +100,16 @@ int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2,
99static void hpfs_put_super(struct super_block *s) 100static void hpfs_put_super(struct super_block *s)
100{ 101{
101 struct hpfs_sb_info *sbi = hpfs_sb(s); 102 struct hpfs_sb_info *sbi = hpfs_sb(s);
103
104 lock_kernel();
105
102 kfree(sbi->sb_cp_table); 106 kfree(sbi->sb_cp_table);
103 kfree(sbi->sb_bmp_dir); 107 kfree(sbi->sb_bmp_dir);
104 unmark_dirty(s); 108 unmark_dirty(s);
105 s->s_fs_info = NULL; 109 s->s_fs_info = NULL;
106 kfree(sbi); 110 kfree(sbi);
111
112 unlock_kernel();
107} 113}
108 114
109unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) 115unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
@@ -393,6 +399,8 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
393 399
394 *flags |= MS_NOATIME; 400 *flags |= MS_NOATIME;
395 401
402 lock_kernel();
403 lock_super(s);
396 uid = sbi->sb_uid; gid = sbi->sb_gid; 404 uid = sbi->sb_uid; gid = sbi->sb_gid;
397 umask = 0777 & ~sbi->sb_mode; 405 umask = 0777 & ~sbi->sb_mode;
398 lowercase = sbi->sb_lowercase; conv = sbi->sb_conv; 406 lowercase = sbi->sb_lowercase; conv = sbi->sb_conv;
@@ -425,9 +433,13 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
425 433
426 replace_mount_options(s, new_opts); 434 replace_mount_options(s, new_opts);
427 435
436 unlock_super(s);
437 unlock_kernel();
428 return 0; 438 return 0;
429 439
430out_err: 440out_err:
441 unlock_super(s);
442 unlock_kernel();
431 kfree(new_opts); 443 kfree(new_opts);
432 return -EINVAL; 444 return -EINVAL;
433} 445}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c1462d43e721..941c8425c10b 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -30,6 +30,7 @@
30#include <linux/dnotify.h> 30#include <linux/dnotify.h>
31#include <linux/statfs.h> 31#include <linux/statfs.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/ima.h>
33 34
34#include <asm/uaccess.h> 35#include <asm/uaccess.h>
35 36
@@ -986,6 +987,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
986 &hugetlbfs_file_operations); 987 &hugetlbfs_file_operations);
987 if (!file) 988 if (!file)
988 goto out_dentry; /* inode is already attached */ 989 goto out_dentry; /* inode is already attached */
990 ima_counts_get(file);
989 991
990 return file; 992 return file;
991 993
diff --git a/fs/inode.c b/fs/inode.c
index bca0c618fdb3..a88baebf77cf 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -22,6 +22,7 @@
22#include <linux/cdev.h> 22#include <linux/cdev.h>
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/inotify.h> 24#include <linux/inotify.h>
25#include <linux/fsnotify.h>
25#include <linux/mount.h> 26#include <linux/mount.h>
26#include <linux/async.h> 27#include <linux/async.h>
27 28
@@ -189,6 +190,10 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
189 inode->i_private = NULL; 190 inode->i_private = NULL;
190 inode->i_mapping = mapping; 191 inode->i_mapping = mapping;
191 192
193#ifdef CONFIG_FSNOTIFY
194 inode->i_fsnotify_mask = 0;
195#endif
196
192 return inode; 197 return inode;
193 198
194out_free_security: 199out_free_security:
@@ -221,6 +226,7 @@ void destroy_inode(struct inode *inode)
221 BUG_ON(inode_has_buffers(inode)); 226 BUG_ON(inode_has_buffers(inode));
222 ima_inode_free(inode); 227 ima_inode_free(inode);
223 security_inode_free(inode); 228 security_inode_free(inode);
229 fsnotify_inode_delete(inode);
224 if (inode->i_sb->s_op->destroy_inode) 230 if (inode->i_sb->s_op->destroy_inode)
225 inode->i_sb->s_op->destroy_inode(inode); 231 inode->i_sb->s_op->destroy_inode(inode);
226 else 232 else
@@ -252,6 +258,9 @@ void inode_init_once(struct inode *inode)
252 INIT_LIST_HEAD(&inode->inotify_watches); 258 INIT_LIST_HEAD(&inode->inotify_watches);
253 mutex_init(&inode->inotify_mutex); 259 mutex_init(&inode->inotify_mutex);
254#endif 260#endif
261#ifdef CONFIG_FSNOTIFY
262 INIT_HLIST_HEAD(&inode->i_fsnotify_mark_entries);
263#endif
255} 264}
256EXPORT_SYMBOL(inode_init_once); 265EXPORT_SYMBOL(inode_init_once);
257 266
@@ -398,6 +407,7 @@ int invalidate_inodes(struct super_block *sb)
398 mutex_lock(&iprune_mutex); 407 mutex_lock(&iprune_mutex);
399 spin_lock(&inode_lock); 408 spin_lock(&inode_lock);
400 inotify_unmount_inodes(&sb->s_inodes); 409 inotify_unmount_inodes(&sb->s_inodes);
410 fsnotify_unmount_inodes(&sb->s_inodes);
401 busy = invalidate_list(&sb->s_inodes, &throw_away); 411 busy = invalidate_list(&sb->s_inodes, &throw_away);
402 spin_unlock(&inode_lock); 412 spin_unlock(&inode_lock);
403 413
@@ -1412,7 +1422,7 @@ void file_update_time(struct file *file)
1412 if (IS_NOCMTIME(inode)) 1422 if (IS_NOCMTIME(inode))
1413 return; 1423 return;
1414 1424
1415 err = mnt_want_write(file->f_path.mnt); 1425 err = mnt_want_write_file(file);
1416 if (err) 1426 if (err)
1417 return; 1427 return;
1418 1428
diff --git a/fs/internal.h b/fs/internal.h
index b4dac4fb6b61..d55ef562f0bb 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -25,6 +25,8 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
25 return sb == blockdev_superblock; 25 return sb == blockdev_superblock;
26} 26}
27 27
28extern int __sync_blockdev(struct block_device *bdev, int wait);
29
28#else 30#else
29static inline void bdev_cache_init(void) 31static inline void bdev_cache_init(void)
30{ 32{
@@ -34,6 +36,11 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
34{ 36{
35 return 0; 37 return 0;
36} 38}
39
40static inline int __sync_blockdev(struct block_device *bdev, int wait)
41{
42 return 0;
43}
37#endif 44#endif
38 45
39/* 46/*
@@ -66,3 +73,13 @@ extern void __init mnt_init(void);
66 * fs_struct.c 73 * fs_struct.c
67 */ 74 */
68extern void chroot_fs_refs(struct path *, struct path *); 75extern void chroot_fs_refs(struct path *, struct path *);
76
77/*
78 * file_table.c
79 */
80extern void mark_files_ro(struct super_block *);
81
82/*
83 * super.c
84 */
85extern int do_remount_sb(struct super_block *, int, void *, int);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 82d9c42b8bac..286f38dfc6c0 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -414,10 +414,6 @@ static int file_ioctl(struct file *filp, unsigned int cmd,
414 switch (cmd) { 414 switch (cmd) {
415 case FIBMAP: 415 case FIBMAP:
416 return ioctl_fibmap(filp, p); 416 return ioctl_fibmap(filp, p);
417 case FS_IOC_FIEMAP:
418 return ioctl_fiemap(filp, arg);
419 case FIGETBSZ:
420 return put_user(inode->i_sb->s_blocksize, p);
421 case FIONREAD: 417 case FIONREAD:
422 return put_user(i_size_read(inode) - filp->f_pos, p); 418 return put_user(i_size_read(inode) - filp->f_pos, p);
423 } 419 }
@@ -557,6 +553,16 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
557 error = ioctl_fsthaw(filp); 553 error = ioctl_fsthaw(filp);
558 break; 554 break;
559 555
556 case FS_IOC_FIEMAP:
557 return ioctl_fiemap(filp, arg);
558
559 case FIGETBSZ:
560 {
561 struct inode *inode = filp->f_path.dentry->d_inode;
562 int __user *p = (int __user *)arg;
563 return put_user(inode->i_sb->s_blocksize, p);
564 }
565
560 default: 566 default:
561 if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) 567 if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
562 error = file_ioctl(filp, cmd, arg); 568 error = file_ioctl(filp, cmd, arg);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index b4cbe9603c7d..068b34b5a107 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -42,11 +42,16 @@ static int isofs_dentry_cmp_ms(struct dentry *dentry, struct qstr *a, struct qst
42static void isofs_put_super(struct super_block *sb) 42static void isofs_put_super(struct super_block *sb)
43{ 43{
44 struct isofs_sb_info *sbi = ISOFS_SB(sb); 44 struct isofs_sb_info *sbi = ISOFS_SB(sb);
45
45#ifdef CONFIG_JOLIET 46#ifdef CONFIG_JOLIET
47 lock_kernel();
48
46 if (sbi->s_nls_iocharset) { 49 if (sbi->s_nls_iocharset) {
47 unload_nls(sbi->s_nls_iocharset); 50 unload_nls(sbi->s_nls_iocharset);
48 sbi->s_nls_iocharset = NULL; 51 sbi->s_nls_iocharset = NULL;
49 } 52 }
53
54 unlock_kernel();
50#endif 55#endif
51 56
52 kfree(sbi); 57 kfree(sbi);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 58144102bf25..62be7d294ec2 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1781,7 +1781,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
1781 * Journal abort has very specific semantics, which we describe 1781 * Journal abort has very specific semantics, which we describe
1782 * for journal abort. 1782 * for journal abort.
1783 * 1783 *
1784 * Two internal function, which provide abort to te jbd layer 1784 * Two internal functions, which provide abort to the jbd layer
1785 * itself are here. 1785 * itself are here.
1786 */ 1786 */
1787 1787
@@ -1879,7 +1879,7 @@ void jbd2_journal_abort(journal_t *journal, int errno)
1879 * int jbd2_journal_errno () - returns the journal's error state. 1879 * int jbd2_journal_errno () - returns the journal's error state.
1880 * @journal: journal to examine. 1880 * @journal: journal to examine.
1881 * 1881 *
1882 * This is the errno numbet set with jbd2_journal_abort(), the last 1882 * This is the errno number set with jbd2_journal_abort(), the last
1883 * time the journal was mounted - if the journal was stopped 1883 * time the journal was mounted - if the journal was stopped
1884 * without calling abort this will be 0. 1884 * without calling abort this will be 0.
1885 * 1885 *
@@ -1903,7 +1903,7 @@ int jbd2_journal_errno(journal_t *journal)
1903 * int jbd2_journal_clear_err () - clears the journal's error state 1903 * int jbd2_journal_clear_err () - clears the journal's error state
1904 * @journal: journal to act on. 1904 * @journal: journal to act on.
1905 * 1905 *
1906 * An error must be cleared or Acked to take a FS out of readonly 1906 * An error must be cleared or acked to take a FS out of readonly
1907 * mode. 1907 * mode.
1908 */ 1908 */
1909int jbd2_journal_clear_err(journal_t *journal) 1909int jbd2_journal_clear_err(journal_t *journal)
@@ -1923,7 +1923,7 @@ int jbd2_journal_clear_err(journal_t *journal)
1923 * void jbd2_journal_ack_err() - Ack journal err. 1923 * void jbd2_journal_ack_err() - Ack journal err.
1924 * @journal: journal to act on. 1924 * @journal: journal to act on.
1925 * 1925 *
1926 * An error must be cleared or Acked to take a FS out of readonly 1926 * An error must be cleared or acked to take a FS out of readonly
1927 * mode. 1927 * mode.
1928 */ 1928 */
1929void jbd2_journal_ack_err(journal_t *journal) 1929void jbd2_journal_ack_err(journal_t *journal)
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 249305d65d5b..3451a81b2142 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -20,6 +20,7 @@
20#include <linux/vmalloc.h> 20#include <linux/vmalloc.h>
21#include <linux/vfs.h> 21#include <linux/vfs.h>
22#include <linux/crc32.h> 22#include <linux/crc32.h>
23#include <linux/smp_lock.h>
23#include "nodelist.h" 24#include "nodelist.h"
24 25
25static int jffs2_flash_setup(struct jffs2_sb_info *c); 26static int jffs2_flash_setup(struct jffs2_sb_info *c);
@@ -387,6 +388,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
387 This also catches the case where it was stopped and this 388 This also catches the case where it was stopped and this
388 is just a remount to restart it. 389 is just a remount to restart it.
389 Flush the writebuffer, if neccecary, else we loose it */ 390 Flush the writebuffer, if neccecary, else we loose it */
391 lock_kernel();
390 if (!(sb->s_flags & MS_RDONLY)) { 392 if (!(sb->s_flags & MS_RDONLY)) {
391 jffs2_stop_garbage_collect_thread(c); 393 jffs2_stop_garbage_collect_thread(c);
392 mutex_lock(&c->alloc_sem); 394 mutex_lock(&c->alloc_sem);
@@ -399,24 +401,10 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
399 401
400 *flags |= MS_NOATIME; 402 *flags |= MS_NOATIME;
401 403
404 unlock_kernel();
402 return 0; 405 return 0;
403} 406}
404 407
405void jffs2_write_super (struct super_block *sb)
406{
407 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
408 sb->s_dirt = 0;
409
410 if (sb->s_flags & MS_RDONLY)
411 return;
412
413 D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
414 jffs2_garbage_collect_trigger(c);
415 jffs2_erase_pending_blocks(c, 0);
416 jffs2_flush_wbuf_gc(c, 0);
417}
418
419
420/* jffs2_new_inode: allocate a new inode and inocache, add it to the hash, 408/* jffs2_new_inode: allocate a new inode and inocache, add it to the hash,
421 fill in the raw_inode while you're at it. */ 409 fill in the raw_inode while you're at it. */
422struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri) 410struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri)
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 5e194a5c8e29..2228380c47b9 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -181,7 +181,6 @@ void jffs2_dirty_inode(struct inode *inode);
181struct inode *jffs2_new_inode (struct inode *dir_i, int mode, 181struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
182 struct jffs2_raw_inode *ri); 182 struct jffs2_raw_inode *ri);
183int jffs2_statfs (struct dentry *, struct kstatfs *); 183int jffs2_statfs (struct dentry *, struct kstatfs *);
184void jffs2_write_super (struct super_block *);
185int jffs2_remount_fs (struct super_block *, int *, char *); 184int jffs2_remount_fs (struct super_block *, int *, char *);
186int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); 185int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
187void jffs2_gc_release_inode(struct jffs2_sb_info *c, 186void jffs2_gc_release_inode(struct jffs2_sb_info *c,
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 4c4e18c54a51..07a22caf2687 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -53,10 +53,29 @@ static void jffs2_i_init_once(void *foo)
53 inode_init_once(&f->vfs_inode); 53 inode_init_once(&f->vfs_inode);
54} 54}
55 55
56static void jffs2_write_super(struct super_block *sb)
57{
58 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
59
60 lock_super(sb);
61 sb->s_dirt = 0;
62
63 if (!(sb->s_flags & MS_RDONLY)) {
64 D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
65 jffs2_garbage_collect_trigger(c);
66 jffs2_erase_pending_blocks(c, 0);
67 jffs2_flush_wbuf_gc(c, 0);
68 }
69
70 unlock_super(sb);
71}
72
56static int jffs2_sync_fs(struct super_block *sb, int wait) 73static int jffs2_sync_fs(struct super_block *sb, int wait)
57{ 74{
58 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 75 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
59 76
77 jffs2_write_super(sb);
78
60 mutex_lock(&c->alloc_sem); 79 mutex_lock(&c->alloc_sem);
61 jffs2_flush_wbuf_pad(c); 80 jffs2_flush_wbuf_pad(c);
62 mutex_unlock(&c->alloc_sem); 81 mutex_unlock(&c->alloc_sem);
@@ -174,6 +193,11 @@ static void jffs2_put_super (struct super_block *sb)
174 193
175 D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n")); 194 D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n"));
176 195
196 lock_kernel();
197
198 if (sb->s_dirt)
199 jffs2_write_super(sb);
200
177 mutex_lock(&c->alloc_sem); 201 mutex_lock(&c->alloc_sem);
178 jffs2_flush_wbuf_pad(c); 202 jffs2_flush_wbuf_pad(c);
179 mutex_unlock(&c->alloc_sem); 203 mutex_unlock(&c->alloc_sem);
@@ -192,6 +216,8 @@ static void jffs2_put_super (struct super_block *sb)
192 if (c->mtd->sync) 216 if (c->mtd->sync)
193 c->mtd->sync(c->mtd); 217 c->mtd->sync(c->mtd);
194 218
219 unlock_kernel();
220
195 D1(printk(KERN_DEBUG "jffs2_put_super returning\n")); 221 D1(printk(KERN_DEBUG "jffs2_put_super returning\n"));
196} 222}
197 223
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 346057218edc..0fc30407f039 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -2571,6 +2571,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2571 2571
2572 txAbort(tid, 0); 2572 txAbort(tid, 0);
2573 txEnd(tid); 2573 txEnd(tid);
2574 mutex_unlock(&JFS_IP(ipimap)->commit_mutex);
2574 2575
2575 /* release the inode map lock */ 2576 /* release the inode map lock */
2576 IWRITE_UNLOCK(ipimap); 2577 IWRITE_UNLOCK(ipimap);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 6f21adf9479a..09b1b6ee2186 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -32,6 +32,7 @@
32#include <linux/crc32.h> 32#include <linux/crc32.h>
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
34#include <linux/seq_file.h> 34#include <linux/seq_file.h>
35#include <linux/smp_lock.h>
35 36
36#include "jfs_incore.h" 37#include "jfs_incore.h"
37#include "jfs_filsys.h" 38#include "jfs_filsys.h"
@@ -183,6 +184,9 @@ static void jfs_put_super(struct super_block *sb)
183 int rc; 184 int rc;
184 185
185 jfs_info("In jfs_put_super"); 186 jfs_info("In jfs_put_super");
187
188 lock_kernel();
189
186 rc = jfs_umount(sb); 190 rc = jfs_umount(sb);
187 if (rc) 191 if (rc)
188 jfs_err("jfs_umount failed with return code %d", rc); 192 jfs_err("jfs_umount failed with return code %d", rc);
@@ -195,6 +199,8 @@ static void jfs_put_super(struct super_block *sb)
195 sbi->direct_inode = NULL; 199 sbi->direct_inode = NULL;
196 200
197 kfree(sbi); 201 kfree(sbi);
202
203 unlock_kernel();
198} 204}
199 205
200enum { 206enum {
@@ -370,19 +376,24 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
370 s64 newLVSize = 0; 376 s64 newLVSize = 0;
371 int rc = 0; 377 int rc = 0;
372 int flag = JFS_SBI(sb)->flag; 378 int flag = JFS_SBI(sb)->flag;
379 int ret;
373 380
374 if (!parse_options(data, sb, &newLVSize, &flag)) { 381 if (!parse_options(data, sb, &newLVSize, &flag)) {
375 return -EINVAL; 382 return -EINVAL;
376 } 383 }
384 lock_kernel();
377 if (newLVSize) { 385 if (newLVSize) {
378 if (sb->s_flags & MS_RDONLY) { 386 if (sb->s_flags & MS_RDONLY) {
379 printk(KERN_ERR 387 printk(KERN_ERR
380 "JFS: resize requires volume to be mounted read-write\n"); 388 "JFS: resize requires volume to be mounted read-write\n");
389 unlock_kernel();
381 return -EROFS; 390 return -EROFS;
382 } 391 }
383 rc = jfs_extendfs(sb, newLVSize, 0); 392 rc = jfs_extendfs(sb, newLVSize, 0);
384 if (rc) 393 if (rc) {
394 unlock_kernel();
385 return rc; 395 return rc;
396 }
386 } 397 }
387 398
388 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 399 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
@@ -393,23 +404,31 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
393 truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0); 404 truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0);
394 405
395 JFS_SBI(sb)->flag = flag; 406 JFS_SBI(sb)->flag = flag;
396 return jfs_mount_rw(sb, 1); 407 ret = jfs_mount_rw(sb, 1);
408 unlock_kernel();
409 return ret;
397 } 410 }
398 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { 411 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
399 rc = jfs_umount_rw(sb); 412 rc = jfs_umount_rw(sb);
400 JFS_SBI(sb)->flag = flag; 413 JFS_SBI(sb)->flag = flag;
414 unlock_kernel();
401 return rc; 415 return rc;
402 } 416 }
403 if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) 417 if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
404 if (!(sb->s_flags & MS_RDONLY)) { 418 if (!(sb->s_flags & MS_RDONLY)) {
405 rc = jfs_umount_rw(sb); 419 rc = jfs_umount_rw(sb);
406 if (rc) 420 if (rc) {
421 unlock_kernel();
407 return rc; 422 return rc;
423 }
408 JFS_SBI(sb)->flag = flag; 424 JFS_SBI(sb)->flag = flag;
409 return jfs_mount_rw(sb, 1); 425 ret = jfs_mount_rw(sb, 1);
426 unlock_kernel();
427 return ret;
410 } 428 }
411 JFS_SBI(sb)->flag = flag; 429 JFS_SBI(sb)->flag = flag;
412 430
431 unlock_kernel();
413 return 0; 432 return 0;
414} 433}
415 434
@@ -720,8 +739,10 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
720 blk++; 739 blk++;
721 } 740 }
722out: 741out:
723 if (len == towrite) 742 if (len == towrite) {
743 mutex_unlock(&inode->i_mutex);
724 return err; 744 return err;
745 }
725 if (inode->i_size < off+len-towrite) 746 if (inode->i_size < off+len-towrite)
726 i_size_write(inode, off+len-towrite); 747 i_size_write(inode, off+len-towrite);
727 inode->i_version++; 748 inode->i_version++;
diff --git a/fs/libfs.c b/fs/libfs.c
index 80046ddf5063..ddfa89948c3f 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -9,6 +9,8 @@
9#include <linux/vfs.h> 9#include <linux/vfs.h>
10#include <linux/mutex.h> 10#include <linux/mutex.h>
11#include <linux/exportfs.h> 11#include <linux/exportfs.h>
12#include <linux/writeback.h>
13#include <linux/buffer_head.h>
12 14
13#include <asm/uaccess.h> 15#include <asm/uaccess.h>
14 16
@@ -807,6 +809,29 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
807} 809}
808EXPORT_SYMBOL_GPL(generic_fh_to_parent); 810EXPORT_SYMBOL_GPL(generic_fh_to_parent);
809 811
812int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
813{
814 struct writeback_control wbc = {
815 .sync_mode = WB_SYNC_ALL,
816 .nr_to_write = 0, /* metadata-only; caller takes care of data */
817 };
818 struct inode *inode = dentry->d_inode;
819 int err;
820 int ret;
821
822 ret = sync_mapping_buffers(inode->i_mapping);
823 if (!(inode->i_state & I_DIRTY))
824 return ret;
825 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
826 return ret;
827
828 err = sync_inode(inode, &wbc);
829 if (ret == 0)
830 ret = err;
831 return ret;
832}
833EXPORT_SYMBOL(simple_fsync);
834
810EXPORT_SYMBOL(dcache_dir_close); 835EXPORT_SYMBOL(dcache_dir_close);
811EXPORT_SYMBOL(dcache_dir_lseek); 836EXPORT_SYMBOL(dcache_dir_lseek);
812EXPORT_SYMBOL(dcache_dir_open); 837EXPORT_SYMBOL(dcache_dir_open);
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index d4946c4c90e2..e5f206467e40 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -22,7 +22,7 @@ static int minix_readdir(struct file *, void *, filldir_t);
22const struct file_operations minix_dir_operations = { 22const struct file_operations minix_dir_operations = {
23 .read = generic_read_dir, 23 .read = generic_read_dir,
24 .readdir = minix_readdir, 24 .readdir = minix_readdir,
25 .fsync = minix_sync_file, 25 .fsync = simple_fsync,
26}; 26};
27 27
28static inline void dir_put_page(struct page *page) 28static inline void dir_put_page(struct page *page)
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 17765f697e50..3eec3e607a87 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -6,15 +6,12 @@
6 * minix regular file handling primitives 6 * minix regular file handling primitives
7 */ 7 */
8 8
9#include <linux/buffer_head.h> /* for fsync_inode_buffers() */
10#include "minix.h" 9#include "minix.h"
11 10
12/* 11/*
13 * We have mostly NULLs here: the current defaults are OK for 12 * We have mostly NULLs here: the current defaults are OK for
14 * the minix filesystem. 13 * the minix filesystem.
15 */ 14 */
16int minix_sync_file(struct file *, struct dentry *, int);
17
18const struct file_operations minix_file_operations = { 15const struct file_operations minix_file_operations = {
19 .llseek = generic_file_llseek, 16 .llseek = generic_file_llseek,
20 .read = do_sync_read, 17 .read = do_sync_read,
@@ -22,7 +19,7 @@ const struct file_operations minix_file_operations = {
22 .write = do_sync_write, 19 .write = do_sync_write,
23 .aio_write = generic_file_aio_write, 20 .aio_write = generic_file_aio_write,
24 .mmap = generic_file_mmap, 21 .mmap = generic_file_mmap,
25 .fsync = minix_sync_file, 22 .fsync = simple_fsync,
26 .splice_read = generic_file_splice_read, 23 .splice_read = generic_file_splice_read,
27}; 24};
28 25
@@ -30,18 +27,3 @@ const struct inode_operations minix_file_inode_operations = {
30 .truncate = minix_truncate, 27 .truncate = minix_truncate,
31 .getattr = minix_getattr, 28 .getattr = minix_getattr,
32}; 29};
33
34int minix_sync_file(struct file * file, struct dentry *dentry, int datasync)
35{
36 struct inode *inode = dentry->d_inode;
37 int err;
38
39 err = sync_mapping_buffers(inode->i_mapping);
40 if (!(inode->i_state & I_DIRTY))
41 return err;
42 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
43 return err;
44
45 err |= minix_sync_inode(inode);
46 return err ? -EIO : 0;
47}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index daad3c2740db..f91a23693597 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -35,6 +35,8 @@ static void minix_put_super(struct super_block *sb)
35 int i; 35 int i;
36 struct minix_sb_info *sbi = minix_sb(sb); 36 struct minix_sb_info *sbi = minix_sb(sb);
37 37
38 lock_kernel();
39
38 if (!(sb->s_flags & MS_RDONLY)) { 40 if (!(sb->s_flags & MS_RDONLY)) {
39 if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ 41 if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */
40 sbi->s_ms->s_state = sbi->s_mount_state; 42 sbi->s_ms->s_state = sbi->s_mount_state;
@@ -49,7 +51,7 @@ static void minix_put_super(struct super_block *sb)
49 sb->s_fs_info = NULL; 51 sb->s_fs_info = NULL;
50 kfree(sbi); 52 kfree(sbi);
51 53
52 return; 54 unlock_kernel();
53} 55}
54 56
55static struct kmem_cache * minix_inode_cachep; 57static struct kmem_cache * minix_inode_cachep;
@@ -554,38 +556,25 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
554 return bh; 556 return bh;
555} 557}
556 558
557static struct buffer_head *minix_update_inode(struct inode *inode) 559static int minix_write_inode(struct inode *inode, int wait)
558{
559 if (INODE_VERSION(inode) == MINIX_V1)
560 return V1_minix_update_inode(inode);
561 else
562 return V2_minix_update_inode(inode);
563}
564
565static int minix_write_inode(struct inode * inode, int wait)
566{
567 brelse(minix_update_inode(inode));
568 return 0;
569}
570
571int minix_sync_inode(struct inode * inode)
572{ 560{
573 int err = 0; 561 int err = 0;
574 struct buffer_head *bh; 562 struct buffer_head *bh;
575 563
576 bh = minix_update_inode(inode); 564 if (INODE_VERSION(inode) == MINIX_V1)
577 if (bh && buffer_dirty(bh)) 565 bh = V1_minix_update_inode(inode);
578 { 566 else
567 bh = V2_minix_update_inode(inode);
568 if (!bh)
569 return -EIO;
570 if (wait && buffer_dirty(bh)) {
579 sync_dirty_buffer(bh); 571 sync_dirty_buffer(bh);
580 if (buffer_req(bh) && !buffer_uptodate(bh)) 572 if (buffer_req(bh) && !buffer_uptodate(bh)) {
581 {
582 printk("IO error syncing minix inode [%s:%08lx]\n", 573 printk("IO error syncing minix inode [%s:%08lx]\n",
583 inode->i_sb->s_id, inode->i_ino); 574 inode->i_sb->s_id, inode->i_ino);
584 err = -1; 575 err = -EIO;
585 } 576 }
586 } 577 }
587 else if (!bh)
588 err = -1;
589 brelse (bh); 578 brelse (bh);
590 return err; 579 return err;
591} 580}
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index e6a0b193bea4..cb7fdd11f9a5 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -57,7 +57,6 @@ extern int __minix_write_begin(struct file *file, struct address_space *mapping,
57extern void V1_minix_truncate(struct inode *); 57extern void V1_minix_truncate(struct inode *);
58extern void V2_minix_truncate(struct inode *); 58extern void V2_minix_truncate(struct inode *);
59extern void minix_truncate(struct inode *); 59extern void minix_truncate(struct inode *);
60extern int minix_sync_inode(struct inode *);
61extern void minix_set_inode(struct inode *, dev_t); 60extern void minix_set_inode(struct inode *, dev_t);
62extern int V1_minix_get_block(struct inode *, long, struct buffer_head *, int); 61extern int V1_minix_get_block(struct inode *, long, struct buffer_head *, int);
63extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int); 62extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int);
@@ -72,7 +71,6 @@ extern int minix_empty_dir(struct inode*);
72extern void minix_set_link(struct minix_dir_entry*, struct page*, struct inode*); 71extern void minix_set_link(struct minix_dir_entry*, struct page*, struct inode*);
73extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**); 72extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**);
74extern ino_t minix_inode_by_name(struct dentry*); 73extern ino_t minix_inode_by_name(struct dentry*);
75extern int minix_sync_file(struct file *, struct dentry *, int);
76 74
77extern const struct inode_operations minix_file_inode_operations; 75extern const struct inode_operations minix_file_inode_operations;
78extern const struct inode_operations minix_dir_inode_operations; 76extern const struct inode_operations minix_dir_inode_operations;
diff --git a/fs/mpage.c b/fs/mpage.c
index 680ba60863ff..42381bd6543b 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -379,7 +379,8 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
379 struct buffer_head map_bh; 379 struct buffer_head map_bh;
380 unsigned long first_logical_block = 0; 380 unsigned long first_logical_block = 0;
381 381
382 clear_buffer_mapped(&map_bh); 382 map_bh.b_state = 0;
383 map_bh.b_size = 0;
383 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 384 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
384 struct page *page = list_entry(pages->prev, struct page, lru); 385 struct page *page = list_entry(pages->prev, struct page, lru);
385 386
@@ -412,7 +413,8 @@ int mpage_readpage(struct page *page, get_block_t get_block)
412 struct buffer_head map_bh; 413 struct buffer_head map_bh;
413 unsigned long first_logical_block = 0; 414 unsigned long first_logical_block = 0;
414 415
415 clear_buffer_mapped(&map_bh); 416 map_bh.b_state = 0;
417 map_bh.b_size = 0;
416 bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, 418 bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
417 &map_bh, &first_logical_block, get_block); 419 &map_bh, &first_logical_block, get_block);
418 if (bio) 420 if (bio)
diff --git a/fs/namei.c b/fs/namei.c
index 967c3db92724..527119afb6a5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -552,6 +552,17 @@ static __always_inline int link_path_walk(const char *name, struct nameidata *nd
552 return result; 552 return result;
553} 553}
554 554
555static __always_inline void set_root(struct nameidata *nd)
556{
557 if (!nd->root.mnt) {
558 struct fs_struct *fs = current->fs;
559 read_lock(&fs->lock);
560 nd->root = fs->root;
561 path_get(&nd->root);
562 read_unlock(&fs->lock);
563 }
564}
565
555static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) 566static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
556{ 567{
557 int res = 0; 568 int res = 0;
@@ -560,14 +571,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
560 goto fail; 571 goto fail;
561 572
562 if (*link == '/') { 573 if (*link == '/') {
563 struct fs_struct *fs = current->fs; 574 set_root(nd);
564
565 path_put(&nd->path); 575 path_put(&nd->path);
566 576 nd->path = nd->root;
567 read_lock(&fs->lock); 577 path_get(&nd->root);
568 nd->path = fs->root;
569 path_get(&fs->root);
570 read_unlock(&fs->lock);
571 } 578 }
572 579
573 res = link_path_walk(link, nd); 580 res = link_path_walk(link, nd);
@@ -668,23 +675,23 @@ loop:
668 return err; 675 return err;
669} 676}
670 677
671int follow_up(struct vfsmount **mnt, struct dentry **dentry) 678int follow_up(struct path *path)
672{ 679{
673 struct vfsmount *parent; 680 struct vfsmount *parent;
674 struct dentry *mountpoint; 681 struct dentry *mountpoint;
675 spin_lock(&vfsmount_lock); 682 spin_lock(&vfsmount_lock);
676 parent=(*mnt)->mnt_parent; 683 parent = path->mnt->mnt_parent;
677 if (parent == *mnt) { 684 if (parent == path->mnt) {
678 spin_unlock(&vfsmount_lock); 685 spin_unlock(&vfsmount_lock);
679 return 0; 686 return 0;
680 } 687 }
681 mntget(parent); 688 mntget(parent);
682 mountpoint=dget((*mnt)->mnt_mountpoint); 689 mountpoint = dget(path->mnt->mnt_mountpoint);
683 spin_unlock(&vfsmount_lock); 690 spin_unlock(&vfsmount_lock);
684 dput(*dentry); 691 dput(path->dentry);
685 *dentry = mountpoint; 692 path->dentry = mountpoint;
686 mntput(*mnt); 693 mntput(path->mnt);
687 *mnt = parent; 694 path->mnt = parent;
688 return 1; 695 return 1;
689} 696}
690 697
@@ -695,7 +702,7 @@ static int __follow_mount(struct path *path)
695{ 702{
696 int res = 0; 703 int res = 0;
697 while (d_mountpoint(path->dentry)) { 704 while (d_mountpoint(path->dentry)) {
698 struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry); 705 struct vfsmount *mounted = lookup_mnt(path);
699 if (!mounted) 706 if (!mounted)
700 break; 707 break;
701 dput(path->dentry); 708 dput(path->dentry);
@@ -708,32 +715,32 @@ static int __follow_mount(struct path *path)
708 return res; 715 return res;
709} 716}
710 717
711static void follow_mount(struct vfsmount **mnt, struct dentry **dentry) 718static void follow_mount(struct path *path)
712{ 719{
713 while (d_mountpoint(*dentry)) { 720 while (d_mountpoint(path->dentry)) {
714 struct vfsmount *mounted = lookup_mnt(*mnt, *dentry); 721 struct vfsmount *mounted = lookup_mnt(path);
715 if (!mounted) 722 if (!mounted)
716 break; 723 break;
717 dput(*dentry); 724 dput(path->dentry);
718 mntput(*mnt); 725 mntput(path->mnt);
719 *mnt = mounted; 726 path->mnt = mounted;
720 *dentry = dget(mounted->mnt_root); 727 path->dentry = dget(mounted->mnt_root);
721 } 728 }
722} 729}
723 730
724/* no need for dcache_lock, as serialization is taken care in 731/* no need for dcache_lock, as serialization is taken care in
725 * namespace.c 732 * namespace.c
726 */ 733 */
727int follow_down(struct vfsmount **mnt, struct dentry **dentry) 734int follow_down(struct path *path)
728{ 735{
729 struct vfsmount *mounted; 736 struct vfsmount *mounted;
730 737
731 mounted = lookup_mnt(*mnt, *dentry); 738 mounted = lookup_mnt(path);
732 if (mounted) { 739 if (mounted) {
733 dput(*dentry); 740 dput(path->dentry);
734 mntput(*mnt); 741 mntput(path->mnt);
735 *mnt = mounted; 742 path->mnt = mounted;
736 *dentry = dget(mounted->mnt_root); 743 path->dentry = dget(mounted->mnt_root);
737 return 1; 744 return 1;
738 } 745 }
739 return 0; 746 return 0;
@@ -741,19 +748,16 @@ int follow_down(struct vfsmount **mnt, struct dentry **dentry)
741 748
742static __always_inline void follow_dotdot(struct nameidata *nd) 749static __always_inline void follow_dotdot(struct nameidata *nd)
743{ 750{
744 struct fs_struct *fs = current->fs; 751 set_root(nd);
745 752
746 while(1) { 753 while(1) {
747 struct vfsmount *parent; 754 struct vfsmount *parent;
748 struct dentry *old = nd->path.dentry; 755 struct dentry *old = nd->path.dentry;
749 756
750 read_lock(&fs->lock); 757 if (nd->path.dentry == nd->root.dentry &&
751 if (nd->path.dentry == fs->root.dentry && 758 nd->path.mnt == nd->root.mnt) {
752 nd->path.mnt == fs->root.mnt) {
753 read_unlock(&fs->lock);
754 break; 759 break;
755 } 760 }
756 read_unlock(&fs->lock);
757 spin_lock(&dcache_lock); 761 spin_lock(&dcache_lock);
758 if (nd->path.dentry != nd->path.mnt->mnt_root) { 762 if (nd->path.dentry != nd->path.mnt->mnt_root) {
759 nd->path.dentry = dget(nd->path.dentry->d_parent); 763 nd->path.dentry = dget(nd->path.dentry->d_parent);
@@ -775,7 +779,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
775 mntput(nd->path.mnt); 779 mntput(nd->path.mnt);
776 nd->path.mnt = parent; 780 nd->path.mnt = parent;
777 } 781 }
778 follow_mount(&nd->path.mnt, &nd->path.dentry); 782 follow_mount(&nd->path);
779} 783}
780 784
781/* 785/*
@@ -853,7 +857,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
853 err = inode_permission(nd->path.dentry->d_inode, 857 err = inode_permission(nd->path.dentry->d_inode,
854 MAY_EXEC); 858 MAY_EXEC);
855 if (!err) 859 if (!err)
856 err = ima_path_check(&nd->path, MAY_EXEC); 860 err = ima_path_check(&nd->path, MAY_EXEC,
861 IMA_COUNT_UPDATE);
857 if (err) 862 if (err)
858 break; 863 break;
859 864
@@ -1016,25 +1021,23 @@ static int path_walk(const char *name, struct nameidata *nd)
1016 return link_path_walk(name, nd); 1021 return link_path_walk(name, nd);
1017} 1022}
1018 1023
1019/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ 1024static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1020static int do_path_lookup(int dfd, const char *name,
1021 unsigned int flags, struct nameidata *nd)
1022{ 1025{
1023 int retval = 0; 1026 int retval = 0;
1024 int fput_needed; 1027 int fput_needed;
1025 struct file *file; 1028 struct file *file;
1026 struct fs_struct *fs = current->fs;
1027 1029
1028 nd->last_type = LAST_ROOT; /* if there are only slashes... */ 1030 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1029 nd->flags = flags; 1031 nd->flags = flags;
1030 nd->depth = 0; 1032 nd->depth = 0;
1033 nd->root.mnt = NULL;
1031 1034
1032 if (*name=='/') { 1035 if (*name=='/') {
1033 read_lock(&fs->lock); 1036 set_root(nd);
1034 nd->path = fs->root; 1037 nd->path = nd->root;
1035 path_get(&fs->root); 1038 path_get(&nd->root);
1036 read_unlock(&fs->lock);
1037 } else if (dfd == AT_FDCWD) { 1039 } else if (dfd == AT_FDCWD) {
1040 struct fs_struct *fs = current->fs;
1038 read_lock(&fs->lock); 1041 read_lock(&fs->lock);
1039 nd->path = fs->pwd; 1042 nd->path = fs->pwd;
1040 path_get(&fs->pwd); 1043 path_get(&fs->pwd);
@@ -1062,17 +1065,29 @@ static int do_path_lookup(int dfd, const char *name,
1062 1065
1063 fput_light(file, fput_needed); 1066 fput_light(file, fput_needed);
1064 } 1067 }
1068 return 0;
1065 1069
1066 retval = path_walk(name, nd); 1070fput_fail:
1071 fput_light(file, fput_needed);
1072out_fail:
1073 return retval;
1074}
1075
1076/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
1077static int do_path_lookup(int dfd, const char *name,
1078 unsigned int flags, struct nameidata *nd)
1079{
1080 int retval = path_init(dfd, name, flags, nd);
1081 if (!retval)
1082 retval = path_walk(name, nd);
1067 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && 1083 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1068 nd->path.dentry->d_inode)) 1084 nd->path.dentry->d_inode))
1069 audit_inode(name, nd->path.dentry); 1085 audit_inode(name, nd->path.dentry);
1070out_fail: 1086 if (nd->root.mnt) {
1087 path_put(&nd->root);
1088 nd->root.mnt = NULL;
1089 }
1071 return retval; 1090 return retval;
1072
1073fput_fail:
1074 fput_light(file, fput_needed);
1075 goto out_fail;
1076} 1091}
1077 1092
1078int path_lookup(const char *name, unsigned int flags, 1093int path_lookup(const char *name, unsigned int flags,
@@ -1112,14 +1127,18 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1112 nd->path.dentry = dentry; 1127 nd->path.dentry = dentry;
1113 nd->path.mnt = mnt; 1128 nd->path.mnt = mnt;
1114 path_get(&nd->path); 1129 path_get(&nd->path);
1130 nd->root = nd->path;
1131 path_get(&nd->root);
1115 1132
1116 retval = path_walk(name, nd); 1133 retval = path_walk(name, nd);
1117 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && 1134 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1118 nd->path.dentry->d_inode)) 1135 nd->path.dentry->d_inode))
1119 audit_inode(name, nd->path.dentry); 1136 audit_inode(name, nd->path.dentry);
1120 1137
1121 return retval; 1138 path_put(&nd->root);
1139 nd->root.mnt = NULL;
1122 1140
1141 return retval;
1123} 1142}
1124 1143
1125/** 1144/**
@@ -1515,7 +1534,8 @@ int may_open(struct path *path, int acc_mode, int flag)
1515 return error; 1534 return error;
1516 1535
1517 error = ima_path_check(path, 1536 error = ima_path_check(path,
1518 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); 1537 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC),
1538 IMA_COUNT_UPDATE);
1519 if (error) 1539 if (error)
1520 return error; 1540 return error;
1521 /* 1541 /*
@@ -1674,9 +1694,14 @@ struct file *do_filp_open(int dfd, const char *pathname,
1674 /* 1694 /*
1675 * Create - we need to know the parent. 1695 * Create - we need to know the parent.
1676 */ 1696 */
1677 error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); 1697 error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
1678 if (error) 1698 if (error)
1679 return ERR_PTR(error); 1699 return ERR_PTR(error);
1700 error = path_walk(pathname, &nd);
1701 if (error)
1702 return ERR_PTR(error);
1703 if (unlikely(!audit_dummy_context()))
1704 audit_inode(pathname, nd.path.dentry);
1680 1705
1681 /* 1706 /*
1682 * We have the parent and last component. First of all, check 1707 * We have the parent and last component. First of all, check
@@ -1804,6 +1829,8 @@ exit:
1804 if (!IS_ERR(nd.intent.open.file)) 1829 if (!IS_ERR(nd.intent.open.file))
1805 release_open_intent(&nd); 1830 release_open_intent(&nd);
1806exit_parent: 1831exit_parent:
1832 if (nd.root.mnt)
1833 path_put(&nd.root);
1807 path_put(&nd.path); 1834 path_put(&nd.path);
1808 return ERR_PTR(error); 1835 return ERR_PTR(error);
1809 1836
diff --git a/fs/namespace.c b/fs/namespace.c
index 134d494158d9..2dd333b0fe7f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -131,10 +131,20 @@ struct vfsmount *alloc_vfsmnt(const char *name)
131 INIT_LIST_HEAD(&mnt->mnt_share); 131 INIT_LIST_HEAD(&mnt->mnt_share);
132 INIT_LIST_HEAD(&mnt->mnt_slave_list); 132 INIT_LIST_HEAD(&mnt->mnt_slave_list);
133 INIT_LIST_HEAD(&mnt->mnt_slave); 133 INIT_LIST_HEAD(&mnt->mnt_slave);
134 atomic_set(&mnt->__mnt_writers, 0); 134#ifdef CONFIG_SMP
135 mnt->mnt_writers = alloc_percpu(int);
136 if (!mnt->mnt_writers)
137 goto out_free_devname;
138#else
139 mnt->mnt_writers = 0;
140#endif
135 } 141 }
136 return mnt; 142 return mnt;
137 143
144#ifdef CONFIG_SMP
145out_free_devname:
146 kfree(mnt->mnt_devname);
147#endif
138out_free_id: 148out_free_id:
139 mnt_free_id(mnt); 149 mnt_free_id(mnt);
140out_free_cache: 150out_free_cache:
@@ -171,65 +181,38 @@ int __mnt_is_readonly(struct vfsmount *mnt)
171} 181}
172EXPORT_SYMBOL_GPL(__mnt_is_readonly); 182EXPORT_SYMBOL_GPL(__mnt_is_readonly);
173 183
174struct mnt_writer { 184static inline void inc_mnt_writers(struct vfsmount *mnt)
175 /* 185{
176 * If holding multiple instances of this lock, they 186#ifdef CONFIG_SMP
177 * must be ordered by cpu number. 187 (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
178 */ 188#else
179 spinlock_t lock; 189 mnt->mnt_writers++;
180 struct lock_class_key lock_class; /* compiles out with !lockdep */ 190#endif
181 unsigned long count; 191}
182 struct vfsmount *mnt;
183} ____cacheline_aligned_in_smp;
184static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
185 192
186static int __init init_mnt_writers(void) 193static inline void dec_mnt_writers(struct vfsmount *mnt)
187{ 194{
188 int cpu; 195#ifdef CONFIG_SMP
189 for_each_possible_cpu(cpu) { 196 (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
190 struct mnt_writer *writer = &per_cpu(mnt_writers, cpu); 197#else
191 spin_lock_init(&writer->lock); 198 mnt->mnt_writers--;
192 lockdep_set_class(&writer->lock, &writer->lock_class); 199#endif
193 writer->count = 0;
194 }
195 return 0;
196} 200}
197fs_initcall(init_mnt_writers);
198 201
199static void unlock_mnt_writers(void) 202static unsigned int count_mnt_writers(struct vfsmount *mnt)
200{ 203{
204#ifdef CONFIG_SMP
205 unsigned int count = 0;
201 int cpu; 206 int cpu;
202 struct mnt_writer *cpu_writer;
203 207
204 for_each_possible_cpu(cpu) { 208 for_each_possible_cpu(cpu) {
205 cpu_writer = &per_cpu(mnt_writers, cpu); 209 count += *per_cpu_ptr(mnt->mnt_writers, cpu);
206 spin_unlock(&cpu_writer->lock);
207 } 210 }
208}
209 211
210static inline void __clear_mnt_count(struct mnt_writer *cpu_writer) 212 return count;
211{ 213#else
212 if (!cpu_writer->mnt) 214 return mnt->mnt_writers;
213 return; 215#endif
214 /*
215 * This is in case anyone ever leaves an invalid,
216 * old ->mnt and a count of 0.
217 */
218 if (!cpu_writer->count)
219 return;
220 atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
221 cpu_writer->count = 0;
222}
223 /*
224 * must hold cpu_writer->lock
225 */
226static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
227 struct vfsmount *mnt)
228{
229 if (cpu_writer->mnt == mnt)
230 return;
231 __clear_mnt_count(cpu_writer);
232 cpu_writer->mnt = mnt;
233} 216}
234 217
235/* 218/*
@@ -253,74 +236,73 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
253int mnt_want_write(struct vfsmount *mnt) 236int mnt_want_write(struct vfsmount *mnt)
254{ 237{
255 int ret = 0; 238 int ret = 0;
256 struct mnt_writer *cpu_writer;
257 239
258 cpu_writer = &get_cpu_var(mnt_writers); 240 preempt_disable();
259 spin_lock(&cpu_writer->lock); 241 inc_mnt_writers(mnt);
242 /*
243 * The store to inc_mnt_writers must be visible before we pass
244 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
245 * incremented count after it has set MNT_WRITE_HOLD.
246 */
247 smp_mb();
248 while (mnt->mnt_flags & MNT_WRITE_HOLD)
249 cpu_relax();
250 /*
251 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
252 * be set to match its requirements. So we must not load that until
253 * MNT_WRITE_HOLD is cleared.
254 */
255 smp_rmb();
260 if (__mnt_is_readonly(mnt)) { 256 if (__mnt_is_readonly(mnt)) {
257 dec_mnt_writers(mnt);
261 ret = -EROFS; 258 ret = -EROFS;
262 goto out; 259 goto out;
263 } 260 }
264 use_cpu_writer_for_mount(cpu_writer, mnt);
265 cpu_writer->count++;
266out: 261out:
267 spin_unlock(&cpu_writer->lock); 262 preempt_enable();
268 put_cpu_var(mnt_writers);
269 return ret; 263 return ret;
270} 264}
271EXPORT_SYMBOL_GPL(mnt_want_write); 265EXPORT_SYMBOL_GPL(mnt_want_write);
272 266
273static void lock_mnt_writers(void) 267/**
274{ 268 * mnt_clone_write - get write access to a mount
275 int cpu; 269 * @mnt: the mount on which to take a write
276 struct mnt_writer *cpu_writer; 270 *
277 271 * This is effectively like mnt_want_write, except
278 for_each_possible_cpu(cpu) { 272 * it must only be used to take an extra write reference
279 cpu_writer = &per_cpu(mnt_writers, cpu); 273 * on a mountpoint that we already know has a write reference
280 spin_lock(&cpu_writer->lock); 274 * on it. This allows some optimisation.
281 __clear_mnt_count(cpu_writer); 275 *
282 cpu_writer->mnt = NULL; 276 * After finished, mnt_drop_write must be called as usual to
283 } 277 * drop the reference.
278 */
279int mnt_clone_write(struct vfsmount *mnt)
280{
281 /* superblock may be r/o */
282 if (__mnt_is_readonly(mnt))
283 return -EROFS;
284 preempt_disable();
285 inc_mnt_writers(mnt);
286 preempt_enable();
287 return 0;
284} 288}
289EXPORT_SYMBOL_GPL(mnt_clone_write);
285 290
286/* 291/**
287 * These per-cpu write counts are not guaranteed to have 292 * mnt_want_write_file - get write access to a file's mount
288 * matched increments and decrements on any given cpu. 293 * @file: the file who's mount on which to take a write
289 * A file open()ed for write on one cpu and close()d on 294 *
290 * another cpu will imbalance this count. Make sure it 295 * This is like mnt_want_write, but it takes a file and can
291 * does not get too far out of whack. 296 * do some optimisations if the file is open for write already
292 */ 297 */
293static void handle_write_count_underflow(struct vfsmount *mnt) 298int mnt_want_write_file(struct file *file)
294{ 299{
295 if (atomic_read(&mnt->__mnt_writers) >= 300 if (!(file->f_mode & FMODE_WRITE))
296 MNT_WRITER_UNDERFLOW_LIMIT) 301 return mnt_want_write(file->f_path.mnt);
297 return; 302 else
298 /* 303 return mnt_clone_write(file->f_path.mnt);
299 * It isn't necessary to hold all of the locks
300 * at the same time, but doing it this way makes
301 * us share a lot more code.
302 */
303 lock_mnt_writers();
304 /*
305 * vfsmount_lock is for mnt_flags.
306 */
307 spin_lock(&vfsmount_lock);
308 /*
309 * If coalescing the per-cpu writer counts did not
310 * get us back to a positive writer count, we have
311 * a bug.
312 */
313 if ((atomic_read(&mnt->__mnt_writers) < 0) &&
314 !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
315 WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
316 "count: %d\n",
317 mnt, atomic_read(&mnt->__mnt_writers));
318 /* use the flag to keep the dmesg spam down */
319 mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
320 }
321 spin_unlock(&vfsmount_lock);
322 unlock_mnt_writers();
323} 304}
305EXPORT_SYMBOL_GPL(mnt_want_write_file);
324 306
325/** 307/**
326 * mnt_drop_write - give up write access to a mount 308 * mnt_drop_write - give up write access to a mount
@@ -332,37 +314,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt)
332 */ 314 */
333void mnt_drop_write(struct vfsmount *mnt) 315void mnt_drop_write(struct vfsmount *mnt)
334{ 316{
335 int must_check_underflow = 0; 317 preempt_disable();
336 struct mnt_writer *cpu_writer; 318 dec_mnt_writers(mnt);
337 319 preempt_enable();
338 cpu_writer = &get_cpu_var(mnt_writers);
339 spin_lock(&cpu_writer->lock);
340
341 use_cpu_writer_for_mount(cpu_writer, mnt);
342 if (cpu_writer->count > 0) {
343 cpu_writer->count--;
344 } else {
345 must_check_underflow = 1;
346 atomic_dec(&mnt->__mnt_writers);
347 }
348
349 spin_unlock(&cpu_writer->lock);
350 /*
351 * Logically, we could call this each time,
352 * but the __mnt_writers cacheline tends to
353 * be cold, and makes this expensive.
354 */
355 if (must_check_underflow)
356 handle_write_count_underflow(mnt);
357 /*
358 * This could be done right after the spinlock
359 * is taken because the spinlock keeps us on
360 * the cpu, and disables preemption. However,
361 * putting it here bounds the amount that
362 * __mnt_writers can underflow. Without it,
363 * we could theoretically wrap __mnt_writers.
364 */
365 put_cpu_var(mnt_writers);
366} 320}
367EXPORT_SYMBOL_GPL(mnt_drop_write); 321EXPORT_SYMBOL_GPL(mnt_drop_write);
368 322
@@ -370,24 +324,41 @@ static int mnt_make_readonly(struct vfsmount *mnt)
370{ 324{
371 int ret = 0; 325 int ret = 0;
372 326
373 lock_mnt_writers(); 327 spin_lock(&vfsmount_lock);
328 mnt->mnt_flags |= MNT_WRITE_HOLD;
374 /* 329 /*
375 * With all the locks held, this value is stable 330 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
331 * should be visible before we do.
376 */ 332 */
377 if (atomic_read(&mnt->__mnt_writers) > 0) { 333 smp_mb();
378 ret = -EBUSY; 334
379 goto out;
380 }
381 /* 335 /*
382 * nobody can do a successful mnt_want_write() with all 336 * With writers on hold, if this value is zero, then there are
383 * of the counts in MNT_DENIED_WRITE and the locks held. 337 * definitely no active writers (although held writers may subsequently
338 * increment the count, they'll have to wait, and decrement it after
339 * seeing MNT_READONLY).
340 *
341 * It is OK to have counter incremented on one CPU and decremented on
342 * another: the sum will add up correctly. The danger would be when we
343 * sum up each counter, if we read a counter before it is incremented,
344 * but then read another CPU's count which it has been subsequently
345 * decremented from -- we would see more decrements than we should.
346 * MNT_WRITE_HOLD protects against this scenario, because
347 * mnt_want_write first increments count, then smp_mb, then spins on
348 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
349 * we're counting up here.
384 */ 350 */
385 spin_lock(&vfsmount_lock); 351 if (count_mnt_writers(mnt) > 0)
386 if (!ret) 352 ret = -EBUSY;
353 else
387 mnt->mnt_flags |= MNT_READONLY; 354 mnt->mnt_flags |= MNT_READONLY;
355 /*
356 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
357 * that become unheld will see MNT_READONLY.
358 */
359 smp_wmb();
360 mnt->mnt_flags &= ~MNT_WRITE_HOLD;
388 spin_unlock(&vfsmount_lock); 361 spin_unlock(&vfsmount_lock);
389out:
390 unlock_mnt_writers();
391 return ret; 362 return ret;
392} 363}
393 364
@@ -410,6 +381,9 @@ void free_vfsmnt(struct vfsmount *mnt)
410{ 381{
411 kfree(mnt->mnt_devname); 382 kfree(mnt->mnt_devname);
412 mnt_free_id(mnt); 383 mnt_free_id(mnt);
384#ifdef CONFIG_SMP
385 free_percpu(mnt->mnt_writers);
386#endif
413 kmem_cache_free(mnt_cache, mnt); 387 kmem_cache_free(mnt_cache, mnt);
414} 388}
415 389
@@ -442,11 +416,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
442 * lookup_mnt increments the ref count before returning 416 * lookup_mnt increments the ref count before returning
443 * the vfsmount struct. 417 * the vfsmount struct.
444 */ 418 */
445struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) 419struct vfsmount *lookup_mnt(struct path *path)
446{ 420{
447 struct vfsmount *child_mnt; 421 struct vfsmount *child_mnt;
448 spin_lock(&vfsmount_lock); 422 spin_lock(&vfsmount_lock);
449 if ((child_mnt = __lookup_mnt(mnt, dentry, 1))) 423 if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
450 mntget(child_mnt); 424 mntget(child_mnt);
451 spin_unlock(&vfsmount_lock); 425 spin_unlock(&vfsmount_lock);
452 return child_mnt; 426 return child_mnt;
@@ -604,38 +578,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
604 578
605static inline void __mntput(struct vfsmount *mnt) 579static inline void __mntput(struct vfsmount *mnt)
606{ 580{
607 int cpu;
608 struct super_block *sb = mnt->mnt_sb; 581 struct super_block *sb = mnt->mnt_sb;
609 /* 582 /*
610 * We don't have to hold all of the locks at the
611 * same time here because we know that we're the
612 * last reference to mnt and that no new writers
613 * can come in.
614 */
615 for_each_possible_cpu(cpu) {
616 struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
617 spin_lock(&cpu_writer->lock);
618 if (cpu_writer->mnt != mnt) {
619 spin_unlock(&cpu_writer->lock);
620 continue;
621 }
622 atomic_add(cpu_writer->count, &mnt->__mnt_writers);
623 cpu_writer->count = 0;
624 /*
625 * Might as well do this so that no one
626 * ever sees the pointer and expects
627 * it to be valid.
628 */
629 cpu_writer->mnt = NULL;
630 spin_unlock(&cpu_writer->lock);
631 }
632 /*
633 * This probably indicates that somebody messed 583 * This probably indicates that somebody messed
634 * up a mnt_want/drop_write() pair. If this 584 * up a mnt_want/drop_write() pair. If this
635 * happens, the filesystem was probably unable 585 * happens, the filesystem was probably unable
636 * to make r/w->r/o transitions. 586 * to make r/w->r/o transitions.
637 */ 587 */
638 WARN_ON(atomic_read(&mnt->__mnt_writers)); 588 /*
589 * atomic_dec_and_lock() used to deal with ->mnt_count decrements
590 * provides barriers, so count_mnt_writers() below is safe. AV
591 */
592 WARN_ON(count_mnt_writers(mnt));
639 dput(mnt->mnt_root); 593 dput(mnt->mnt_root);
640 free_vfsmnt(mnt); 594 free_vfsmnt(mnt);
641 deactivate_super(sb); 595 deactivate_super(sb);
@@ -1106,11 +1060,8 @@ static int do_umount(struct vfsmount *mnt, int flags)
1106 * we just try to remount it readonly. 1060 * we just try to remount it readonly.
1107 */ 1061 */
1108 down_write(&sb->s_umount); 1062 down_write(&sb->s_umount);
1109 if (!(sb->s_flags & MS_RDONLY)) { 1063 if (!(sb->s_flags & MS_RDONLY))
1110 lock_kernel();
1111 retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); 1064 retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
1112 unlock_kernel();
1113 }
1114 up_write(&sb->s_umount); 1065 up_write(&sb->s_umount);
1115 return retval; 1066 return retval;
1116 } 1067 }
@@ -1253,11 +1204,11 @@ Enomem:
1253 return NULL; 1204 return NULL;
1254} 1205}
1255 1206
1256struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry) 1207struct vfsmount *collect_mounts(struct path *path)
1257{ 1208{
1258 struct vfsmount *tree; 1209 struct vfsmount *tree;
1259 down_write(&namespace_sem); 1210 down_write(&namespace_sem);
1260 tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE); 1211 tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE);
1261 up_write(&namespace_sem); 1212 up_write(&namespace_sem);
1262 return tree; 1213 return tree;
1263} 1214}
@@ -1430,7 +1381,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
1430 goto out_unlock; 1381 goto out_unlock;
1431 1382
1432 err = -ENOENT; 1383 err = -ENOENT;
1433 if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry)) 1384 if (!d_unlinked(path->dentry))
1434 err = attach_recursive_mnt(mnt, path, NULL); 1385 err = attach_recursive_mnt(mnt, path, NULL);
1435out_unlock: 1386out_unlock:
1436 mutex_unlock(&path->dentry->d_inode->i_mutex); 1387 mutex_unlock(&path->dentry->d_inode->i_mutex);
@@ -1601,7 +1552,7 @@ static int do_move_mount(struct path *path, char *old_name)
1601 1552
1602 down_write(&namespace_sem); 1553 down_write(&namespace_sem);
1603 while (d_mountpoint(path->dentry) && 1554 while (d_mountpoint(path->dentry) &&
1604 follow_down(&path->mnt, &path->dentry)) 1555 follow_down(path))
1605 ; 1556 ;
1606 err = -EINVAL; 1557 err = -EINVAL;
1607 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) 1558 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
@@ -1612,7 +1563,7 @@ static int do_move_mount(struct path *path, char *old_name)
1612 if (IS_DEADDIR(path->dentry->d_inode)) 1563 if (IS_DEADDIR(path->dentry->d_inode))
1613 goto out1; 1564 goto out1;
1614 1565
1615 if (!IS_ROOT(path->dentry) && d_unhashed(path->dentry)) 1566 if (d_unlinked(path->dentry))
1616 goto out1; 1567 goto out1;
1617 1568
1618 err = -EINVAL; 1569 err = -EINVAL;
@@ -1676,7 +1627,9 @@ static int do_new_mount(struct path *path, char *type, int flags,
1676 if (!capable(CAP_SYS_ADMIN)) 1627 if (!capable(CAP_SYS_ADMIN))
1677 return -EPERM; 1628 return -EPERM;
1678 1629
1630 lock_kernel();
1679 mnt = do_kern_mount(type, flags, name, data); 1631 mnt = do_kern_mount(type, flags, name, data);
1632 unlock_kernel();
1680 if (IS_ERR(mnt)) 1633 if (IS_ERR(mnt))
1681 return PTR_ERR(mnt); 1634 return PTR_ERR(mnt);
1682 1635
@@ -1695,10 +1648,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1695 down_write(&namespace_sem); 1648 down_write(&namespace_sem);
1696 /* Something was mounted here while we slept */ 1649 /* Something was mounted here while we slept */
1697 while (d_mountpoint(path->dentry) && 1650 while (d_mountpoint(path->dentry) &&
1698 follow_down(&path->mnt, &path->dentry)) 1651 follow_down(path))
1699 ; 1652 ;
1700 err = -EINVAL; 1653 err = -EINVAL;
1701 if (!check_mnt(path->mnt)) 1654 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
1702 goto unlock; 1655 goto unlock;
1703 1656
1704 /* Refuse the same filesystem on the same mount point */ 1657 /* Refuse the same filesystem on the same mount point */
@@ -2092,10 +2045,8 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2092 if (retval < 0) 2045 if (retval < 0)
2093 goto out3; 2046 goto out3;
2094 2047
2095 lock_kernel();
2096 retval = do_mount((char *)dev_page, dir_page, (char *)type_page, 2048 retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
2097 flags, (void *)data_page); 2049 flags, (void *)data_page);
2098 unlock_kernel();
2099 free_page(data_page); 2050 free_page(data_page);
2100 2051
2101out3: 2052out3:
@@ -2175,9 +2126,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2175 error = -ENOENT; 2126 error = -ENOENT;
2176 if (IS_DEADDIR(new.dentry->d_inode)) 2127 if (IS_DEADDIR(new.dentry->d_inode))
2177 goto out2; 2128 goto out2;
2178 if (d_unhashed(new.dentry) && !IS_ROOT(new.dentry)) 2129 if (d_unlinked(new.dentry))
2179 goto out2; 2130 goto out2;
2180 if (d_unhashed(old.dentry) && !IS_ROOT(old.dentry)) 2131 if (d_unlinked(old.dentry))
2181 goto out2; 2132 goto out2;
2182 error = -EBUSY; 2133 error = -EBUSY;
2183 if (new.mnt == root.mnt || 2134 if (new.mnt == root.mnt ||
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d642f0e5b365..b99ce205b1bd 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -736,6 +736,8 @@ static void ncp_put_super(struct super_block *sb)
736{ 736{
737 struct ncp_server *server = NCP_SBP(sb); 737 struct ncp_server *server = NCP_SBP(sb);
738 738
739 lock_kernel();
740
739 ncp_lock_server(server); 741 ncp_lock_server(server);
740 ncp_disconnect(server); 742 ncp_disconnect(server);
741 ncp_unlock_server(server); 743 ncp_unlock_server(server);
@@ -769,6 +771,8 @@ static void ncp_put_super(struct super_block *sb)
769 vfree(server->packet); 771 vfree(server->packet);
770 sb->s_fs_info = NULL; 772 sb->s_fs_info = NULL;
771 kfree(server); 773 kfree(server);
774
775 unlock_kernel();
772} 776}
773 777
774static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf) 778static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 64a288ee046d..f01caec84463 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -154,7 +154,7 @@ out_err:
154 goto out; 154 goto out;
155out_follow: 155out_follow:
156 while (d_mountpoint(nd->path.dentry) && 156 while (d_mountpoint(nd->path.dentry) &&
157 follow_down(&nd->path.mnt, &nd->path.dentry)) 157 follow_down(&nd->path))
158 ; 158 ;
159 err = 0; 159 err = 0;
160 goto out; 160 goto out;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d2d67781c579..26127b69a275 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1813,6 +1813,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
1813 if (data == NULL) 1813 if (data == NULL)
1814 return -ENOMEM; 1814 return -ENOMEM;
1815 1815
1816 lock_kernel();
1816 /* fill out struct with values from existing mount */ 1817 /* fill out struct with values from existing mount */
1817 data->flags = nfss->flags; 1818 data->flags = nfss->flags;
1818 data->rsize = nfss->rsize; 1819 data->rsize = nfss->rsize;
@@ -1837,6 +1838,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
1837 error = nfs_compare_remount_data(nfss, data); 1838 error = nfs_compare_remount_data(nfss, data);
1838out: 1839out:
1839 kfree(data); 1840 kfree(data);
1841 unlock_kernel();
1840 return error; 1842 return error;
1841} 1843}
1842 1844
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 5839b229cd0e..8b1f8efb4690 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -847,9 +847,8 @@ exp_get_fsid_key(svc_client *clp, int fsid)
847 return exp_find_key(clp, FSID_NUM, fsidv, NULL); 847 return exp_find_key(clp, FSID_NUM, fsidv, NULL);
848} 848}
849 849
850static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt, 850static svc_export *exp_get_by_name(svc_client *clp, const struct path *path,
851 struct dentry *dentry, 851 struct cache_req *reqp)
852 struct cache_req *reqp)
853{ 852{
854 struct svc_export *exp, key; 853 struct svc_export *exp, key;
855 int err; 854 int err;
@@ -858,8 +857,7 @@ static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt,
858 return ERR_PTR(-ENOENT); 857 return ERR_PTR(-ENOENT);
859 858
860 key.ex_client = clp; 859 key.ex_client = clp;
861 key.ex_path.mnt = mnt; 860 key.ex_path = *path;
862 key.ex_path.dentry = dentry;
863 861
864 exp = svc_export_lookup(&key); 862 exp = svc_export_lookup(&key);
865 if (exp == NULL) 863 if (exp == NULL)
@@ -873,24 +871,19 @@ static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt,
873/* 871/*
874 * Find the export entry for a given dentry. 872 * Find the export entry for a given dentry.
875 */ 873 */
876static struct svc_export *exp_parent(svc_client *clp, struct vfsmount *mnt, 874static struct svc_export *exp_parent(svc_client *clp, struct path *path)
877 struct dentry *dentry,
878 struct cache_req *reqp)
879{ 875{
880 svc_export *exp; 876 struct dentry *saved = dget(path->dentry);
881 877 svc_export *exp = exp_get_by_name(clp, path, NULL);
882 dget(dentry); 878
883 exp = exp_get_by_name(clp, mnt, dentry, reqp); 879 while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) {
884 880 struct dentry *parent = dget_parent(path->dentry);
885 while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) { 881 dput(path->dentry);
886 struct dentry *parent; 882 path->dentry = parent;
887 883 exp = exp_get_by_name(clp, path, NULL);
888 parent = dget_parent(dentry);
889 dput(dentry);
890 dentry = parent;
891 exp = exp_get_by_name(clp, mnt, dentry, reqp);
892 } 884 }
893 dput(dentry); 885 dput(path->dentry);
886 path->dentry = saved;
894 return exp; 887 return exp;
895} 888}
896 889
@@ -1018,7 +1011,7 @@ exp_export(struct nfsctl_export *nxp)
1018 goto out_put_clp; 1011 goto out_put_clp;
1019 err = -EINVAL; 1012 err = -EINVAL;
1020 1013
1021 exp = exp_get_by_name(clp, path.mnt, path.dentry, NULL); 1014 exp = exp_get_by_name(clp, &path, NULL);
1022 1015
1023 memset(&new, 0, sizeof(new)); 1016 memset(&new, 0, sizeof(new));
1024 1017
@@ -1135,7 +1128,7 @@ exp_unexport(struct nfsctl_export *nxp)
1135 goto out_domain; 1128 goto out_domain;
1136 1129
1137 err = -EINVAL; 1130 err = -EINVAL;
1138 exp = exp_get_by_name(dom, path.mnt, path.dentry, NULL); 1131 exp = exp_get_by_name(dom, &path, NULL);
1139 path_put(&path); 1132 path_put(&path);
1140 if (IS_ERR(exp)) 1133 if (IS_ERR(exp))
1141 goto out_domain; 1134 goto out_domain;
@@ -1177,7 +1170,7 @@ exp_rootfh(svc_client *clp, char *name, struct knfsd_fh *f, int maxsize)
1177 dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", 1170 dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n",
1178 name, path.dentry, clp->name, 1171 name, path.dentry, clp->name,
1179 inode->i_sb->s_id, inode->i_ino); 1172 inode->i_sb->s_id, inode->i_ino);
1180 exp = exp_parent(clp, path.mnt, path.dentry, NULL); 1173 exp = exp_parent(clp, &path);
1181 if (IS_ERR(exp)) { 1174 if (IS_ERR(exp)) {
1182 err = PTR_ERR(exp); 1175 err = PTR_ERR(exp);
1183 goto out; 1176 goto out;
@@ -1207,7 +1200,7 @@ static struct svc_export *exp_find(struct auth_domain *clp, int fsid_type,
1207 if (IS_ERR(ek)) 1200 if (IS_ERR(ek))
1208 return ERR_CAST(ek); 1201 return ERR_CAST(ek);
1209 1202
1210 exp = exp_get_by_name(clp, ek->ek_path.mnt, ek->ek_path.dentry, reqp); 1203 exp = exp_get_by_name(clp, &ek->ek_path, reqp);
1211 cache_put(&ek->h, &svc_expkey_cache); 1204 cache_put(&ek->h, &svc_expkey_cache);
1212 1205
1213 if (IS_ERR(exp)) 1206 if (IS_ERR(exp))
@@ -1247,8 +1240,7 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
1247 * use exp_get_by_name() or exp_find(). 1240 * use exp_get_by_name() or exp_find().
1248 */ 1241 */
1249struct svc_export * 1242struct svc_export *
1250rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt, 1243rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path)
1251 struct dentry *dentry)
1252{ 1244{
1253 struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); 1245 struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT);
1254 1246
@@ -1256,8 +1248,7 @@ rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt,
1256 goto gss; 1248 goto gss;
1257 1249
1258 /* First try the auth_unix client: */ 1250 /* First try the auth_unix client: */
1259 exp = exp_get_by_name(rqstp->rq_client, mnt, dentry, 1251 exp = exp_get_by_name(rqstp->rq_client, path, &rqstp->rq_chandle);
1260 &rqstp->rq_chandle);
1261 if (PTR_ERR(exp) == -ENOENT) 1252 if (PTR_ERR(exp) == -ENOENT)
1262 goto gss; 1253 goto gss;
1263 if (IS_ERR(exp)) 1254 if (IS_ERR(exp))
@@ -1269,8 +1260,7 @@ gss:
1269 /* Otherwise, try falling back on gss client */ 1260 /* Otherwise, try falling back on gss client */
1270 if (rqstp->rq_gssclient == NULL) 1261 if (rqstp->rq_gssclient == NULL)
1271 return exp; 1262 return exp;
1272 gssexp = exp_get_by_name(rqstp->rq_gssclient, mnt, dentry, 1263 gssexp = exp_get_by_name(rqstp->rq_gssclient, path, &rqstp->rq_chandle);
1273 &rqstp->rq_chandle);
1274 if (PTR_ERR(gssexp) == -ENOENT) 1264 if (PTR_ERR(gssexp) == -ENOENT)
1275 return exp; 1265 return exp;
1276 if (!IS_ERR(exp)) 1266 if (!IS_ERR(exp))
@@ -1309,23 +1299,19 @@ gss:
1309} 1299}
1310 1300
1311struct svc_export * 1301struct svc_export *
1312rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt, 1302rqst_exp_parent(struct svc_rqst *rqstp, struct path *path)
1313 struct dentry *dentry)
1314{ 1303{
1315 struct svc_export *exp; 1304 struct dentry *saved = dget(path->dentry);
1316 1305 struct svc_export *exp = rqst_exp_get_by_name(rqstp, path);
1317 dget(dentry); 1306
1318 exp = rqst_exp_get_by_name(rqstp, mnt, dentry); 1307 while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) {
1319 1308 struct dentry *parent = dget_parent(path->dentry);
1320 while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) { 1309 dput(path->dentry);
1321 struct dentry *parent; 1310 path->dentry = parent;
1322 1311 exp = rqst_exp_get_by_name(rqstp, path);
1323 parent = dget_parent(dentry);
1324 dput(dentry);
1325 dentry = parent;
1326 exp = rqst_exp_get_by_name(rqstp, mnt, dentry);
1327 } 1312 }
1328 dput(dentry); 1313 dput(path->dentry);
1314 path->dentry = saved;
1329 return exp; 1315 return exp;
1330} 1316}
1331 1317
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index b660435978d2..99f835753596 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -55,6 +55,7 @@
55#include <linux/security.h> 55#include <linux/security.h>
56#endif /* CONFIG_NFSD_V4 */ 56#endif /* CONFIG_NFSD_V4 */
57#include <linux/jhash.h> 57#include <linux/jhash.h>
58#include <linux/ima.h>
58 59
59#include <asm/uaccess.h> 60#include <asm/uaccess.h>
60 61
@@ -100,36 +101,35 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
100{ 101{
101 struct svc_export *exp = *expp, *exp2 = NULL; 102 struct svc_export *exp = *expp, *exp2 = NULL;
102 struct dentry *dentry = *dpp; 103 struct dentry *dentry = *dpp;
103 struct vfsmount *mnt = mntget(exp->ex_path.mnt); 104 struct path path = {.mnt = mntget(exp->ex_path.mnt),
104 struct dentry *mounts = dget(dentry); 105 .dentry = dget(dentry)};
105 int err = 0; 106 int err = 0;
106 107
107 while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts)); 108 while (d_mountpoint(path.dentry) && follow_down(&path))
109 ;
108 110
109 exp2 = rqst_exp_get_by_name(rqstp, mnt, mounts); 111 exp2 = rqst_exp_get_by_name(rqstp, &path);
110 if (IS_ERR(exp2)) { 112 if (IS_ERR(exp2)) {
111 if (PTR_ERR(exp2) != -ENOENT) 113 if (PTR_ERR(exp2) != -ENOENT)
112 err = PTR_ERR(exp2); 114 err = PTR_ERR(exp2);
113 dput(mounts); 115 path_put(&path);
114 mntput(mnt);
115 goto out; 116 goto out;
116 } 117 }
117 if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { 118 if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
118 /* successfully crossed mount point */ 119 /* successfully crossed mount point */
119 /* 120 /*
120 * This is subtle: dentry is *not* under mnt at this point. 121 * This is subtle: path.dentry is *not* on path.mnt
121 * The only reason we are safe is that original mnt is pinned 122 * at this point. The only reason we are safe is that
122 * down by exp, so we should dput before putting exp. 123 * original mnt is pinned down by exp, so we should
124 * put path *before* putting exp
123 */ 125 */
124 dput(dentry); 126 *dpp = path.dentry;
125 *dpp = mounts; 127 path.dentry = dentry;
126 exp_put(exp);
127 *expp = exp2; 128 *expp = exp2;
128 } else { 129 exp2 = exp;
129 exp_put(exp2);
130 dput(mounts);
131 } 130 }
132 mntput(mnt); 131 path_put(&path);
132 exp_put(exp2);
133out: 133out:
134 return err; 134 return err;
135} 135}
@@ -168,28 +168,29 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
168 /* checking mountpoint crossing is very different when stepping up */ 168 /* checking mountpoint crossing is very different when stepping up */
169 struct svc_export *exp2 = NULL; 169 struct svc_export *exp2 = NULL;
170 struct dentry *dp; 170 struct dentry *dp;
171 struct vfsmount *mnt = mntget(exp->ex_path.mnt); 171 struct path path = {.mnt = mntget(exp->ex_path.mnt),
172 dentry = dget(dparent); 172 .dentry = dget(dparent)};
173 while(dentry == mnt->mnt_root && follow_up(&mnt, &dentry)) 173
174 while (path.dentry == path.mnt->mnt_root &&
175 follow_up(&path))
174 ; 176 ;
175 dp = dget_parent(dentry); 177 dp = dget_parent(path.dentry);
176 dput(dentry); 178 dput(path.dentry);
177 dentry = dp; 179 path.dentry = dp;
178 180
179 exp2 = rqst_exp_parent(rqstp, mnt, dentry); 181 exp2 = rqst_exp_parent(rqstp, &path);
180 if (PTR_ERR(exp2) == -ENOENT) { 182 if (PTR_ERR(exp2) == -ENOENT) {
181 dput(dentry);
182 dentry = dget(dparent); 183 dentry = dget(dparent);
183 } else if (IS_ERR(exp2)) { 184 } else if (IS_ERR(exp2)) {
184 host_err = PTR_ERR(exp2); 185 host_err = PTR_ERR(exp2);
185 dput(dentry); 186 path_put(&path);
186 mntput(mnt);
187 goto out_nfserr; 187 goto out_nfserr;
188 } else { 188 } else {
189 dentry = dget(path.dentry);
189 exp_put(exp); 190 exp_put(exp);
190 exp = exp2; 191 exp = exp2;
191 } 192 }
192 mntput(mnt); 193 path_put(&path);
193 } 194 }
194 } else { 195 } else {
195 fh_lock(fhp); 196 fh_lock(fhp);
@@ -735,6 +736,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
735 flags, cred); 736 flags, cred);
736 if (IS_ERR(*filp)) 737 if (IS_ERR(*filp))
737 host_err = PTR_ERR(*filp); 738 host_err = PTR_ERR(*filp);
739 else
740 ima_counts_get(*filp);
738out_nfserr: 741out_nfserr:
739 err = nfserrno(host_err); 742 err = nfserrno(host_err);
740out: 743out:
@@ -2024,6 +2027,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
2024 struct dentry *dentry, int acc) 2027 struct dentry *dentry, int acc)
2025{ 2028{
2026 struct inode *inode = dentry->d_inode; 2029 struct inode *inode = dentry->d_inode;
2030 struct path path;
2027 int err; 2031 int err;
2028 2032
2029 if (acc == NFSD_MAY_NOP) 2033 if (acc == NFSD_MAY_NOP)
@@ -2096,7 +2100,17 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
2096 if (err == -EACCES && S_ISREG(inode->i_mode) && 2100 if (err == -EACCES && S_ISREG(inode->i_mode) &&
2097 acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) 2101 acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE))
2098 err = inode_permission(inode, MAY_EXEC); 2102 err = inode_permission(inode, MAY_EXEC);
2103 if (err)
2104 goto nfsd_out;
2099 2105
2106 /* Do integrity (permission) checking now, but defer incrementing
2107 * IMA counts to the actual file open.
2108 */
2109 path.mnt = exp->ex_path.mnt;
2110 path.dentry = dentry;
2111 err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC),
2112 IMA_COUNT_LEAVE);
2113nfsd_out:
2100 return err? nfserrno(err) : 0; 2114 return err? nfserrno(err) : 0;
2101} 2115}
2102 2116
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 300f1cdfa862..cadd36b14d07 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -864,11 +864,11 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
864 case NILFS_CHECKPOINT: 864 case NILFS_CHECKPOINT:
865 /* 865 /*
866 * Check for protecting existing snapshot mounts: 866 * Check for protecting existing snapshot mounts:
867 * bd_mount_sem is used to make this operation atomic and 867 * ns_mount_mutex is used to make this operation atomic and
868 * exclusive with a new mount job. Though it doesn't cover 868 * exclusive with a new mount job. Though it doesn't cover
869 * umount, it's enough for the purpose. 869 * umount, it's enough for the purpose.
870 */ 870 */
871 down(&nilfs->ns_bdev->bd_mount_sem); 871 mutex_lock(&nilfs->ns_mount_mutex);
872 if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) { 872 if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
873 /* Current implementation does not have to protect 873 /* Current implementation does not have to protect
874 plain read-only mounts since they are exclusive 874 plain read-only mounts since they are exclusive
@@ -877,7 +877,7 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
877 ret = -EBUSY; 877 ret = -EBUSY;
878 } else 878 } else
879 ret = nilfs_cpfile_clear_snapshot(cpfile, cno); 879 ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
880 up(&nilfs->ns_bdev->bd_mount_sem); 880 mutex_unlock(&nilfs->ns_mount_mutex);
881 return ret; 881 return ret;
882 case NILFS_SNAPSHOT: 882 case NILFS_SNAPSHOT:
883 return nilfs_cpfile_set_snapshot(cpfile, cno); 883 return nilfs_cpfile_set_snapshot(cpfile, cno);
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index adccd4fc654e..0776ccc2504a 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -60,6 +60,7 @@ struct nilfs_sb_info {
60 struct super_block *s_super; /* reverse pointer to super_block */ 60 struct super_block *s_super; /* reverse pointer to super_block */
61 struct the_nilfs *s_nilfs; 61 struct the_nilfs *s_nilfs;
62 struct list_head s_list; /* list head for nilfs->ns_supers */ 62 struct list_head s_list; /* list head for nilfs->ns_supers */
63 atomic_t s_count; /* reference count */
63 64
64 /* Segment constructor */ 65 /* Segment constructor */
65 struct list_head s_dirty_files; /* dirty files list */ 66 struct list_head s_dirty_files; /* dirty files list */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 6989b03e97ab..1777a3467bd2 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -65,9 +65,8 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
65 "(NILFS)"); 65 "(NILFS)");
66MODULE_LICENSE("GPL"); 66MODULE_LICENSE("GPL");
67 67
68static void nilfs_write_super(struct super_block *sb);
68static int nilfs_remount(struct super_block *sb, int *flags, char *data); 69static int nilfs_remount(struct super_block *sb, int *flags, char *data);
69static int test_exclusive_mount(struct file_system_type *fs_type,
70 struct block_device *bdev, int flags);
71 70
72/** 71/**
73 * nilfs_error() - report failure condition on a filesystem 72 * nilfs_error() - report failure condition on a filesystem
@@ -315,6 +314,11 @@ static void nilfs_put_super(struct super_block *sb)
315 struct nilfs_sb_info *sbi = NILFS_SB(sb); 314 struct nilfs_sb_info *sbi = NILFS_SB(sb);
316 struct the_nilfs *nilfs = sbi->s_nilfs; 315 struct the_nilfs *nilfs = sbi->s_nilfs;
317 316
317 lock_kernel();
318
319 if (sb->s_dirt)
320 nilfs_write_super(sb);
321
318 nilfs_detach_segment_constructor(sbi); 322 nilfs_detach_segment_constructor(sbi);
319 323
320 if (!(sb->s_flags & MS_RDONLY)) { 324 if (!(sb->s_flags & MS_RDONLY)) {
@@ -323,12 +327,18 @@ static void nilfs_put_super(struct super_block *sb)
323 nilfs_commit_super(sbi, 1); 327 nilfs_commit_super(sbi, 1);
324 up_write(&nilfs->ns_sem); 328 up_write(&nilfs->ns_sem);
325 } 329 }
330 down_write(&nilfs->ns_super_sem);
331 if (nilfs->ns_current == sbi)
332 nilfs->ns_current = NULL;
333 up_write(&nilfs->ns_super_sem);
326 334
327 nilfs_detach_checkpoint(sbi); 335 nilfs_detach_checkpoint(sbi);
328 put_nilfs(sbi->s_nilfs); 336 put_nilfs(sbi->s_nilfs);
329 sbi->s_super = NULL; 337 sbi->s_super = NULL;
330 sb->s_fs_info = NULL; 338 sb->s_fs_info = NULL;
331 kfree(sbi); 339 nilfs_put_sbinfo(sbi);
340
341 unlock_kernel();
332} 342}
333 343
334/** 344/**
@@ -383,6 +393,8 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
383{ 393{
384 int err = 0; 394 int err = 0;
385 395
396 nilfs_write_super(sb);
397
386 /* This function is called when super block should be written back */ 398 /* This function is called when super block should be written back */
387 if (wait) 399 if (wait)
388 err = nilfs_construct_segment(sb); 400 err = nilfs_construct_segment(sb);
@@ -396,9 +408,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
396 struct buffer_head *bh_cp; 408 struct buffer_head *bh_cp;
397 int err; 409 int err;
398 410
399 down_write(&nilfs->ns_sem); 411 down_write(&nilfs->ns_super_sem);
400 list_add(&sbi->s_list, &nilfs->ns_supers); 412 list_add(&sbi->s_list, &nilfs->ns_supers);
401 up_write(&nilfs->ns_sem); 413 up_write(&nilfs->ns_super_sem);
402 414
403 sbi->s_ifile = nilfs_mdt_new( 415 sbi->s_ifile = nilfs_mdt_new(
404 nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP); 416 nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
@@ -436,9 +448,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
436 nilfs_mdt_destroy(sbi->s_ifile); 448 nilfs_mdt_destroy(sbi->s_ifile);
437 sbi->s_ifile = NULL; 449 sbi->s_ifile = NULL;
438 450
439 down_write(&nilfs->ns_sem); 451 down_write(&nilfs->ns_super_sem);
440 list_del_init(&sbi->s_list); 452 list_del_init(&sbi->s_list);
441 up_write(&nilfs->ns_sem); 453 up_write(&nilfs->ns_super_sem);
442 454
443 return err; 455 return err;
444} 456}
@@ -450,9 +462,9 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
450 nilfs_mdt_clear(sbi->s_ifile); 462 nilfs_mdt_clear(sbi->s_ifile);
451 nilfs_mdt_destroy(sbi->s_ifile); 463 nilfs_mdt_destroy(sbi->s_ifile);
452 sbi->s_ifile = NULL; 464 sbi->s_ifile = NULL;
453 down_write(&nilfs->ns_sem); 465 down_write(&nilfs->ns_super_sem);
454 list_del_init(&sbi->s_list); 466 list_del_init(&sbi->s_list);
455 up_write(&nilfs->ns_sem); 467 up_write(&nilfs->ns_super_sem);
456} 468}
457 469
458static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi) 470static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi)
@@ -752,7 +764,7 @@ int nilfs_store_magic_and_option(struct super_block *sb,
752 * @silent: silent mode flag 764 * @silent: silent mode flag
753 * @nilfs: the_nilfs struct 765 * @nilfs: the_nilfs struct
754 * 766 *
755 * This function is called exclusively by bd_mount_mutex. 767 * This function is called exclusively by nilfs->ns_mount_mutex.
756 * So, the recovery process is protected from other simultaneous mounts. 768 * So, the recovery process is protected from other simultaneous mounts.
757 */ 769 */
758static int 770static int
@@ -773,6 +785,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
773 get_nilfs(nilfs); 785 get_nilfs(nilfs);
774 sbi->s_nilfs = nilfs; 786 sbi->s_nilfs = nilfs;
775 sbi->s_super = sb; 787 sbi->s_super = sb;
788 atomic_set(&sbi->s_count, 1);
776 789
777 err = init_nilfs(nilfs, sbi, (char *)data); 790 err = init_nilfs(nilfs, sbi, (char *)data);
778 if (err) 791 if (err)
@@ -870,6 +883,11 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
870 goto failed_root; 883 goto failed_root;
871 } 884 }
872 885
886 down_write(&nilfs->ns_super_sem);
887 if (!nilfs_test_opt(sbi, SNAPSHOT))
888 nilfs->ns_current = sbi;
889 up_write(&nilfs->ns_super_sem);
890
873 return 0; 891 return 0;
874 892
875 failed_root: 893 failed_root:
@@ -885,7 +903,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
885 failed_sbi: 903 failed_sbi:
886 put_nilfs(nilfs); 904 put_nilfs(nilfs);
887 sb->s_fs_info = NULL; 905 sb->s_fs_info = NULL;
888 kfree(sbi); 906 nilfs_put_sbinfo(sbi);
889 return err; 907 return err;
890} 908}
891 909
@@ -898,6 +916,9 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
898 struct nilfs_mount_options old_opts; 916 struct nilfs_mount_options old_opts;
899 int err; 917 int err;
900 918
919 lock_kernel();
920
921 down_write(&nilfs->ns_super_sem);
901 old_sb_flags = sb->s_flags; 922 old_sb_flags = sb->s_flags;
902 old_opts.mount_opt = sbi->s_mount_opt; 923 old_opts.mount_opt = sbi->s_mount_opt;
903 old_opts.snapshot_cno = sbi->s_snapshot_cno; 924 old_opts.snapshot_cno = sbi->s_snapshot_cno;
@@ -945,14 +966,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
945 * store the current valid flag. (It may have been changed 966 * store the current valid flag. (It may have been changed
946 * by fsck since we originally mounted the partition.) 967 * by fsck since we originally mounted the partition.)
947 */ 968 */
948 down(&sb->s_bdev->bd_mount_sem); 969 if (nilfs->ns_current && nilfs->ns_current != sbi) {
949 /* Check existing RW-mount */
950 if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) {
951 printk(KERN_WARNING "NILFS (device %s): couldn't " 970 printk(KERN_WARNING "NILFS (device %s): couldn't "
952 "remount because a RW-mount exists.\n", 971 "remount because an RW-mount exists.\n",
953 sb->s_id); 972 sb->s_id);
954 err = -EBUSY; 973 err = -EBUSY;
955 goto rw_remount_failed; 974 goto restore_opts;
956 } 975 }
957 if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) { 976 if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
958 printk(KERN_WARNING "NILFS (device %s): couldn't " 977 printk(KERN_WARNING "NILFS (device %s): couldn't "
@@ -960,7 +979,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
960 "the latest one.\n", 979 "the latest one.\n",
961 sb->s_id); 980 sb->s_id);
962 err = -EINVAL; 981 err = -EINVAL;
963 goto rw_remount_failed; 982 goto restore_opts;
964 } 983 }
965 sb->s_flags &= ~MS_RDONLY; 984 sb->s_flags &= ~MS_RDONLY;
966 nilfs_clear_opt(sbi, SNAPSHOT); 985 nilfs_clear_opt(sbi, SNAPSHOT);
@@ -968,28 +987,31 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
968 987
969 err = nilfs_attach_segment_constructor(sbi); 988 err = nilfs_attach_segment_constructor(sbi);
970 if (err) 989 if (err)
971 goto rw_remount_failed; 990 goto restore_opts;
972 991
973 down_write(&nilfs->ns_sem); 992 down_write(&nilfs->ns_sem);
974 nilfs_setup_super(sbi); 993 nilfs_setup_super(sbi);
975 up_write(&nilfs->ns_sem); 994 up_write(&nilfs->ns_sem);
976 995
977 up(&sb->s_bdev->bd_mount_sem); 996 nilfs->ns_current = sbi;
978 } 997 }
979 out: 998 out:
999 up_write(&nilfs->ns_super_sem);
1000 unlock_kernel();
980 return 0; 1001 return 0;
981 1002
982 rw_remount_failed:
983 up(&sb->s_bdev->bd_mount_sem);
984 restore_opts: 1003 restore_opts:
985 sb->s_flags = old_sb_flags; 1004 sb->s_flags = old_sb_flags;
986 sbi->s_mount_opt = old_opts.mount_opt; 1005 sbi->s_mount_opt = old_opts.mount_opt;
987 sbi->s_snapshot_cno = old_opts.snapshot_cno; 1006 sbi->s_snapshot_cno = old_opts.snapshot_cno;
1007 up_write(&nilfs->ns_super_sem);
1008 unlock_kernel();
988 return err; 1009 return err;
989} 1010}
990 1011
991struct nilfs_super_data { 1012struct nilfs_super_data {
992 struct block_device *bdev; 1013 struct block_device *bdev;
1014 struct nilfs_sb_info *sbi;
993 __u64 cno; 1015 __u64 cno;
994 int flags; 1016 int flags;
995}; 1017};
@@ -1048,33 +1070,7 @@ static int nilfs_test_bdev_super(struct super_block *s, void *data)
1048{ 1070{
1049 struct nilfs_super_data *sd = data; 1071 struct nilfs_super_data *sd = data;
1050 1072
1051 return s->s_bdev == sd->bdev; 1073 return sd->sbi && s->s_fs_info == (void *)sd->sbi;
1052}
1053
1054static int nilfs_test_bdev_super2(struct super_block *s, void *data)
1055{
1056 struct nilfs_super_data *sd = data;
1057 int ret;
1058
1059 if (s->s_bdev != sd->bdev)
1060 return 0;
1061
1062 if (!((s->s_flags | sd->flags) & MS_RDONLY))
1063 return 1; /* Reuse an old R/W-mode super_block */
1064
1065 if (s->s_flags & sd->flags & MS_RDONLY) {
1066 if (down_read_trylock(&s->s_umount)) {
1067 ret = s->s_root &&
1068 (sd->cno == NILFS_SB(s)->s_snapshot_cno);
1069 up_read(&s->s_umount);
1070 /*
1071 * This path is locked with sb_lock by sget().
1072 * So, drop_super() causes deadlock.
1073 */
1074 return ret;
1075 }
1076 }
1077 return 0;
1078} 1074}
1079 1075
1080static int 1076static int
@@ -1082,8 +1078,8 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1082 const char *dev_name, void *data, struct vfsmount *mnt) 1078 const char *dev_name, void *data, struct vfsmount *mnt)
1083{ 1079{
1084 struct nilfs_super_data sd; 1080 struct nilfs_super_data sd;
1085 struct super_block *s, *s2; 1081 struct super_block *s;
1086 struct the_nilfs *nilfs = NULL; 1082 struct the_nilfs *nilfs;
1087 int err, need_to_close = 1; 1083 int err, need_to_close = 1;
1088 1084
1089 sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type); 1085 sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type);
@@ -1095,7 +1091,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1095 * much more information than normal filesystems to identify mount 1091 * much more information than normal filesystems to identify mount
1096 * instance. For snapshot mounts, not only a mount type (ro-mount 1092 * instance. For snapshot mounts, not only a mount type (ro-mount
1097 * or rw-mount) but also a checkpoint number is required. 1093 * or rw-mount) but also a checkpoint number is required.
1098 * The results are passed in sget() using nilfs_super_data.
1099 */ 1094 */
1100 sd.cno = 0; 1095 sd.cno = 0;
1101 sd.flags = flags; 1096 sd.flags = flags;
@@ -1104,64 +1099,59 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1104 goto failed; 1099 goto failed;
1105 } 1100 }
1106 1101
1107 /* 1102 nilfs = find_or_create_nilfs(sd.bdev);
1108 * once the super is inserted into the list by sget, s_umount 1103 if (!nilfs) {
1109 * will protect the lockfs code from trying to start a snapshot 1104 err = -ENOMEM;
1110 * while we are mounting 1105 goto failed;
1111 */
1112 down(&sd.bdev->bd_mount_sem);
1113 if (!sd.cno &&
1114 (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) {
1115 err = (err < 0) ? : -EBUSY;
1116 goto failed_unlock;
1117 } 1106 }
1118 1107
1119 /* 1108 mutex_lock(&nilfs->ns_mount_mutex);
1120 * Phase-1: search any existent instance and get the_nilfs
1121 */
1122 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
1123 if (IS_ERR(s))
1124 goto error_s;
1125
1126 if (!s->s_root) {
1127 err = -ENOMEM;
1128 nilfs = alloc_nilfs(sd.bdev);
1129 if (!nilfs)
1130 goto cancel_new;
1131 } else {
1132 struct nilfs_sb_info *sbi = NILFS_SB(s);
1133 1109
1110 if (!sd.cno) {
1134 /* 1111 /*
1135 * s_umount protects super_block from unmount process; 1112 * Check if an exclusive mount exists or not.
1136 * It covers pointers of nilfs_sb_info and the_nilfs. 1113 * Snapshot mounts coexist with a current mount
1114 * (i.e. rw-mount or ro-mount), whereas rw-mount and
1115 * ro-mount are mutually exclusive.
1137 */ 1116 */
1138 nilfs = sbi->s_nilfs; 1117 down_read(&nilfs->ns_super_sem);
1139 get_nilfs(nilfs); 1118 if (nilfs->ns_current &&
1140 up_write(&s->s_umount); 1119 ((nilfs->ns_current->s_super->s_flags ^ flags)
1120 & MS_RDONLY)) {
1121 up_read(&nilfs->ns_super_sem);
1122 err = -EBUSY;
1123 goto failed_unlock;
1124 }
1125 up_read(&nilfs->ns_super_sem);
1126 }
1141 1127
1142 /* 1128 /*
1143 * Phase-2: search specified snapshot or R/W mode super_block 1129 * Find existing nilfs_sb_info struct
1144 */ 1130 */
1145 if (!sd.cno) 1131 sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
1146 /* trying to get the latest checkpoint. */
1147 sd.cno = nilfs_last_cno(nilfs);
1148 1132
1149 s2 = sget(fs_type, nilfs_test_bdev_super2, 1133 if (!sd.cno)
1150 nilfs_set_bdev_super, &sd); 1134 /* trying to get the latest checkpoint. */
1151 deactivate_super(s); 1135 sd.cno = nilfs_last_cno(nilfs);
1152 /* 1136
1153 * Although deactivate_super() invokes close_bdev_exclusive() at 1137 /*
1154 * kill_block_super(). Here, s is an existent mount; we need 1138 * Get super block instance holding the nilfs_sb_info struct.
1155 * one more close_bdev_exclusive() call. 1139 * A new instance is allocated if no existing mount is present or
1156 */ 1140 * existing instance has been unmounted.
1157 s = s2; 1141 */
1158 if (IS_ERR(s)) 1142 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
1159 goto error_s; 1143 if (sd.sbi)
1144 nilfs_put_sbinfo(sd.sbi);
1145
1146 if (IS_ERR(s)) {
1147 err = PTR_ERR(s);
1148 goto failed_unlock;
1160 } 1149 }
1161 1150
1162 if (!s->s_root) { 1151 if (!s->s_root) {
1163 char b[BDEVNAME_SIZE]; 1152 char b[BDEVNAME_SIZE];
1164 1153
1154 /* New superblock instance created */
1165 s->s_flags = flags; 1155 s->s_flags = flags;
1166 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); 1156 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
1167 sb_set_blocksize(s, block_size(sd.bdev)); 1157 sb_set_blocksize(s, block_size(sd.bdev));
@@ -1172,26 +1162,18 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1172 1162
1173 s->s_flags |= MS_ACTIVE; 1163 s->s_flags |= MS_ACTIVE;
1174 need_to_close = 0; 1164 need_to_close = 0;
1175 } else if (!(s->s_flags & MS_RDONLY)) {
1176 err = -EBUSY;
1177 } 1165 }
1178 1166
1179 up(&sd.bdev->bd_mount_sem); 1167 mutex_unlock(&nilfs->ns_mount_mutex);
1180 put_nilfs(nilfs); 1168 put_nilfs(nilfs);
1181 if (need_to_close) 1169 if (need_to_close)
1182 close_bdev_exclusive(sd.bdev, flags); 1170 close_bdev_exclusive(sd.bdev, flags);
1183 simple_set_mnt(mnt, s); 1171 simple_set_mnt(mnt, s);
1184 return 0; 1172 return 0;
1185 1173
1186 error_s:
1187 up(&sd.bdev->bd_mount_sem);
1188 if (nilfs)
1189 put_nilfs(nilfs);
1190 close_bdev_exclusive(sd.bdev, flags);
1191 return PTR_ERR(s);
1192
1193 failed_unlock: 1174 failed_unlock:
1194 up(&sd.bdev->bd_mount_sem); 1175 mutex_unlock(&nilfs->ns_mount_mutex);
1176 put_nilfs(nilfs);
1195 failed: 1177 failed:
1196 close_bdev_exclusive(sd.bdev, flags); 1178 close_bdev_exclusive(sd.bdev, flags);
1197 1179
@@ -1199,70 +1181,18 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1199 1181
1200 cancel_new: 1182 cancel_new:
1201 /* Abandoning the newly allocated superblock */ 1183 /* Abandoning the newly allocated superblock */
1202 up(&sd.bdev->bd_mount_sem); 1184 mutex_unlock(&nilfs->ns_mount_mutex);
1203 if (nilfs) 1185 put_nilfs(nilfs);
1204 put_nilfs(nilfs);
1205 up_write(&s->s_umount); 1186 up_write(&s->s_umount);
1206 deactivate_super(s); 1187 deactivate_super(s);
1207 /* 1188 /*
1208 * deactivate_super() invokes close_bdev_exclusive(). 1189 * deactivate_super() invokes close_bdev_exclusive().
1209 * We must finish all post-cleaning before this call; 1190 * We must finish all post-cleaning before this call;
1210 * put_nilfs() and unlocking bd_mount_sem need the block device. 1191 * put_nilfs() needs the block device.
1211 */ 1192 */
1212 return err; 1193 return err;
1213} 1194}
1214 1195
1215static int nilfs_test_bdev_super3(struct super_block *s, void *data)
1216{
1217 struct nilfs_super_data *sd = data;
1218 int ret;
1219
1220 if (s->s_bdev != sd->bdev)
1221 return 0;
1222 if (down_read_trylock(&s->s_umount)) {
1223 ret = (s->s_flags & MS_RDONLY) && s->s_root &&
1224 nilfs_test_opt(NILFS_SB(s), SNAPSHOT);
1225 up_read(&s->s_umount);
1226 if (ret)
1227 return 0; /* ignore snapshot mounts */
1228 }
1229 return !((sd->flags ^ s->s_flags) & MS_RDONLY);
1230}
1231
1232static int __false_bdev_super(struct super_block *s, void *data)
1233{
1234#if 0 /* XXX: workaround for lock debug. This is not good idea */
1235 up_write(&s->s_umount);
1236#endif
1237 return -EFAULT;
1238}
1239
1240/**
1241 * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not.
1242 * fs_type: filesystem type
1243 * bdev: block device
1244 * flag: 0 (check rw-mount) or MS_RDONLY (check ro-mount)
1245 * res: pointer to an integer to store result
1246 *
1247 * This function must be called within a section protected by bd_mount_mutex.
1248 */
1249static int test_exclusive_mount(struct file_system_type *fs_type,
1250 struct block_device *bdev, int flags)
1251{
1252 struct super_block *s;
1253 struct nilfs_super_data sd = { .flags = flags, .bdev = bdev };
1254
1255 s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd);
1256 if (IS_ERR(s)) {
1257 if (PTR_ERR(s) != -EFAULT)
1258 return PTR_ERR(s);
1259 return 0; /* Not found */
1260 }
1261 up_write(&s->s_umount);
1262 deactivate_super(s);
1263 return 1; /* Found */
1264}
1265
1266struct file_system_type nilfs_fs_type = { 1196struct file_system_type nilfs_fs_type = {
1267 .owner = THIS_MODULE, 1197 .owner = THIS_MODULE,
1268 .name = "nilfs2", 1198 .name = "nilfs2",
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 7f65b3be4aa9..e4e5c78bcc93 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -35,6 +35,10 @@
35#include "seglist.h" 35#include "seglist.h"
36#include "segbuf.h" 36#include "segbuf.h"
37 37
38
39static LIST_HEAD(nilfs_objects);
40static DEFINE_SPINLOCK(nilfs_lock);
41
38void nilfs_set_last_segment(struct the_nilfs *nilfs, 42void nilfs_set_last_segment(struct the_nilfs *nilfs,
39 sector_t start_blocknr, u64 seq, __u64 cno) 43 sector_t start_blocknr, u64 seq, __u64 cno)
40{ 44{
@@ -55,7 +59,7 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
55 * Return Value: On success, pointer to the_nilfs is returned. 59 * Return Value: On success, pointer to the_nilfs is returned.
56 * On error, NULL is returned. 60 * On error, NULL is returned.
57 */ 61 */
58struct the_nilfs *alloc_nilfs(struct block_device *bdev) 62static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
59{ 63{
60 struct the_nilfs *nilfs; 64 struct the_nilfs *nilfs;
61 65
@@ -68,7 +72,10 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
68 atomic_set(&nilfs->ns_writer_refcount, -1); 72 atomic_set(&nilfs->ns_writer_refcount, -1);
69 atomic_set(&nilfs->ns_ndirtyblks, 0); 73 atomic_set(&nilfs->ns_ndirtyblks, 0);
70 init_rwsem(&nilfs->ns_sem); 74 init_rwsem(&nilfs->ns_sem);
75 init_rwsem(&nilfs->ns_super_sem);
76 mutex_init(&nilfs->ns_mount_mutex);
71 mutex_init(&nilfs->ns_writer_mutex); 77 mutex_init(&nilfs->ns_writer_mutex);
78 INIT_LIST_HEAD(&nilfs->ns_list);
72 INIT_LIST_HEAD(&nilfs->ns_supers); 79 INIT_LIST_HEAD(&nilfs->ns_supers);
73 spin_lock_init(&nilfs->ns_last_segment_lock); 80 spin_lock_init(&nilfs->ns_last_segment_lock);
74 nilfs->ns_gc_inodes_h = NULL; 81 nilfs->ns_gc_inodes_h = NULL;
@@ -78,6 +85,45 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
78} 85}
79 86
80/** 87/**
88 * find_or_create_nilfs - find or create nilfs object
89 * @bdev: block device to which the_nilfs is related
90 *
91 * find_nilfs() looks up an existent nilfs object created on the
92 * device and gets the reference count of the object. If no nilfs object
93 * is found on the device, a new nilfs object is allocated.
94 *
95 * Return Value: On success, pointer to the nilfs object is returned.
96 * On error, NULL is returned.
97 */
98struct the_nilfs *find_or_create_nilfs(struct block_device *bdev)
99{
100 struct the_nilfs *nilfs, *new = NULL;
101
102 retry:
103 spin_lock(&nilfs_lock);
104 list_for_each_entry(nilfs, &nilfs_objects, ns_list) {
105 if (nilfs->ns_bdev == bdev) {
106 get_nilfs(nilfs);
107 spin_unlock(&nilfs_lock);
108 if (new)
109 put_nilfs(new);
110 return nilfs; /* existing object */
111 }
112 }
113 if (new) {
114 list_add_tail(&new->ns_list, &nilfs_objects);
115 spin_unlock(&nilfs_lock);
116 return new; /* new object */
117 }
118 spin_unlock(&nilfs_lock);
119
120 new = alloc_nilfs(bdev);
121 if (new)
122 goto retry;
123 return NULL; /* insufficient memory */
124}
125
126/**
81 * put_nilfs - release a reference to the_nilfs 127 * put_nilfs - release a reference to the_nilfs
82 * @nilfs: the_nilfs structure to be released 128 * @nilfs: the_nilfs structure to be released
83 * 129 *
@@ -86,13 +132,20 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
86 */ 132 */
87void put_nilfs(struct the_nilfs *nilfs) 133void put_nilfs(struct the_nilfs *nilfs)
88{ 134{
89 if (!atomic_dec_and_test(&nilfs->ns_count)) 135 spin_lock(&nilfs_lock);
136 if (!atomic_dec_and_test(&nilfs->ns_count)) {
137 spin_unlock(&nilfs_lock);
90 return; 138 return;
139 }
140 list_del_init(&nilfs->ns_list);
141 spin_unlock(&nilfs_lock);
142
91 /* 143 /*
92 * Increment of ns_count never occur below because the caller 144 * Increment of ns_count never occurs below because the caller
93 * of get_nilfs() holds at least one reference to the_nilfs. 145 * of get_nilfs() holds at least one reference to the_nilfs.
94 * Thus its exclusion control is not required here. 146 * Thus its exclusion control is not required here.
95 */ 147 */
148
96 might_sleep(); 149 might_sleep();
97 if (nilfs_loaded(nilfs)) { 150 if (nilfs_loaded(nilfs)) {
98 nilfs_mdt_clear(nilfs->ns_sufile); 151 nilfs_mdt_clear(nilfs->ns_sufile);
@@ -515,7 +568,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
515 568
516 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); 569 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
517 if (sb->s_blocksize != blocksize) { 570 if (sb->s_blocksize != blocksize) {
518 int hw_blocksize = bdev_hardsect_size(sb->s_bdev); 571 int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
519 572
520 if (blocksize < hw_blocksize) { 573 if (blocksize < hw_blocksize) {
521 printk(KERN_ERR 574 printk(KERN_ERR
@@ -613,13 +666,63 @@ int nilfs_near_disk_full(struct the_nilfs *nilfs)
613 return ret; 666 return ret;
614} 667}
615 668
669/**
670 * nilfs_find_sbinfo - find existing nilfs_sb_info structure
671 * @nilfs: nilfs object
672 * @rw_mount: mount type (non-zero value for read/write mount)
673 * @cno: checkpoint number (zero for read-only mount)
674 *
675 * nilfs_find_sbinfo() returns the nilfs_sb_info structure which
676 * @rw_mount and @cno (in case of snapshots) matched. If no instance
677 * was found, NULL is returned. Although the super block instance can
678 * be unmounted after this function returns, the nilfs_sb_info struct
679 * is kept on memory until nilfs_put_sbinfo() is called.
680 */
681struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs,
682 int rw_mount, __u64 cno)
683{
684 struct nilfs_sb_info *sbi;
685
686 down_read(&nilfs->ns_super_sem);
687 /*
688 * The SNAPSHOT flag and sb->s_flags are supposed to be
689 * protected with nilfs->ns_super_sem.
690 */
691 sbi = nilfs->ns_current;
692 if (rw_mount) {
693 if (sbi && !(sbi->s_super->s_flags & MS_RDONLY))
694 goto found; /* read/write mount */
695 else
696 goto out;
697 } else if (cno == 0) {
698 if (sbi && (sbi->s_super->s_flags & MS_RDONLY))
699 goto found; /* read-only mount */
700 else
701 goto out;
702 }
703
704 list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
705 if (nilfs_test_opt(sbi, SNAPSHOT) &&
706 sbi->s_snapshot_cno == cno)
707 goto found; /* snapshot mount */
708 }
709 out:
710 up_read(&nilfs->ns_super_sem);
711 return NULL;
712
713 found:
714 atomic_inc(&sbi->s_count);
715 up_read(&nilfs->ns_super_sem);
716 return sbi;
717}
718
616int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, 719int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
617 int snapshot_mount) 720 int snapshot_mount)
618{ 721{
619 struct nilfs_sb_info *sbi; 722 struct nilfs_sb_info *sbi;
620 int ret = 0; 723 int ret = 0;
621 724
622 down_read(&nilfs->ns_sem); 725 down_read(&nilfs->ns_super_sem);
623 if (cno == 0 || cno > nilfs->ns_cno) 726 if (cno == 0 || cno > nilfs->ns_cno)
624 goto out_unlock; 727 goto out_unlock;
625 728
@@ -636,6 +739,6 @@ int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
636 ret++; 739 ret++;
637 740
638 out_unlock: 741 out_unlock:
639 up_read(&nilfs->ns_sem); 742 up_read(&nilfs->ns_super_sem);
640 return ret; 743 return ret;
641} 744}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 30fe58778d05..e8adbffc626f 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -43,12 +43,16 @@ enum {
43 * struct the_nilfs - struct to supervise multiple nilfs mount points 43 * struct the_nilfs - struct to supervise multiple nilfs mount points
44 * @ns_flags: flags 44 * @ns_flags: flags
45 * @ns_count: reference count 45 * @ns_count: reference count
46 * @ns_list: list head for nilfs_list
46 * @ns_bdev: block device 47 * @ns_bdev: block device
47 * @ns_bdi: backing dev info 48 * @ns_bdi: backing dev info
48 * @ns_writer: back pointer to writable nilfs_sb_info 49 * @ns_writer: back pointer to writable nilfs_sb_info
49 * @ns_sem: semaphore for shared states 50 * @ns_sem: semaphore for shared states
51 * @ns_super_sem: semaphore for global operations across super block instances
52 * @ns_mount_mutex: mutex protecting mount process of nilfs
50 * @ns_writer_mutex: mutex protecting ns_writer attach/detach 53 * @ns_writer_mutex: mutex protecting ns_writer attach/detach
51 * @ns_writer_refcount: number of referrers on ns_writer 54 * @ns_writer_refcount: number of referrers on ns_writer
55 * @ns_current: back pointer to current mount
52 * @ns_sbh: buffer heads of on-disk super blocks 56 * @ns_sbh: buffer heads of on-disk super blocks
53 * @ns_sbp: pointers to super block data 57 * @ns_sbp: pointers to super block data
54 * @ns_sbwtime: previous write time of super blocks 58 * @ns_sbwtime: previous write time of super blocks
@@ -88,15 +92,24 @@ enum {
88struct the_nilfs { 92struct the_nilfs {
89 unsigned long ns_flags; 93 unsigned long ns_flags;
90 atomic_t ns_count; 94 atomic_t ns_count;
95 struct list_head ns_list;
91 96
92 struct block_device *ns_bdev; 97 struct block_device *ns_bdev;
93 struct backing_dev_info *ns_bdi; 98 struct backing_dev_info *ns_bdi;
94 struct nilfs_sb_info *ns_writer; 99 struct nilfs_sb_info *ns_writer;
95 struct rw_semaphore ns_sem; 100 struct rw_semaphore ns_sem;
101 struct rw_semaphore ns_super_sem;
102 struct mutex ns_mount_mutex;
96 struct mutex ns_writer_mutex; 103 struct mutex ns_writer_mutex;
97 atomic_t ns_writer_refcount; 104 atomic_t ns_writer_refcount;
98 105
99 /* 106 /*
107 * components protected by ns_super_sem
108 */
109 struct nilfs_sb_info *ns_current;
110 struct list_head ns_supers;
111
112 /*
100 * used for 113 * used for
101 * - loading the latest checkpoint exclusively. 114 * - loading the latest checkpoint exclusively.
102 * - allocating a new full segment. 115 * - allocating a new full segment.
@@ -108,7 +121,6 @@ struct the_nilfs {
108 time_t ns_sbwtime[2]; 121 time_t ns_sbwtime[2];
109 unsigned ns_sbsize; 122 unsigned ns_sbsize;
110 unsigned ns_mount_state; 123 unsigned ns_mount_state;
111 struct list_head ns_supers;
112 124
113 /* 125 /*
114 * Following fields are dedicated to a writable FS-instance. 126 * Following fields are dedicated to a writable FS-instance.
@@ -191,11 +203,12 @@ THE_NILFS_FNS(DISCONTINUED, discontinued)
191#define NILFS_ALTSB_FREQ 60 /* spare superblock */ 203#define NILFS_ALTSB_FREQ 60 /* spare superblock */
192 204
193void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); 205void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
194struct the_nilfs *alloc_nilfs(struct block_device *); 206struct the_nilfs *find_or_create_nilfs(struct block_device *);
195void put_nilfs(struct the_nilfs *); 207void put_nilfs(struct the_nilfs *);
196int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); 208int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
197int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); 209int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
198int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); 210int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
211struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
199int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); 212int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
200int nilfs_near_disk_full(struct the_nilfs *); 213int nilfs_near_disk_full(struct the_nilfs *);
201void nilfs_fall_back_super_block(struct the_nilfs *); 214void nilfs_fall_back_super_block(struct the_nilfs *);
@@ -238,6 +251,12 @@ nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
238 mutex_unlock(&nilfs->ns_writer_mutex); 251 mutex_unlock(&nilfs->ns_writer_mutex);
239} 252}
240 253
254static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
255{
256 if (!atomic_dec_and_test(&sbi->s_count))
257 kfree(sbi);
258}
259
241static inline void 260static inline void
242nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum, 261nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum,
243 sector_t *seg_start, sector_t *seg_end) 262 sector_t *seg_start, sector_t *seg_end)
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 50914d7303c6..31dac7e3b0f1 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -1,2 +1,15 @@
1config FSNOTIFY
2 bool "Filesystem notification backend"
3 default y
4 ---help---
5 fsnotify is a backend for filesystem notification. fsnotify does
6 not provide any userspace interface but does provide the basis
7 needed for other notification schemes such as dnotify, inotify,
8 and fanotify.
9
10 Say Y here to enable fsnotify suport.
11
12 If unsure, say Y.
13
1source "fs/notify/dnotify/Kconfig" 14source "fs/notify/dnotify/Kconfig"
2source "fs/notify/inotify/Kconfig" 15source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index 5a95b6010ce7..0922cc826c46 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,2 +1,4 @@
1obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o
2
1obj-y += dnotify/ 3obj-y += dnotify/
2obj-y += inotify/ 4obj-y += inotify/
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
index 26adf5dfa646..904ff8d5405a 100644
--- a/fs/notify/dnotify/Kconfig
+++ b/fs/notify/dnotify/Kconfig
@@ -1,5 +1,6 @@
1config DNOTIFY 1config DNOTIFY
2 bool "Dnotify support" 2 bool "Dnotify support"
3 depends on FSNOTIFY
3 default y 4 default y
4 help 5 help
5 Dnotify is a directory-based per-fd file change notification system 6 Dnotify is a directory-based per-fd file change notification system
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index b0aa2cde80bd..828a889be909 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -3,6 +3,9 @@
3 * 3 *
4 * Copyright (C) 2000,2001,2002 Stephen Rothwell 4 * Copyright (C) 2000,2001,2002 Stephen Rothwell
5 * 5 *
6 * Copyright (C) 2009 Eric Paris <Red Hat Inc>
7 * dnotify was largly rewritten to use the new fsnotify infrastructure
8 *
6 * This program is free software; you can redistribute it and/or modify it 9 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the 10 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2, or (at your option) any 11 * Free Software Foundation; either version 2, or (at your option) any
@@ -21,24 +24,173 @@
21#include <linux/spinlock.h> 24#include <linux/spinlock.h>
22#include <linux/slab.h> 25#include <linux/slab.h>
23#include <linux/fdtable.h> 26#include <linux/fdtable.h>
27#include <linux/fsnotify_backend.h>
24 28
25int dir_notify_enable __read_mostly = 1; 29int dir_notify_enable __read_mostly = 1;
26 30
27static struct kmem_cache *dn_cache __read_mostly; 31static struct kmem_cache *dnotify_struct_cache __read_mostly;
32static struct kmem_cache *dnotify_mark_entry_cache __read_mostly;
33static struct fsnotify_group *dnotify_group __read_mostly;
34static DEFINE_MUTEX(dnotify_mark_mutex);
35
36/*
37 * dnotify will attach one of these to each inode (i_fsnotify_mark_entries) which
38 * is being watched by dnotify. If multiple userspace applications are watching
39 * the same directory with dnotify their information is chained in dn
40 */
41struct dnotify_mark_entry {
42 struct fsnotify_mark_entry fsn_entry;
43 struct dnotify_struct *dn;
44};
28 45
29static void redo_inode_mask(struct inode *inode) 46/*
47 * When a process starts or stops watching an inode the set of events which
48 * dnotify cares about for that inode may change. This function runs the
49 * list of everything receiving dnotify events about this directory and calculates
50 * the set of all those events. After it updates what dnotify is interested in
51 * it calls the fsnotify function so it can update the set of all events relevant
52 * to this inode.
53 */
54static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry)
30{ 55{
31 unsigned long new_mask; 56 __u32 new_mask, old_mask;
32 struct dnotify_struct *dn; 57 struct dnotify_struct *dn;
58 struct dnotify_mark_entry *dnentry = container_of(entry,
59 struct dnotify_mark_entry,
60 fsn_entry);
61
62 assert_spin_locked(&entry->lock);
33 63
64 old_mask = entry->mask;
34 new_mask = 0; 65 new_mask = 0;
35 for (dn = inode->i_dnotify; dn != NULL; dn = dn->dn_next) 66 for (dn = dnentry->dn; dn != NULL; dn = dn->dn_next)
36 new_mask |= dn->dn_mask & ~DN_MULTISHOT; 67 new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT);
37 inode->i_dnotify_mask = new_mask; 68 entry->mask = new_mask;
69
70 if (old_mask == new_mask)
71 return;
72
73 if (entry->inode)
74 fsnotify_recalc_inode_mask(entry->inode);
75}
76
77/*
78 * Mains fsnotify call where events are delivered to dnotify.
79 * Find the dnotify mark on the relevant inode, run the list of dnotify structs
80 * on that mark and determine which of them has expressed interest in receiving
81 * events of this type. When found send the correct process and signal and
82 * destroy the dnotify struct if it was not registered to receive multiple
83 * events.
84 */
85static int dnotify_handle_event(struct fsnotify_group *group,
86 struct fsnotify_event *event)
87{
88 struct fsnotify_mark_entry *entry = NULL;
89 struct dnotify_mark_entry *dnentry;
90 struct inode *to_tell;
91 struct dnotify_struct *dn;
92 struct dnotify_struct **prev;
93 struct fown_struct *fown;
94
95 to_tell = event->to_tell;
96
97 spin_lock(&to_tell->i_lock);
98 entry = fsnotify_find_mark_entry(group, to_tell);
99 spin_unlock(&to_tell->i_lock);
100
101 /* unlikely since we alreay passed dnotify_should_send_event() */
102 if (unlikely(!entry))
103 return 0;
104 dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
105
106 spin_lock(&entry->lock);
107 prev = &dnentry->dn;
108 while ((dn = *prev) != NULL) {
109 if ((dn->dn_mask & event->mask) == 0) {
110 prev = &dn->dn_next;
111 continue;
112 }
113 fown = &dn->dn_filp->f_owner;
114 send_sigio(fown, dn->dn_fd, POLL_MSG);
115 if (dn->dn_mask & FS_DN_MULTISHOT)
116 prev = &dn->dn_next;
117 else {
118 *prev = dn->dn_next;
119 kmem_cache_free(dnotify_struct_cache, dn);
120 dnotify_recalc_inode_mask(entry);
121 }
122 }
123
124 spin_unlock(&entry->lock);
125 fsnotify_put_mark(entry);
126
127 return 0;
128}
129
130/*
131 * Given an inode and mask determine if dnotify would be interested in sending
132 * userspace notification for that pair.
133 */
134static bool dnotify_should_send_event(struct fsnotify_group *group,
135 struct inode *inode, __u32 mask)
136{
137 struct fsnotify_mark_entry *entry;
138 bool send;
139
140 /* !dir_notify_enable should never get here, don't waste time checking
141 if (!dir_notify_enable)
142 return 0; */
143
144 /* not a dir, dnotify doesn't care */
145 if (!S_ISDIR(inode->i_mode))
146 return false;
147
148 spin_lock(&inode->i_lock);
149 entry = fsnotify_find_mark_entry(group, inode);
150 spin_unlock(&inode->i_lock);
151
152 /* no mark means no dnotify watch */
153 if (!entry)
154 return false;
155
156 mask = (mask & ~FS_EVENT_ON_CHILD);
157 send = (mask & entry->mask);
158
159 fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */
160
161 return send;
162}
163
164static void dnotify_free_mark(struct fsnotify_mark_entry *entry)
165{
166 struct dnotify_mark_entry *dnentry = container_of(entry,
167 struct dnotify_mark_entry,
168 fsn_entry);
169
170 BUG_ON(dnentry->dn);
171
172 kmem_cache_free(dnotify_mark_entry_cache, dnentry);
38} 173}
39 174
175static struct fsnotify_ops dnotify_fsnotify_ops = {
176 .handle_event = dnotify_handle_event,
177 .should_send_event = dnotify_should_send_event,
178 .free_group_priv = NULL,
179 .freeing_mark = NULL,
180 .free_event_priv = NULL,
181};
182
183/*
184 * Called every time a file is closed. Looks first for a dnotify mark on the
185 * inode. If one is found run all of the ->dn entries attached to that
186 * mark for one relevant to this process closing the file and remove that
187 * dnotify_struct. If that was the last dnotify_struct also remove the
188 * fsnotify_mark_entry.
189 */
40void dnotify_flush(struct file *filp, fl_owner_t id) 190void dnotify_flush(struct file *filp, fl_owner_t id)
41{ 191{
192 struct fsnotify_mark_entry *entry;
193 struct dnotify_mark_entry *dnentry;
42 struct dnotify_struct *dn; 194 struct dnotify_struct *dn;
43 struct dnotify_struct **prev; 195 struct dnotify_struct **prev;
44 struct inode *inode; 196 struct inode *inode;
@@ -46,145 +198,243 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
46 inode = filp->f_path.dentry->d_inode; 198 inode = filp->f_path.dentry->d_inode;
47 if (!S_ISDIR(inode->i_mode)) 199 if (!S_ISDIR(inode->i_mode))
48 return; 200 return;
201
49 spin_lock(&inode->i_lock); 202 spin_lock(&inode->i_lock);
50 prev = &inode->i_dnotify; 203 entry = fsnotify_find_mark_entry(dnotify_group, inode);
204 spin_unlock(&inode->i_lock);
205 if (!entry)
206 return;
207 dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
208
209 mutex_lock(&dnotify_mark_mutex);
210
211 spin_lock(&entry->lock);
212 prev = &dnentry->dn;
51 while ((dn = *prev) != NULL) { 213 while ((dn = *prev) != NULL) {
52 if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { 214 if ((dn->dn_owner == id) && (dn->dn_filp == filp)) {
53 *prev = dn->dn_next; 215 *prev = dn->dn_next;
54 redo_inode_mask(inode); 216 kmem_cache_free(dnotify_struct_cache, dn);
55 kmem_cache_free(dn_cache, dn); 217 dnotify_recalc_inode_mask(entry);
56 break; 218 break;
57 } 219 }
58 prev = &dn->dn_next; 220 prev = &dn->dn_next;
59 } 221 }
60 spin_unlock(&inode->i_lock); 222
223 spin_unlock(&entry->lock);
224
225 /* nothing else could have found us thanks to the dnotify_mark_mutex */
226 if (dnentry->dn == NULL)
227 fsnotify_destroy_mark_by_entry(entry);
228
229 fsnotify_recalc_group_mask(dnotify_group);
230
231 mutex_unlock(&dnotify_mark_mutex);
232
233 fsnotify_put_mark(entry);
234}
235
236/* this conversion is done only at watch creation */
237static __u32 convert_arg(unsigned long arg)
238{
239 __u32 new_mask = FS_EVENT_ON_CHILD;
240
241 if (arg & DN_MULTISHOT)
242 new_mask |= FS_DN_MULTISHOT;
243 if (arg & DN_DELETE)
244 new_mask |= (FS_DELETE | FS_MOVED_FROM);
245 if (arg & DN_MODIFY)
246 new_mask |= FS_MODIFY;
247 if (arg & DN_ACCESS)
248 new_mask |= FS_ACCESS;
249 if (arg & DN_ATTRIB)
250 new_mask |= FS_ATTRIB;
251 if (arg & DN_RENAME)
252 new_mask |= FS_DN_RENAME;
253 if (arg & DN_CREATE)
254 new_mask |= (FS_CREATE | FS_MOVED_TO);
255
256 return new_mask;
61} 257}
62 258
259/*
260 * If multiple processes watch the same inode with dnotify there is only one
261 * dnotify mark in inode->i_fsnotify_mark_entries but we chain a dnotify_struct
262 * onto that mark. This function either attaches the new dnotify_struct onto
263 * that list, or it |= the mask onto an existing dnofiy_struct.
264 */
265static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnentry,
266 fl_owner_t id, int fd, struct file *filp, __u32 mask)
267{
268 struct dnotify_struct *odn;
269
270 odn = dnentry->dn;
271 while (odn != NULL) {
272 /* adding more events to existing dnofiy_struct? */
273 if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
274 odn->dn_fd = fd;
275 odn->dn_mask |= mask;
276 return -EEXIST;
277 }
278 odn = odn->dn_next;
279 }
280
281 dn->dn_mask = mask;
282 dn->dn_fd = fd;
283 dn->dn_filp = filp;
284 dn->dn_owner = id;
285 dn->dn_next = dnentry->dn;
286 dnentry->dn = dn;
287
288 return 0;
289}
290
291/*
292 * When a process calls fcntl to attach a dnotify watch to a directory it ends
293 * up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be
294 * attached to the fsnotify_mark.
295 */
63int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) 296int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
64{ 297{
298 struct dnotify_mark_entry *new_dnentry, *dnentry;
299 struct fsnotify_mark_entry *new_entry, *entry;
65 struct dnotify_struct *dn; 300 struct dnotify_struct *dn;
66 struct dnotify_struct *odn;
67 struct dnotify_struct **prev;
68 struct inode *inode; 301 struct inode *inode;
69 fl_owner_t id = current->files; 302 fl_owner_t id = current->files;
70 struct file *f; 303 struct file *f;
71 int error = 0; 304 int destroy = 0, error = 0;
305 __u32 mask;
306
307 /* we use these to tell if we need to kfree */
308 new_entry = NULL;
309 dn = NULL;
310
311 if (!dir_notify_enable) {
312 error = -EINVAL;
313 goto out_err;
314 }
72 315
316 /* a 0 mask means we are explicitly removing the watch */
73 if ((arg & ~DN_MULTISHOT) == 0) { 317 if ((arg & ~DN_MULTISHOT) == 0) {
74 dnotify_flush(filp, id); 318 dnotify_flush(filp, id);
75 return 0; 319 error = 0;
320 goto out_err;
76 } 321 }
77 if (!dir_notify_enable) 322
78 return -EINVAL; 323 /* dnotify only works on directories */
79 inode = filp->f_path.dentry->d_inode; 324 inode = filp->f_path.dentry->d_inode;
80 if (!S_ISDIR(inode->i_mode)) 325 if (!S_ISDIR(inode->i_mode)) {
81 return -ENOTDIR; 326 error = -ENOTDIR;
82 dn = kmem_cache_alloc(dn_cache, GFP_KERNEL); 327 goto out_err;
83 if (dn == NULL)
84 return -ENOMEM;
85 spin_lock(&inode->i_lock);
86 prev = &inode->i_dnotify;
87 while ((odn = *prev) != NULL) {
88 if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
89 odn->dn_fd = fd;
90 odn->dn_mask |= arg;
91 inode->i_dnotify_mask |= arg & ~DN_MULTISHOT;
92 goto out_free;
93 }
94 prev = &odn->dn_next;
95 } 328 }
96 329
97 rcu_read_lock(); 330 /* expect most fcntl to add new rather than augment old */
98 f = fcheck(fd); 331 dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL);
99 rcu_read_unlock(); 332 if (!dn) {
100 /* we'd lost the race with close(), sod off silently */ 333 error = -ENOMEM;
101 /* note that inode->i_lock prevents reordering problems 334 goto out_err;
102 * between accesses to descriptor table and ->i_dnotify */ 335 }
103 if (f != filp)
104 goto out_free;
105 336
106 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); 337 /* new fsnotify mark, we expect most fcntl calls to add a new mark */
107 if (error) 338 new_dnentry = kmem_cache_alloc(dnotify_mark_entry_cache, GFP_KERNEL);
108 goto out_free; 339 if (!new_dnentry) {
340 error = -ENOMEM;
341 goto out_err;
342 }
109 343
110 dn->dn_mask = arg; 344 /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */
111 dn->dn_fd = fd; 345 mask = convert_arg(arg);
112 dn->dn_filp = filp;
113 dn->dn_owner = id;
114 inode->i_dnotify_mask |= arg & ~DN_MULTISHOT;
115 dn->dn_next = inode->i_dnotify;
116 inode->i_dnotify = dn;
117 spin_unlock(&inode->i_lock);
118 return 0;
119 346
120out_free: 347 /* set up the new_entry and new_dnentry */
121 spin_unlock(&inode->i_lock); 348 new_entry = &new_dnentry->fsn_entry;
122 kmem_cache_free(dn_cache, dn); 349 fsnotify_init_mark(new_entry, dnotify_free_mark);
123 return error; 350 new_entry->mask = mask;
124} 351 new_dnentry->dn = NULL;
125 352
126void __inode_dir_notify(struct inode *inode, unsigned long event) 353 /* this is needed to prevent the fcntl/close race described below */
127{ 354 mutex_lock(&dnotify_mark_mutex);
128 struct dnotify_struct * dn;
129 struct dnotify_struct **prev;
130 struct fown_struct * fown;
131 int changed = 0;
132 355
356 /* add the new_entry or find an old one. */
133 spin_lock(&inode->i_lock); 357 spin_lock(&inode->i_lock);
134 prev = &inode->i_dnotify; 358 entry = fsnotify_find_mark_entry(dnotify_group, inode);
135 while ((dn = *prev) != NULL) {
136 if ((dn->dn_mask & event) == 0) {
137 prev = &dn->dn_next;
138 continue;
139 }
140 fown = &dn->dn_filp->f_owner;
141 send_sigio(fown, dn->dn_fd, POLL_MSG);
142 if (dn->dn_mask & DN_MULTISHOT)
143 prev = &dn->dn_next;
144 else {
145 *prev = dn->dn_next;
146 changed = 1;
147 kmem_cache_free(dn_cache, dn);
148 }
149 }
150 if (changed)
151 redo_inode_mask(inode);
152 spin_unlock(&inode->i_lock); 359 spin_unlock(&inode->i_lock);
153} 360 if (entry) {
154 361 dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
155EXPORT_SYMBOL(__inode_dir_notify); 362 spin_lock(&entry->lock);
363 } else {
364 fsnotify_add_mark(new_entry, dnotify_group, inode);
365 spin_lock(&new_entry->lock);
366 entry = new_entry;
367 dnentry = new_dnentry;
368 /* we used new_entry, so don't free it */
369 new_entry = NULL;
370 }
156 371
157/* 372 rcu_read_lock();
158 * This is hopelessly wrong, but unfixable without API changes. At 373 f = fcheck(fd);
159 * least it doesn't oops the kernel... 374 rcu_read_unlock();
160 *
161 * To safely access ->d_parent we need to keep d_move away from it. Use the
162 * dentry's d_lock for this.
163 */
164void dnotify_parent(struct dentry *dentry, unsigned long event)
165{
166 struct dentry *parent;
167 375
168 if (!dir_notify_enable) 376 /* if (f != filp) means that we lost a race and another task/thread
169 return; 377 * actually closed the fd we are still playing with before we grabbed
378 * the dnotify_mark_mutex and entry->lock. Since closing the fd is the
379 * only time we clean up the mark entries we need to get our mark off
380 * the list. */
381 if (f != filp) {
382 /* if we added ourselves, shoot ourselves, it's possible that
383 * the flush actually did shoot this entry. That's fine too
384 * since multiple calls to destroy_mark is perfectly safe, if
385 * we found a dnentry already attached to the inode, just sod
386 * off silently as the flush at close time dealt with it.
387 */
388 if (dnentry == new_dnentry)
389 destroy = 1;
390 goto out;
391 }
170 392
171 spin_lock(&dentry->d_lock); 393 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
172 parent = dentry->d_parent; 394 if (error) {
173 if (parent->d_inode->i_dnotify_mask & event) { 395 /* if we added, we must shoot */
174 dget(parent); 396 if (dnentry == new_dnentry)
175 spin_unlock(&dentry->d_lock); 397 destroy = 1;
176 __inode_dir_notify(parent->d_inode, event); 398 goto out;
177 dput(parent);
178 } else {
179 spin_unlock(&dentry->d_lock);
180 } 399 }
400
401 error = attach_dn(dn, dnentry, id, fd, filp, mask);
402 /* !error means that we attached the dn to the dnentry, so don't free it */
403 if (!error)
404 dn = NULL;
405 /* -EEXIST means that we didn't add this new dn and used an old one.
406 * that isn't an error (and the unused dn should be freed) */
407 else if (error == -EEXIST)
408 error = 0;
409
410 dnotify_recalc_inode_mask(entry);
411out:
412 spin_unlock(&entry->lock);
413
414 if (destroy)
415 fsnotify_destroy_mark_by_entry(entry);
416
417 fsnotify_recalc_group_mask(dnotify_group);
418
419 mutex_unlock(&dnotify_mark_mutex);
420 fsnotify_put_mark(entry);
421out_err:
422 if (new_entry)
423 fsnotify_put_mark(new_entry);
424 if (dn)
425 kmem_cache_free(dnotify_struct_cache, dn);
426 return error;
181} 427}
182EXPORT_SYMBOL_GPL(dnotify_parent);
183 428
184static int __init dnotify_init(void) 429static int __init dnotify_init(void)
185{ 430{
186 dn_cache = kmem_cache_create("dnotify_cache", 431 dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
187 sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL); 432 dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC);
433
434 dnotify_group = fsnotify_obtain_group(DNOTIFY_GROUP_NUM,
435 0, &dnotify_fsnotify_ops);
436 if (IS_ERR(dnotify_group))
437 panic("unable to allocate fsnotify group for dnotify\n");
188 return 0; 438 return 0;
189} 439}
190 440
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
new file mode 100644
index 000000000000..ec2f7bd76818
--- /dev/null
+++ b/fs/notify/fsnotify.c
@@ -0,0 +1,186 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19#include <linux/dcache.h>
20#include <linux/fs.h>
21#include <linux/init.h>
22#include <linux/module.h>
23#include <linux/srcu.h>
24
25#include <linux/fsnotify_backend.h>
26#include "fsnotify.h"
27
28/*
29 * Clear all of the marks on an inode when it is being evicted from core
30 */
31void __fsnotify_inode_delete(struct inode *inode)
32{
33 fsnotify_clear_marks_by_inode(inode);
34}
35EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);
36
37/*
38 * Given an inode, first check if we care what happens to our children. Inotify
39 * and dnotify both tell their parents about events. If we care about any event
40 * on a child we run all of our children and set a dentry flag saying that the
41 * parent cares. Thus when an event happens on a child it can quickly tell if
42 * if there is a need to find a parent and send the event to the parent.
43 */
44void __fsnotify_update_child_dentry_flags(struct inode *inode)
45{
46 struct dentry *alias;
47 int watched;
48
49 if (!S_ISDIR(inode->i_mode))
50 return;
51
52 /* determine if the children should tell inode about their events */
53 watched = fsnotify_inode_watches_children(inode);
54
55 spin_lock(&dcache_lock);
56 /* run all of the dentries associated with this inode. Since this is a
57 * directory, there damn well better only be one item on this list */
58 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
59 struct dentry *child;
60
61 /* run all of the children of the original inode and fix their
62 * d_flags to indicate parental interest (their parent is the
63 * original inode) */
64 list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
65 if (!child->d_inode)
66 continue;
67
68 spin_lock(&child->d_lock);
69 if (watched)
70 child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
71 else
72 child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
73 spin_unlock(&child->d_lock);
74 }
75 }
76 spin_unlock(&dcache_lock);
77}
78
79/* Notify this dentry's parent about a child's events. */
80void __fsnotify_parent(struct dentry *dentry, __u32 mask)
81{
82 struct dentry *parent;
83 struct inode *p_inode;
84 bool send = false;
85 bool should_update_children = false;
86
87 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
88 return;
89
90 spin_lock(&dentry->d_lock);
91 parent = dentry->d_parent;
92 p_inode = parent->d_inode;
93
94 if (fsnotify_inode_watches_children(p_inode)) {
95 if (p_inode->i_fsnotify_mask & mask) {
96 dget(parent);
97 send = true;
98 }
99 } else {
100 /*
101 * The parent doesn't care about events on it's children but
102 * at least one child thought it did. We need to run all the
103 * children and update their d_flags to let them know p_inode
104 * doesn't care about them any more.
105 */
106 dget(parent);
107 should_update_children = true;
108 }
109
110 spin_unlock(&dentry->d_lock);
111
112 if (send) {
113 /* we are notifying a parent so come up with the new mask which
114 * specifies these are events which came from a child. */
115 mask |= FS_EVENT_ON_CHILD;
116
117 fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
118 dentry->d_name.name, 0);
119 dput(parent);
120 }
121
122 if (unlikely(should_update_children)) {
123 __fsnotify_update_child_dentry_flags(p_inode);
124 dput(parent);
125 }
126}
127EXPORT_SYMBOL_GPL(__fsnotify_parent);
128
129/*
130 * This is the main call to fsnotify. The VFS calls into hook specific functions
131 * in linux/fsnotify.h. Those functions then in turn call here. Here will call
132 * out to all of the registered fsnotify_group. Those groups can then use the
133 * notification event in whatever means they feel necessary.
134 */
135void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name, u32 cookie)
136{
137 struct fsnotify_group *group;
138 struct fsnotify_event *event = NULL;
139 int idx;
140 /* global tests shouldn't care about events on child only the specific event */
141 __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
142
143 if (list_empty(&fsnotify_groups))
144 return;
145
146 if (!(test_mask & fsnotify_mask))
147 return;
148
149 if (!(test_mask & to_tell->i_fsnotify_mask))
150 return;
151 /*
152 * SRCU!! the groups list is very very much read only and the path is
153 * very hot. The VAST majority of events are not going to need to do
154 * anything other than walk the list so it's crazy to pre-allocate.
155 */
156 idx = srcu_read_lock(&fsnotify_grp_srcu);
157 list_for_each_entry_rcu(group, &fsnotify_groups, group_list) {
158 if (test_mask & group->mask) {
159 if (!group->ops->should_send_event(group, to_tell, mask))
160 continue;
161 if (!event) {
162 event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie);
163 /* shit, we OOM'd and now we can't tell, maybe
164 * someday someone else will want to do something
165 * here */
166 if (!event)
167 break;
168 }
169 group->ops->handle_event(group, event);
170 }
171 }
172 srcu_read_unlock(&fsnotify_grp_srcu, idx);
173 /*
174 * fsnotify_create_event() took a reference so the event can't be cleaned
175 * up while we are still trying to add it to lists, drop that one.
176 */
177 if (event)
178 fsnotify_put_event(event);
179}
180EXPORT_SYMBOL_GPL(fsnotify);
181
182static __init int fsnotify_init(void)
183{
184 return init_srcu_struct(&fsnotify_grp_srcu);
185}
186subsys_initcall(fsnotify_init);
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
new file mode 100644
index 000000000000..4dc240824b2d
--- /dev/null
+++ b/fs/notify/fsnotify.h
@@ -0,0 +1,34 @@
1#ifndef __FS_NOTIFY_FSNOTIFY_H_
2#define __FS_NOTIFY_FSNOTIFY_H_
3
4#include <linux/list.h>
5#include <linux/fsnotify.h>
6#include <linux/srcu.h>
7#include <linux/types.h>
8
9/* protects reads of fsnotify_groups */
10extern struct srcu_struct fsnotify_grp_srcu;
11/* all groups which receive fsnotify events */
12extern struct list_head fsnotify_groups;
13/* all bitwise OR of all event types (FS_*) for all fsnotify_groups */
14extern __u32 fsnotify_mask;
15
16/* destroy all events sitting in this groups notification queue */
17extern void fsnotify_flush_notify(struct fsnotify_group *group);
18
19/* final kfree of a group */
20extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
21
22/* run the list of all marks associated with inode and flag them to be freed */
23extern void fsnotify_clear_marks_by_inode(struct inode *inode);
24/*
25 * update the dentry->d_flags of all of inode's children to indicate if inode cares
26 * about events that happen to its children.
27 */
28extern void __fsnotify_update_child_dentry_flags(struct inode *inode);
29
30/* allocate and destroy and event holder to attach events to notification/access queues */
31extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void);
32extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder);
33
34#endif /* __FS_NOTIFY_FSNOTIFY_H_ */
diff --git a/fs/notify/group.c b/fs/notify/group.c
new file mode 100644
index 000000000000..0e1677144bc5
--- /dev/null
+++ b/fs/notify/group.c
@@ -0,0 +1,254 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19#include <linux/list.h>
20#include <linux/mutex.h>
21#include <linux/slab.h>
22#include <linux/srcu.h>
23#include <linux/rculist.h>
24#include <linux/wait.h>
25
26#include <linux/fsnotify_backend.h>
27#include "fsnotify.h"
28
29#include <asm/atomic.h>
30
31/* protects writes to fsnotify_groups and fsnotify_mask */
32static DEFINE_MUTEX(fsnotify_grp_mutex);
33/* protects reads while running the fsnotify_groups list */
34struct srcu_struct fsnotify_grp_srcu;
35/* all groups registered to receive filesystem notifications */
36LIST_HEAD(fsnotify_groups);
37/* bitwise OR of all events (FS_*) interesting to some group on this system */
38__u32 fsnotify_mask;
39
40/*
41 * When a new group registers or changes it's set of interesting events
42 * this function updates the fsnotify_mask to contain all interesting events
43 */
44void fsnotify_recalc_global_mask(void)
45{
46 struct fsnotify_group *group;
47 __u32 mask = 0;
48 int idx;
49
50 idx = srcu_read_lock(&fsnotify_grp_srcu);
51 list_for_each_entry_rcu(group, &fsnotify_groups, group_list)
52 mask |= group->mask;
53 srcu_read_unlock(&fsnotify_grp_srcu, idx);
54 fsnotify_mask = mask;
55}
56
57/*
58 * Update the group->mask by running all of the marks associated with this
59 * group and finding the bitwise | of all of the mark->mask. If we change
60 * the group->mask we need to update the global mask of events interesting
61 * to the system.
62 */
63void fsnotify_recalc_group_mask(struct fsnotify_group *group)
64{
65 __u32 mask = 0;
66 __u32 old_mask = group->mask;
67 struct fsnotify_mark_entry *entry;
68
69 spin_lock(&group->mark_lock);
70 list_for_each_entry(entry, &group->mark_entries, g_list)
71 mask |= entry->mask;
72 spin_unlock(&group->mark_lock);
73
74 group->mask = mask;
75
76 if (old_mask != mask)
77 fsnotify_recalc_global_mask();
78}
79
80/*
81 * Take a reference to a group so things found under the fsnotify_grp_mutex
82 * can't get freed under us
83 */
84static void fsnotify_get_group(struct fsnotify_group *group)
85{
86 atomic_inc(&group->refcnt);
87}
88
89/*
90 * Final freeing of a group
91 */
92void fsnotify_final_destroy_group(struct fsnotify_group *group)
93{
94 /* clear the notification queue of all events */
95 fsnotify_flush_notify(group);
96
97 if (group->ops->free_group_priv)
98 group->ops->free_group_priv(group);
99
100 kfree(group);
101}
102
103/*
104 * Trying to get rid of a group. We need to first get rid of any outstanding
105 * allocations and then free the group. Remember that fsnotify_clear_marks_by_group
106 * could miss marks that are being freed by inode and those marks could still
107 * hold a reference to this group (via group->num_marks) If we get into that
108 * situtation, the fsnotify_final_destroy_group will get called when that final
109 * mark is freed.
110 */
111static void fsnotify_destroy_group(struct fsnotify_group *group)
112{
113 /* clear all inode mark entries for this group */
114 fsnotify_clear_marks_by_group(group);
115
116 /* past the point of no return, matches the initial value of 1 */
117 if (atomic_dec_and_test(&group->num_marks))
118 fsnotify_final_destroy_group(group);
119}
120
121/*
122 * Remove this group from the global list of groups that will get events
123 * this can be done even if there are still references and things still using
124 * this group. This just stops the group from getting new events.
125 */
126static void __fsnotify_evict_group(struct fsnotify_group *group)
127{
128 BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
129
130 if (group->on_group_list)
131 list_del_rcu(&group->group_list);
132 group->on_group_list = 0;
133}
134
135/*
136 * Called when a group is no longer interested in getting events. This can be
137 * used if a group is misbehaving or if for some reason a group should no longer
138 * get any filesystem events.
139 */
140void fsnotify_evict_group(struct fsnotify_group *group)
141{
142 mutex_lock(&fsnotify_grp_mutex);
143 __fsnotify_evict_group(group);
144 mutex_unlock(&fsnotify_grp_mutex);
145}
146
147/*
148 * Drop a reference to a group. Free it if it's through.
149 */
150void fsnotify_put_group(struct fsnotify_group *group)
151{
152 if (!atomic_dec_and_mutex_lock(&group->refcnt, &fsnotify_grp_mutex))
153 return;
154
155 /*
156 * OK, now we know that there's no other users *and* we hold mutex,
157 * so no new references will appear
158 */
159 __fsnotify_evict_group(group);
160
161 /*
162 * now it's off the list, so the only thing we might care about is
163 * srcu access....
164 */
165 mutex_unlock(&fsnotify_grp_mutex);
166 synchronize_srcu(&fsnotify_grp_srcu);
167
168 /* and now it is really dead. _Nothing_ could be seeing it */
169 fsnotify_recalc_global_mask();
170 fsnotify_destroy_group(group);
171}
172
173/*
174 * Simply run the fsnotify_groups list and find a group which matches
175 * the given parameters. If a group is found we take a reference to that
176 * group.
177 */
178static struct fsnotify_group *fsnotify_find_group(unsigned int group_num, __u32 mask,
179 const struct fsnotify_ops *ops)
180{
181 struct fsnotify_group *group_iter;
182 struct fsnotify_group *group = NULL;
183
184 BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
185
186 list_for_each_entry_rcu(group_iter, &fsnotify_groups, group_list) {
187 if (group_iter->group_num == group_num) {
188 if ((group_iter->mask == mask) &&
189 (group_iter->ops == ops)) {
190 fsnotify_get_group(group_iter);
191 group = group_iter;
192 } else
193 group = ERR_PTR(-EEXIST);
194 }
195 }
196 return group;
197}
198
199/*
200 * Either finds an existing group which matches the group_num, mask, and ops or
201 * creates a new group and adds it to the global group list. In either case we
202 * take a reference for the group returned.
203 */
204struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask,
205 const struct fsnotify_ops *ops)
206{
207 struct fsnotify_group *group, *tgroup;
208
209 /* very low use, simpler locking if we just always alloc */
210 group = kmalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
211 if (!group)
212 return ERR_PTR(-ENOMEM);
213
214 atomic_set(&group->refcnt, 1);
215
216 group->on_group_list = 0;
217 group->group_num = group_num;
218 group->mask = mask;
219
220 mutex_init(&group->notification_mutex);
221 INIT_LIST_HEAD(&group->notification_list);
222 init_waitqueue_head(&group->notification_waitq);
223 group->q_len = 0;
224 group->max_events = UINT_MAX;
225
226 spin_lock_init(&group->mark_lock);
227 atomic_set(&group->num_marks, 0);
228 INIT_LIST_HEAD(&group->mark_entries);
229
230 group->ops = ops;
231
232 mutex_lock(&fsnotify_grp_mutex);
233 tgroup = fsnotify_find_group(group_num, mask, ops);
234 if (tgroup) {
235 /* group already exists */
236 mutex_unlock(&fsnotify_grp_mutex);
237 /* destroy the new one we made */
238 fsnotify_put_group(group);
239 return tgroup;
240 }
241
242 /* group not found, add a new one */
243 list_add_rcu(&group->group_list, &fsnotify_groups);
244 group->on_group_list = 1;
245 /* being on the fsnotify_groups list holds one num_marks */
246 atomic_inc(&group->num_marks);
247
248 mutex_unlock(&fsnotify_grp_mutex);
249
250 if (mask)
251 fsnotify_recalc_global_mask();
252
253 return group;
254}
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
new file mode 100644
index 000000000000..c8a07c65482b
--- /dev/null
+++ b/fs/notify/inode_mark.c
@@ -0,0 +1,426 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19/*
20 * fsnotify inode mark locking/lifetime/and refcnting
21 *
22 * REFCNT:
23 * The mark->refcnt tells how many "things" in the kernel currently are
24 * referencing this object. The object typically will live inside the kernel
25 * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task
26 * which can find this object holding the appropriete locks, can take a reference
27 * and the object itself is guarenteed to survive until the reference is dropped.
28 *
29 * LOCKING:
30 * There are 3 spinlocks involved with fsnotify inode marks and they MUST
31 * be taken in order as follows:
32 *
33 * entry->lock
34 * group->mark_lock
35 * inode->i_lock
36 *
37 * entry->lock protects 2 things, entry->group and entry->inode. You must hold
38 * that lock to dereference either of these things (they could be NULL even with
39 * the lock)
40 *
41 * group->mark_lock protects the mark_entries list anchored inside a given group
42 * and each entry is hooked via the g_list. It also sorta protects the
43 * free_g_list, which when used is anchored by a private list on the stack of the
44 * task which held the group->mark_lock.
45 *
46 * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a
47 * given inode and each entry is hooked via the i_list. (and sorta the
48 * free_i_list)
49 *
50 *
51 * LIFETIME:
52 * Inode marks survive between when they are added to an inode and when their
53 * refcnt==0.
54 *
55 * The inode mark can be cleared for a number of different reasons including:
56 * - The inode is unlinked for the last time. (fsnotify_inode_remove)
57 * - The inode is being evicted from cache. (fsnotify_inode_delete)
58 * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes)
59 * - Something explicitly requests that it be removed. (fsnotify_destroy_mark_by_entry)
60 * - The fsnotify_group associated with the mark is going away and all such marks
61 * need to be cleaned up. (fsnotify_clear_marks_by_group)
62 *
63 * Worst case we are given an inode and need to clean up all the marks on that
64 * inode. We take i_lock and walk the i_fsnotify_mark_entries safely. For each
65 * mark on the list we take a reference (so the mark can't disappear under us).
66 * We remove that mark form the inode's list of marks and we add this mark to a
67 * private list anchored on the stack using i_free_list; At this point we no
68 * longer fear anything finding the mark using the inode's list of marks.
69 *
70 * We can safely and locklessly run the private list on the stack of everything
71 * we just unattached from the original inode. For each mark on the private list
72 * we grab the mark-> and can thus dereference mark->group and mark->inode. If
73 * we see the group and inode are not NULL we take those locks. Now holding all
74 * 3 locks we can completely remove the mark from other tasks finding it in the
75 * future. Remember, 10 things might already be referencing this mark, but they
76 * better be holding a ref. We drop our reference we took before we unhooked it
77 * from the inode. When the ref hits 0 we can free the mark.
78 *
79 * Very similarly for freeing by group, except we use free_g_list.
80 *
81 * This has the very interesting property of being able to run concurrently with
82 * any (or all) other directions.
83 */
84
85#include <linux/fs.h>
86#include <linux/init.h>
87#include <linux/kernel.h>
88#include <linux/module.h>
89#include <linux/mutex.h>
90#include <linux/slab.h>
91#include <linux/spinlock.h>
92#include <linux/writeback.h> /* for inode_lock */
93
94#include <asm/atomic.h>
95
96#include <linux/fsnotify_backend.h>
97#include "fsnotify.h"
98
99void fsnotify_get_mark(struct fsnotify_mark_entry *entry)
100{
101 atomic_inc(&entry->refcnt);
102}
103
104void fsnotify_put_mark(struct fsnotify_mark_entry *entry)
105{
106 if (atomic_dec_and_test(&entry->refcnt))
107 entry->free_mark(entry);
108}
109
110/*
111 * Recalculate the mask of events relevant to a given inode locked.
112 */
113static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
114{
115 struct fsnotify_mark_entry *entry;
116 struct hlist_node *pos;
117 __u32 new_mask = 0;
118
119 assert_spin_locked(&inode->i_lock);
120
121 hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list)
122 new_mask |= entry->mask;
123 inode->i_fsnotify_mask = new_mask;
124}
125
126/*
127 * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types
128 * any notifier is interested in hearing for this inode.
129 */
130void fsnotify_recalc_inode_mask(struct inode *inode)
131{
132 spin_lock(&inode->i_lock);
133 fsnotify_recalc_inode_mask_locked(inode);
134 spin_unlock(&inode->i_lock);
135
136 __fsnotify_update_child_dentry_flags(inode);
137}
138
139/*
140 * Any time a mark is getting freed we end up here.
141 * The caller had better be holding a reference to this mark so we don't actually
142 * do the final put under the entry->lock
143 */
144void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
145{
146 struct fsnotify_group *group;
147 struct inode *inode;
148
149 spin_lock(&entry->lock);
150
151 group = entry->group;
152 inode = entry->inode;
153
154 BUG_ON(group && !inode);
155 BUG_ON(!group && inode);
156
157 /* if !group something else already marked this to die */
158 if (!group) {
159 spin_unlock(&entry->lock);
160 return;
161 }
162
163 /* 1 from caller and 1 for being on i_list/g_list */
164 BUG_ON(atomic_read(&entry->refcnt) < 2);
165
166 spin_lock(&group->mark_lock);
167 spin_lock(&inode->i_lock);
168
169 hlist_del_init(&entry->i_list);
170 entry->inode = NULL;
171
172 list_del_init(&entry->g_list);
173 entry->group = NULL;
174
175 fsnotify_put_mark(entry); /* for i_list and g_list */
176
177 /*
178 * this mark is now off the inode->i_fsnotify_mark_entries list and we
179 * hold the inode->i_lock, so this is the perfect time to update the
180 * inode->i_fsnotify_mask
181 */
182 fsnotify_recalc_inode_mask_locked(inode);
183
184 spin_unlock(&inode->i_lock);
185 spin_unlock(&group->mark_lock);
186 spin_unlock(&entry->lock);
187
188 /*
189 * Some groups like to know that marks are being freed. This is a
190 * callback to the group function to let it know that this entry
191 * is being freed.
192 */
193 if (group->ops->freeing_mark)
194 group->ops->freeing_mark(entry, group);
195
196 /*
197 * __fsnotify_update_child_dentry_flags(inode);
198 *
199 * I really want to call that, but we can't, we have no idea if the inode
200 * still exists the second we drop the entry->lock.
201 *
202 * The next time an event arrive to this inode from one of it's children
203 * __fsnotify_parent will see that the inode doesn't care about it's
204 * children and will update all of these flags then. So really this
205 * is just a lazy update (and could be a perf win...)
206 */
207
208
209 iput(inode);
210
211 /*
212 * it's possible that this group tried to destroy itself, but this
213 * this mark was simultaneously being freed by inode. If that's the
214 * case, we finish freeing the group here.
215 */
216 if (unlikely(atomic_dec_and_test(&group->num_marks)))
217 fsnotify_final_destroy_group(group);
218}
219
220/*
221 * Given a group, destroy all of the marks associated with that group.
222 */
223void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
224{
225 struct fsnotify_mark_entry *lentry, *entry;
226 LIST_HEAD(free_list);
227
228 spin_lock(&group->mark_lock);
229 list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) {
230 list_add(&entry->free_g_list, &free_list);
231 list_del_init(&entry->g_list);
232 fsnotify_get_mark(entry);
233 }
234 spin_unlock(&group->mark_lock);
235
236 list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) {
237 fsnotify_destroy_mark_by_entry(entry);
238 fsnotify_put_mark(entry);
239 }
240}
241
242/*
243 * Given an inode, destroy all of the marks associated with that inode.
244 */
245void fsnotify_clear_marks_by_inode(struct inode *inode)
246{
247 struct fsnotify_mark_entry *entry, *lentry;
248 struct hlist_node *pos, *n;
249 LIST_HEAD(free_list);
250
251 spin_lock(&inode->i_lock);
252 hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) {
253 list_add(&entry->free_i_list, &free_list);
254 hlist_del_init(&entry->i_list);
255 fsnotify_get_mark(entry);
256 }
257 spin_unlock(&inode->i_lock);
258
259 list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) {
260 fsnotify_destroy_mark_by_entry(entry);
261 fsnotify_put_mark(entry);
262 }
263}
264
265/*
266 * given a group and inode, find the mark associated with that combination.
267 * if found take a reference to that mark and return it, else return NULL
268 */
269struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group,
270 struct inode *inode)
271{
272 struct fsnotify_mark_entry *entry;
273 struct hlist_node *pos;
274
275 assert_spin_locked(&inode->i_lock);
276
277 hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) {
278 if (entry->group == group) {
279 fsnotify_get_mark(entry);
280 return entry;
281 }
282 }
283 return NULL;
284}
285
286/*
287 * Nothing fancy, just initialize lists and locks and counters.
288 */
289void fsnotify_init_mark(struct fsnotify_mark_entry *entry,
290 void (*free_mark)(struct fsnotify_mark_entry *entry))
291
292{
293 spin_lock_init(&entry->lock);
294 atomic_set(&entry->refcnt, 1);
295 INIT_HLIST_NODE(&entry->i_list);
296 entry->group = NULL;
297 entry->mask = 0;
298 entry->inode = NULL;
299 entry->free_mark = free_mark;
300}
301
302/*
303 * Attach an initialized mark entry to a given group and inode.
304 * These marks may be used for the fsnotify backend to determine which
305 * event types should be delivered to which group and for which inodes.
306 */
307int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
308 struct fsnotify_group *group, struct inode *inode)
309{
310 struct fsnotify_mark_entry *lentry;
311 int ret = 0;
312
313 inode = igrab(inode);
314 if (unlikely(!inode))
315 return -EINVAL;
316
317 /*
318 * LOCKING ORDER!!!!
319 * entry->lock
320 * group->mark_lock
321 * inode->i_lock
322 */
323 spin_lock(&entry->lock);
324 spin_lock(&group->mark_lock);
325 spin_lock(&inode->i_lock);
326
327 entry->group = group;
328 entry->inode = inode;
329
330 lentry = fsnotify_find_mark_entry(group, inode);
331 if (!lentry) {
332 hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries);
333 list_add(&entry->g_list, &group->mark_entries);
334
335 fsnotify_get_mark(entry); /* for i_list and g_list */
336
337 atomic_inc(&group->num_marks);
338
339 fsnotify_recalc_inode_mask_locked(inode);
340 }
341
342 spin_unlock(&inode->i_lock);
343 spin_unlock(&group->mark_lock);
344 spin_unlock(&entry->lock);
345
346 if (lentry) {
347 ret = -EEXIST;
348 iput(inode);
349 fsnotify_put_mark(lentry);
350 } else {
351 __fsnotify_update_child_dentry_flags(inode);
352 }
353
354 return ret;
355}
356
357/**
358 * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
359 * @list: list of inodes being unmounted (sb->s_inodes)
360 *
361 * Called with inode_lock held, protecting the unmounting super block's list
362 * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
363 * We temporarily drop inode_lock, however, and CAN block.
364 */
365void fsnotify_unmount_inodes(struct list_head *list)
366{
367 struct inode *inode, *next_i, *need_iput = NULL;
368
369 list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
370 struct inode *need_iput_tmp;
371
372 /*
373 * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
374 * I_WILL_FREE, or I_NEW which is fine because by that point
375 * the inode cannot have any associated watches.
376 */
377 if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
378 continue;
379
380 /*
381 * If i_count is zero, the inode cannot have any watches and
382 * doing an __iget/iput with MS_ACTIVE clear would actually
383 * evict all inodes with zero i_count from icache which is
384 * unnecessarily violent and may in fact be illegal to do.
385 */
386 if (!atomic_read(&inode->i_count))
387 continue;
388
389 need_iput_tmp = need_iput;
390 need_iput = NULL;
391
392 /* In case fsnotify_inode_delete() drops a reference. */
393 if (inode != need_iput_tmp)
394 __iget(inode);
395 else
396 need_iput_tmp = NULL;
397
398 /* In case the dropping of a reference would nuke next_i. */
399 if ((&next_i->i_sb_list != list) &&
400 atomic_read(&next_i->i_count) &&
401 !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) {
402 __iget(next_i);
403 need_iput = next_i;
404 }
405
406 /*
407 * We can safely drop inode_lock here because we hold
408 * references on both inode and next_i. Also no new inodes
409 * will be added since the umount has begun. Finally,
410 * iprune_mutex keeps shrink_icache_memory() away.
411 */
412 spin_unlock(&inode_lock);
413
414 if (need_iput_tmp)
415 iput(need_iput_tmp);
416
417 /* for each watch, send FS_UNMOUNT and then remove it */
418 fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
419
420 fsnotify_inode_delete(inode);
421
422 iput(inode);
423
424 spin_lock(&inode_lock);
425 }
426}
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index 446792841023..5356884289a1 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -1,26 +1,30 @@
1config INOTIFY 1config INOTIFY
2 bool "Inotify file change notification support" 2 bool "Inotify file change notification support"
3 default y 3 default n
4 ---help--- 4 ---help---
5 Say Y here to enable inotify support. Inotify is a file change 5 Say Y here to enable legacy in kernel inotify support. Inotify is a
6 notification system and a replacement for dnotify. Inotify fixes 6 file change notification system. It is a replacement for dnotify.
7 numerous shortcomings in dnotify and introduces several new features 7 This option only provides the legacy inotify in kernel API. There
8 including multiple file events, one-shot support, and unmount 8 are no in tree kernel users of this interface since it is deprecated.
9 notification. 9 You only need this if you are loading an out of tree kernel module
10 that uses inotify.
10 11
11 For more information, see <file:Documentation/filesystems/inotify.txt> 12 For more information, see <file:Documentation/filesystems/inotify.txt>
12 13
13 If unsure, say Y. 14 If unsure, say N.
14 15
15config INOTIFY_USER 16config INOTIFY_USER
16 bool "Inotify support for userspace" 17 bool "Inotify support for userspace"
17 depends on INOTIFY 18 depends on FSNOTIFY
18 default y 19 default y
19 ---help--- 20 ---help---
20 Say Y here to enable inotify support for userspace, including the 21 Say Y here to enable inotify support for userspace, including the
21 associated system calls. Inotify allows monitoring of both files and 22 associated system calls. Inotify allows monitoring of both files and
22 directories via a single open fd. Events are read from the file 23 directories via a single open fd. Events are read from the file
23 descriptor, which is also select()- and poll()-able. 24 descriptor, which is also select()- and poll()-able.
25 Inotify fixes numerous shortcomings in dnotify and introduces several
26 new features including multiple file events, one-shot support, and
27 unmount notification.
24 28
25 For more information, see <file:Documentation/filesystems/inotify.txt> 29 For more information, see <file:Documentation/filesystems/inotify.txt>
26 30
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
index e290f3bb9d8d..943828171362 100644
--- a/fs/notify/inotify/Makefile
+++ b/fs/notify/inotify/Makefile
@@ -1,2 +1,2 @@
1obj-$(CONFIG_INOTIFY) += inotify.o 1obj-$(CONFIG_INOTIFY) += inotify.o
2obj-$(CONFIG_INOTIFY_USER) += inotify_user.o 2obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index 220c13f0d73d..40b1cf914ccb 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -32,6 +32,7 @@
32#include <linux/list.h> 32#include <linux/list.h>
33#include <linux/writeback.h> 33#include <linux/writeback.h>
34#include <linux/inotify.h> 34#include <linux/inotify.h>
35#include <linux/fsnotify_backend.h>
35 36
36static atomic_t inotify_cookie; 37static atomic_t inotify_cookie;
37 38
@@ -905,6 +906,25 @@ EXPORT_SYMBOL_GPL(inotify_rm_watch);
905 */ 906 */
906static int __init inotify_setup(void) 907static int __init inotify_setup(void)
907{ 908{
909 BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
910 BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
911 BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
912 BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
913 BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
914 BUILD_BUG_ON(IN_OPEN != FS_OPEN);
915 BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
916 BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
917 BUILD_BUG_ON(IN_CREATE != FS_CREATE);
918 BUILD_BUG_ON(IN_DELETE != FS_DELETE);
919 BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
920 BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
921 BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
922
923 BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
924 BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR);
925 BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
926 BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
927
908 atomic_set(&inotify_cookie, 0); 928 atomic_set(&inotify_cookie, 0);
909 929
910 return 0; 930 return 0;
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
new file mode 100644
index 000000000000..ea2605a58b8a
--- /dev/null
+++ b/fs/notify/inotify/inotify.h
@@ -0,0 +1,21 @@
1#include <linux/fsnotify_backend.h>
2#include <linux/inotify.h>
3#include <linux/slab.h> /* struct kmem_cache */
4
5extern struct kmem_cache *event_priv_cachep;
6
7struct inotify_event_private_data {
8 struct fsnotify_event_private_data fsnotify_event_priv_data;
9 int wd;
10};
11
12struct inotify_inode_mark_entry {
13 /* fsnotify_mark_entry MUST be the first thing */
14 struct fsnotify_mark_entry fsn_entry;
15 int wd;
16};
17
18extern void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group);
19extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
20
21extern const struct fsnotify_ops inotify_fsnotify_ops;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
new file mode 100644
index 000000000000..7ef75b83247e
--- /dev/null
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -0,0 +1,138 @@
1/*
2 * fs/inotify_user.c - inotify support for userspace
3 *
4 * Authors:
5 * John McCutchan <ttb@tentacle.dhs.org>
6 * Robert Love <rml@novell.com>
7 *
8 * Copyright (C) 2005 John McCutchan
9 * Copyright 2006 Hewlett-Packard Development Company, L.P.
10 *
11 * Copyright (C) 2009 Eric Paris <Red Hat Inc>
12 * inotify was largely rewriten to make use of the fsnotify infrastructure
13 *
14 * This program is free software; you can redistribute it and/or modify it
15 * under the terms of the GNU General Public License as published by the
16 * Free Software Foundation; either version 2, or (at your option) any
17 * later version.
18 *
19 * This program is distributed in the hope that it will be useful, but
20 * WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 * General Public License for more details.
23 */
24
25#include <linux/fs.h> /* struct inode */
26#include <linux/fsnotify_backend.h>
27#include <linux/inotify.h>
28#include <linux/path.h> /* struct path */
29#include <linux/slab.h> /* kmem_* */
30#include <linux/types.h>
31
32#include "inotify.h"
33
34static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event)
35{
36 struct fsnotify_mark_entry *entry;
37 struct inotify_inode_mark_entry *ientry;
38 struct inode *to_tell;
39 struct inotify_event_private_data *event_priv;
40 struct fsnotify_event_private_data *fsn_event_priv;
41 int wd, ret;
42
43 to_tell = event->to_tell;
44
45 spin_lock(&to_tell->i_lock);
46 entry = fsnotify_find_mark_entry(group, to_tell);
47 spin_unlock(&to_tell->i_lock);
48 /* race with watch removal? We already passes should_send */
49 if (unlikely(!entry))
50 return 0;
51 ientry = container_of(entry, struct inotify_inode_mark_entry,
52 fsn_entry);
53 wd = ientry->wd;
54
55 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
56 if (unlikely(!event_priv))
57 return -ENOMEM;
58
59 fsn_event_priv = &event_priv->fsnotify_event_priv_data;
60
61 fsn_event_priv->group = group;
62 event_priv->wd = wd;
63
64 ret = fsnotify_add_notify_event(group, event, fsn_event_priv);
65 /* EEXIST is not an error */
66 if (ret == -EEXIST)
67 ret = 0;
68
69 /* did event_priv get attached? */
70 if (list_empty(&fsn_event_priv->event_list))
71 inotify_free_event_priv(fsn_event_priv);
72
73 /*
74 * If we hold the entry until after the event is on the queue
75 * IN_IGNORED won't be able to pass this event in the queue
76 */
77 fsnotify_put_mark(entry);
78
79 return ret;
80}
81
82static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
83{
84 inotify_destroy_mark_entry(entry, group);
85}
86
87static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask)
88{
89 struct fsnotify_mark_entry *entry;
90 bool send;
91
92 spin_lock(&inode->i_lock);
93 entry = fsnotify_find_mark_entry(group, inode);
94 spin_unlock(&inode->i_lock);
95 if (!entry)
96 return false;
97
98 mask = (mask & ~FS_EVENT_ON_CHILD);
99 send = (entry->mask & mask);
100
101 /* find took a reference */
102 fsnotify_put_mark(entry);
103
104 return send;
105}
106
107static int idr_callback(int id, void *p, void *data)
108{
109 BUG();
110 return 0;
111}
112
113static void inotify_free_group_priv(struct fsnotify_group *group)
114{
115 /* ideally the idr is empty and we won't hit the BUG in teh callback */
116 idr_for_each(&group->inotify_data.idr, idr_callback, NULL);
117 idr_remove_all(&group->inotify_data.idr);
118 idr_destroy(&group->inotify_data.idr);
119}
120
121void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
122{
123 struct inotify_event_private_data *event_priv;
124
125
126 event_priv = container_of(fsn_event_priv, struct inotify_event_private_data,
127 fsnotify_event_priv_data);
128
129 kmem_cache_free(event_priv_cachep, event_priv);
130}
131
132const struct fsnotify_ops inotify_fsnotify_ops = {
133 .handle_event = inotify_handle_event,
134 .should_send_event = inotify_should_send_event,
135 .free_group_priv = inotify_free_group_priv,
136 .free_event_priv = inotify_free_event_priv,
137 .freeing_mark = inotify_freeing_mark,
138};
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 1634319e2404..982a412ac5bc 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -8,6 +8,9 @@
8 * Copyright (C) 2005 John McCutchan 8 * Copyright (C) 2005 John McCutchan
9 * Copyright 2006 Hewlett-Packard Development Company, L.P. 9 * Copyright 2006 Hewlett-Packard Development Company, L.P.
10 * 10 *
11 * Copyright (C) 2009 Eric Paris <Red Hat Inc>
12 * inotify was largely rewriten to make use of the fsnotify infrastructure
13 *
11 * This program is free software; you can redistribute it and/or modify it 14 * This program is free software; you can redistribute it and/or modify it
12 * under the terms of the GNU General Public License as published by the 15 * under the terms of the GNU General Public License as published by the
13 * Free Software Foundation; either version 2, or (at your option) any 16 * Free Software Foundation; either version 2, or (at your option) any
@@ -19,94 +22,48 @@
19 * General Public License for more details. 22 * General Public License for more details.
20 */ 23 */
21 24
22#include <linux/kernel.h>
23#include <linux/sched.h>
24#include <linux/slab.h>
25#include <linux/fs.h>
26#include <linux/file.h> 25#include <linux/file.h>
27#include <linux/mount.h> 26#include <linux/fs.h> /* struct inode */
28#include <linux/namei.h> 27#include <linux/fsnotify_backend.h>
29#include <linux/poll.h> 28#include <linux/idr.h>
30#include <linux/init.h> 29#include <linux/init.h> /* module_init */
31#include <linux/list.h>
32#include <linux/inotify.h> 30#include <linux/inotify.h>
31#include <linux/kernel.h> /* roundup() */
32#include <linux/magic.h> /* superblock magic number */
33#include <linux/mount.h> /* mntget */
34#include <linux/namei.h> /* LOOKUP_FOLLOW */
35#include <linux/path.h> /* struct path */
36#include <linux/sched.h> /* struct user */
37#include <linux/slab.h> /* struct kmem_cache */
33#include <linux/syscalls.h> 38#include <linux/syscalls.h>
34#include <linux/magic.h> 39#include <linux/types.h>
40#include <linux/uaccess.h>
41#include <linux/poll.h>
42#include <linux/wait.h>
35 43
36#include <asm/ioctls.h> 44#include "inotify.h"
37 45
38static struct kmem_cache *watch_cachep __read_mostly; 46#include <asm/ioctls.h>
39static struct kmem_cache *event_cachep __read_mostly;
40 47
41static struct vfsmount *inotify_mnt __read_mostly; 48static struct vfsmount *inotify_mnt __read_mostly;
42 49
50/* this just sits here and wastes global memory. used to just pad userspace messages with zeros */
51static struct inotify_event nul_inotify_event;
52
43/* these are configurable via /proc/sys/fs/inotify/ */ 53/* these are configurable via /proc/sys/fs/inotify/ */
44static int inotify_max_user_instances __read_mostly; 54static int inotify_max_user_instances __read_mostly;
45static int inotify_max_user_watches __read_mostly;
46static int inotify_max_queued_events __read_mostly; 55static int inotify_max_queued_events __read_mostly;
56int inotify_max_user_watches __read_mostly;
47 57
48/* 58static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
49 * Lock ordering: 59struct kmem_cache *event_priv_cachep __read_mostly;
50 * 60static struct fsnotify_event *inotify_ignored_event;
51 * inotify_dev->up_mutex (ensures we don't re-add the same watch)
52 * inode->inotify_mutex (protects inode's watch list)
53 * inotify_handle->mutex (protects inotify_handle's watch list)
54 * inotify_dev->ev_mutex (protects device's event queue)
55 */
56 61
57/* 62/*
58 * Lifetimes of the main data structures: 63 * When inotify registers a new group it increments this and uses that
59 * 64 * value as an offset to set the fsnotify group "name" and priority.
60 * inotify_device: Lifetime is managed by reference count, from
61 * sys_inotify_init() until release. Additional references can bump the count
62 * via get_inotify_dev() and drop the count via put_inotify_dev().
63 *
64 * inotify_user_watch: Lifetime is from create_watch() to the receipt of an
65 * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the
66 * first event, or to inotify_destroy().
67 */ 65 */
68 66static atomic_t inotify_grp_num;
69/*
70 * struct inotify_device - represents an inotify instance
71 *
72 * This structure is protected by the mutex 'mutex'.
73 */
74struct inotify_device {
75 wait_queue_head_t wq; /* wait queue for i/o */
76 struct mutex ev_mutex; /* protects event queue */
77 struct mutex up_mutex; /* synchronizes watch updates */
78 struct list_head events; /* list of queued events */
79 struct user_struct *user; /* user who opened this dev */
80 struct inotify_handle *ih; /* inotify handle */
81 struct fasync_struct *fa; /* async notification */
82 atomic_t count; /* reference count */
83 unsigned int queue_size; /* size of the queue (bytes) */
84 unsigned int event_count; /* number of pending events */
85 unsigned int max_events; /* maximum number of events */
86};
87
88/*
89 * struct inotify_kernel_event - An inotify event, originating from a watch and
90 * queued for user-space. A list of these is attached to each instance of the
91 * device. In read(), this list is walked and all events that can fit in the
92 * buffer are returned.
93 *
94 * Protected by dev->ev_mutex of the device in which we are queued.
95 */
96struct inotify_kernel_event {
97 struct inotify_event event; /* the user-space event */
98 struct list_head list; /* entry in inotify_device's list */
99 char *name; /* filename, if any */
100};
101
102/*
103 * struct inotify_user_watch - our version of an inotify_watch, we add
104 * a reference to the associated inotify_device.
105 */
106struct inotify_user_watch {
107 struct inotify_device *dev; /* associated device */
108 struct inotify_watch wdata; /* inotify watch data */
109};
110 67
111#ifdef CONFIG_SYSCTL 68#ifdef CONFIG_SYSCTL
112 69
@@ -149,280 +106,36 @@ ctl_table inotify_table[] = {
149}; 106};
150#endif /* CONFIG_SYSCTL */ 107#endif /* CONFIG_SYSCTL */
151 108
152static inline void get_inotify_dev(struct inotify_device *dev) 109static inline __u32 inotify_arg_to_mask(u32 arg)
153{
154 atomic_inc(&dev->count);
155}
156
157static inline void put_inotify_dev(struct inotify_device *dev)
158{
159 if (atomic_dec_and_test(&dev->count)) {
160 atomic_dec(&dev->user->inotify_devs);
161 free_uid(dev->user);
162 kfree(dev);
163 }
164}
165
166/*
167 * free_inotify_user_watch - cleans up the watch and its references
168 */
169static void free_inotify_user_watch(struct inotify_watch *w)
170{
171 struct inotify_user_watch *watch;
172 struct inotify_device *dev;
173
174 watch = container_of(w, struct inotify_user_watch, wdata);
175 dev = watch->dev;
176
177 atomic_dec(&dev->user->inotify_watches);
178 put_inotify_dev(dev);
179 kmem_cache_free(watch_cachep, watch);
180}
181
182/*
183 * kernel_event - create a new kernel event with the given parameters
184 *
185 * This function can sleep.
186 */
187static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
188 const char *name)
189{
190 struct inotify_kernel_event *kevent;
191
192 kevent = kmem_cache_alloc(event_cachep, GFP_NOFS);
193 if (unlikely(!kevent))
194 return NULL;
195
196 /* we hand this out to user-space, so zero it just in case */
197 memset(&kevent->event, 0, sizeof(struct inotify_event));
198
199 kevent->event.wd = wd;
200 kevent->event.mask = mask;
201 kevent->event.cookie = cookie;
202
203 INIT_LIST_HEAD(&kevent->list);
204
205 if (name) {
206 size_t len, rem, event_size = sizeof(struct inotify_event);
207
208 /*
209 * We need to pad the filename so as to properly align an
210 * array of inotify_event structures. Because the structure is
211 * small and the common case is a small filename, we just round
212 * up to the next multiple of the structure's sizeof. This is
213 * simple and safe for all architectures.
214 */
215 len = strlen(name) + 1;
216 rem = event_size - len;
217 if (len > event_size) {
218 rem = event_size - (len % event_size);
219 if (len % event_size == 0)
220 rem = 0;
221 }
222
223 kevent->name = kmalloc(len + rem, GFP_NOFS);
224 if (unlikely(!kevent->name)) {
225 kmem_cache_free(event_cachep, kevent);
226 return NULL;
227 }
228 memcpy(kevent->name, name, len);
229 if (rem)
230 memset(kevent->name + len, 0, rem);
231 kevent->event.len = len + rem;
232 } else {
233 kevent->event.len = 0;
234 kevent->name = NULL;
235 }
236
237 return kevent;
238}
239
240/*
241 * inotify_dev_get_event - return the next event in the given dev's queue
242 *
243 * Caller must hold dev->ev_mutex.
244 */
245static inline struct inotify_kernel_event *
246inotify_dev_get_event(struct inotify_device *dev)
247{
248 return list_entry(dev->events.next, struct inotify_kernel_event, list);
249}
250
251/*
252 * inotify_dev_get_last_event - return the last event in the given dev's queue
253 *
254 * Caller must hold dev->ev_mutex.
255 */
256static inline struct inotify_kernel_event *
257inotify_dev_get_last_event(struct inotify_device *dev)
258{ 110{
259 if (list_empty(&dev->events)) 111 __u32 mask;
260 return NULL;
261 return list_entry(dev->events.prev, struct inotify_kernel_event, list);
262}
263 112
264/* 113 /* everything should accept their own ignored and cares about children */
265 * inotify_dev_queue_event - event handler registered with core inotify, adds 114 mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD);
266 * a new event to the given device
267 *
268 * Can sleep (calls kernel_event()).
269 */
270static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask,
271 u32 cookie, const char *name,
272 struct inode *ignored)
273{
274 struct inotify_user_watch *watch;
275 struct inotify_device *dev;
276 struct inotify_kernel_event *kevent, *last;
277 115
278 watch = container_of(w, struct inotify_user_watch, wdata); 116 /* mask off the flags used to open the fd */
279 dev = watch->dev; 117 mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT));
280 118
281 mutex_lock(&dev->ev_mutex); 119 return mask;
282
283 /* we can safely put the watch as we don't reference it while
284 * generating the event
285 */
286 if (mask & IN_IGNORED || w->mask & IN_ONESHOT)
287 put_inotify_watch(w); /* final put */
288
289 /* coalescing: drop this event if it is a dupe of the previous */
290 last = inotify_dev_get_last_event(dev);
291 if (last && last->event.mask == mask && last->event.wd == wd &&
292 last->event.cookie == cookie) {
293 const char *lastname = last->name;
294
295 if (!name && !lastname)
296 goto out;
297 if (name && lastname && !strcmp(lastname, name))
298 goto out;
299 }
300
301 /* the queue overflowed and we already sent the Q_OVERFLOW event */
302 if (unlikely(dev->event_count > dev->max_events))
303 goto out;
304
305 /* if the queue overflows, we need to notify user space */
306 if (unlikely(dev->event_count == dev->max_events))
307 kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
308 else
309 kevent = kernel_event(wd, mask, cookie, name);
310
311 if (unlikely(!kevent))
312 goto out;
313
314 /* queue the event and wake up anyone waiting */
315 dev->event_count++;
316 dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
317 list_add_tail(&kevent->list, &dev->events);
318 wake_up_interruptible(&dev->wq);
319 kill_fasync(&dev->fa, SIGIO, POLL_IN);
320
321out:
322 mutex_unlock(&dev->ev_mutex);
323}
324
325/*
326 * remove_kevent - cleans up the given kevent
327 *
328 * Caller must hold dev->ev_mutex.
329 */
330static void remove_kevent(struct inotify_device *dev,
331 struct inotify_kernel_event *kevent)
332{
333 list_del(&kevent->list);
334
335 dev->event_count--;
336 dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
337}
338
339/*
340 * free_kevent - frees the given kevent.
341 */
342static void free_kevent(struct inotify_kernel_event *kevent)
343{
344 kfree(kevent->name);
345 kmem_cache_free(event_cachep, kevent);
346}
347
348/*
349 * inotify_dev_event_dequeue - destroy an event on the given device
350 *
351 * Caller must hold dev->ev_mutex.
352 */
353static void inotify_dev_event_dequeue(struct inotify_device *dev)
354{
355 if (!list_empty(&dev->events)) {
356 struct inotify_kernel_event *kevent;
357 kevent = inotify_dev_get_event(dev);
358 remove_kevent(dev, kevent);
359 free_kevent(kevent);
360 }
361}
362
363/*
364 * find_inode - resolve a user-given path to a specific inode
365 */
366static int find_inode(const char __user *dirname, struct path *path,
367 unsigned flags)
368{
369 int error;
370
371 error = user_path_at(AT_FDCWD, dirname, flags, path);
372 if (error)
373 return error;
374 /* you can only watch an inode if you have read permissions on it */
375 error = inode_permission(path->dentry->d_inode, MAY_READ);
376 if (error)
377 path_put(path);
378 return error;
379} 120}
380 121
381/* 122static inline u32 inotify_mask_to_arg(__u32 mask)
382 * create_watch - creates a watch on the given device.
383 *
384 * Callers must hold dev->up_mutex.
385 */
386static int create_watch(struct inotify_device *dev, struct inode *inode,
387 u32 mask)
388{ 123{
389 struct inotify_user_watch *watch; 124 return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED |
390 int ret; 125 IN_Q_OVERFLOW);
391
392 if (atomic_read(&dev->user->inotify_watches) >=
393 inotify_max_user_watches)
394 return -ENOSPC;
395
396 watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
397 if (unlikely(!watch))
398 return -ENOMEM;
399
400 /* save a reference to device and bump the count to make it official */
401 get_inotify_dev(dev);
402 watch->dev = dev;
403
404 atomic_inc(&dev->user->inotify_watches);
405
406 inotify_init_watch(&watch->wdata);
407 ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask);
408 if (ret < 0)
409 free_inotify_user_watch(&watch->wdata);
410
411 return ret;
412} 126}
413 127
414/* Device Interface */ 128/* intofiy userspace file descriptor functions */
415
416static unsigned int inotify_poll(struct file *file, poll_table *wait) 129static unsigned int inotify_poll(struct file *file, poll_table *wait)
417{ 130{
418 struct inotify_device *dev = file->private_data; 131 struct fsnotify_group *group = file->private_data;
419 int ret = 0; 132 int ret = 0;
420 133
421 poll_wait(file, &dev->wq, wait); 134 poll_wait(file, &group->notification_waitq, wait);
422 mutex_lock(&dev->ev_mutex); 135 mutex_lock(&group->notification_mutex);
423 if (!list_empty(&dev->events)) 136 if (!fsnotify_notify_queue_is_empty(group))
424 ret = POLLIN | POLLRDNORM; 137 ret = POLLIN | POLLRDNORM;
425 mutex_unlock(&dev->ev_mutex); 138 mutex_unlock(&group->notification_mutex);
426 139
427 return ret; 140 return ret;
428} 141}
@@ -432,26 +145,29 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
432 * enough to fit in "count". Return an error pointer if 145 * enough to fit in "count". Return an error pointer if
433 * not large enough. 146 * not large enough.
434 * 147 *
435 * Called with the device ev_mutex held. 148 * Called with the group->notification_mutex held.
436 */ 149 */
437static struct inotify_kernel_event *get_one_event(struct inotify_device *dev, 150static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
438 size_t count) 151 size_t count)
439{ 152{
440 size_t event_size = sizeof(struct inotify_event); 153 size_t event_size = sizeof(struct inotify_event);
441 struct inotify_kernel_event *kevent; 154 struct fsnotify_event *event;
442 155
443 if (list_empty(&dev->events)) 156 if (fsnotify_notify_queue_is_empty(group))
444 return NULL; 157 return NULL;
445 158
446 kevent = inotify_dev_get_event(dev); 159 event = fsnotify_peek_notify_event(group);
447 if (kevent->name) 160
448 event_size += kevent->event.len; 161 event_size += roundup(event->name_len, event_size);
449 162
450 if (event_size > count) 163 if (event_size > count)
451 return ERR_PTR(-EINVAL); 164 return ERR_PTR(-EINVAL);
452 165
453 remove_kevent(dev, kevent); 166 /* held the notification_mutex the whole time, so this is the
454 return kevent; 167 * same event we peeked above */
168 fsnotify_remove_notify_event(group);
169
170 return event;
455} 171}
456 172
457/* 173/*
@@ -460,51 +176,90 @@ static struct inotify_kernel_event *get_one_event(struct inotify_device *dev,
460 * We already checked that the event size is smaller than the 176 * We already checked that the event size is smaller than the
461 * buffer we had in "get_one_event()" above. 177 * buffer we had in "get_one_event()" above.
462 */ 178 */
463static ssize_t copy_event_to_user(struct inotify_kernel_event *kevent, 179static ssize_t copy_event_to_user(struct fsnotify_group *group,
180 struct fsnotify_event *event,
464 char __user *buf) 181 char __user *buf)
465{ 182{
183 struct inotify_event inotify_event;
184 struct fsnotify_event_private_data *fsn_priv;
185 struct inotify_event_private_data *priv;
466 size_t event_size = sizeof(struct inotify_event); 186 size_t event_size = sizeof(struct inotify_event);
187 size_t name_len;
188
189 /* we get the inotify watch descriptor from the event private data */
190 spin_lock(&event->lock);
191 fsn_priv = fsnotify_remove_priv_from_event(group, event);
192 spin_unlock(&event->lock);
193
194 if (!fsn_priv)
195 inotify_event.wd = -1;
196 else {
197 priv = container_of(fsn_priv, struct inotify_event_private_data,
198 fsnotify_event_priv_data);
199 inotify_event.wd = priv->wd;
200 inotify_free_event_priv(fsn_priv);
201 }
202
203 /* round up event->name_len so it is a multiple of event_size */
204 name_len = roundup(event->name_len, event_size);
205 inotify_event.len = name_len;
206
207 inotify_event.mask = inotify_mask_to_arg(event->mask);
208 inotify_event.cookie = event->sync_cookie;
467 209
468 if (copy_to_user(buf, &kevent->event, event_size)) 210 /* send the main event */
211 if (copy_to_user(buf, &inotify_event, event_size))
469 return -EFAULT; 212 return -EFAULT;
470 213
471 if (kevent->name) { 214 buf += event_size;
472 buf += event_size;
473 215
474 if (copy_to_user(buf, kevent->name, kevent->event.len)) 216 /*
217 * fsnotify only stores the pathname, so here we have to send the pathname
218 * and then pad that pathname out to a multiple of sizeof(inotify_event)
219 * with zeros. I get my zeros from the nul_inotify_event.
220 */
221 if (name_len) {
222 unsigned int len_to_zero = name_len - event->name_len;
223 /* copy the path name */
224 if (copy_to_user(buf, event->file_name, event->name_len))
475 return -EFAULT; 225 return -EFAULT;
226 buf += event->name_len;
476 227
477 event_size += kevent->event.len; 228 /* fill userspace with 0's from nul_inotify_event */
229 if (copy_to_user(buf, &nul_inotify_event, len_to_zero))
230 return -EFAULT;
231 buf += len_to_zero;
232 event_size += name_len;
478 } 233 }
234
479 return event_size; 235 return event_size;
480} 236}
481 237
482static ssize_t inotify_read(struct file *file, char __user *buf, 238static ssize_t inotify_read(struct file *file, char __user *buf,
483 size_t count, loff_t *pos) 239 size_t count, loff_t *pos)
484{ 240{
485 struct inotify_device *dev; 241 struct fsnotify_group *group;
242 struct fsnotify_event *kevent;
486 char __user *start; 243 char __user *start;
487 int ret; 244 int ret;
488 DEFINE_WAIT(wait); 245 DEFINE_WAIT(wait);
489 246
490 start = buf; 247 start = buf;
491 dev = file->private_data; 248 group = file->private_data;
492 249
493 while (1) { 250 while (1) {
494 struct inotify_kernel_event *kevent; 251 prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
495 252
496 prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); 253 mutex_lock(&group->notification_mutex);
497 254 kevent = get_one_event(group, count);
498 mutex_lock(&dev->ev_mutex); 255 mutex_unlock(&group->notification_mutex);
499 kevent = get_one_event(dev, count);
500 mutex_unlock(&dev->ev_mutex);
501 256
502 if (kevent) { 257 if (kevent) {
503 ret = PTR_ERR(kevent); 258 ret = PTR_ERR(kevent);
504 if (IS_ERR(kevent)) 259 if (IS_ERR(kevent))
505 break; 260 break;
506 ret = copy_event_to_user(kevent, buf); 261 ret = copy_event_to_user(group, kevent, buf);
507 free_kevent(kevent); 262 fsnotify_put_event(kevent);
508 if (ret < 0) 263 if (ret < 0)
509 break; 264 break;
510 buf += ret; 265 buf += ret;
@@ -525,7 +280,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
525 schedule(); 280 schedule();
526 } 281 }
527 282
528 finish_wait(&dev->wq, &wait); 283 finish_wait(&group->notification_waitq, &wait);
529 if (start != buf && ret != -EFAULT) 284 if (start != buf && ret != -EFAULT)
530 ret = buf - start; 285 ret = buf - start;
531 return ret; 286 return ret;
@@ -533,25 +288,19 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
533 288
534static int inotify_fasync(int fd, struct file *file, int on) 289static int inotify_fasync(int fd, struct file *file, int on)
535{ 290{
536 struct inotify_device *dev = file->private_data; 291 struct fsnotify_group *group = file->private_data;
537 292
538 return fasync_helper(fd, file, on, &dev->fa) >= 0 ? 0 : -EIO; 293 return fasync_helper(fd, file, on, &group->inotify_data.fa) >= 0 ? 0 : -EIO;
539} 294}
540 295
541static int inotify_release(struct inode *ignored, struct file *file) 296static int inotify_release(struct inode *ignored, struct file *file)
542{ 297{
543 struct inotify_device *dev = file->private_data; 298 struct fsnotify_group *group = file->private_data;
544
545 inotify_destroy(dev->ih);
546 299
547 /* destroy all of the events on this device */ 300 fsnotify_clear_marks_by_group(group);
548 mutex_lock(&dev->ev_mutex);
549 while (!list_empty(&dev->events))
550 inotify_dev_event_dequeue(dev);
551 mutex_unlock(&dev->ev_mutex);
552 301
553 /* free this device: the put matching the get in inotify_init() */ 302 /* free this group, matching get was inotify_init->fsnotify_obtain_group */
554 put_inotify_dev(dev); 303 fsnotify_put_group(group);
555 304
556 return 0; 305 return 0;
557} 306}
@@ -559,16 +308,27 @@ static int inotify_release(struct inode *ignored, struct file *file)
559static long inotify_ioctl(struct file *file, unsigned int cmd, 308static long inotify_ioctl(struct file *file, unsigned int cmd,
560 unsigned long arg) 309 unsigned long arg)
561{ 310{
562 struct inotify_device *dev; 311 struct fsnotify_group *group;
312 struct fsnotify_event_holder *holder;
313 struct fsnotify_event *event;
563 void __user *p; 314 void __user *p;
564 int ret = -ENOTTY; 315 int ret = -ENOTTY;
316 size_t send_len = 0;
565 317
566 dev = file->private_data; 318 group = file->private_data;
567 p = (void __user *) arg; 319 p = (void __user *) arg;
568 320
569 switch (cmd) { 321 switch (cmd) {
570 case FIONREAD: 322 case FIONREAD:
571 ret = put_user(dev->queue_size, (int __user *) p); 323 mutex_lock(&group->notification_mutex);
324 list_for_each_entry(holder, &group->notification_list, event_list) {
325 event = holder->event;
326 send_len += sizeof(struct inotify_event);
327 send_len += roundup(event->name_len,
328 sizeof(struct inotify_event));
329 }
330 mutex_unlock(&group->notification_mutex);
331 ret = put_user(send_len, (int __user *) p);
572 break; 332 break;
573 } 333 }
574 334
@@ -576,23 +336,233 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
576} 336}
577 337
578static const struct file_operations inotify_fops = { 338static const struct file_operations inotify_fops = {
579 .poll = inotify_poll, 339 .poll = inotify_poll,
580 .read = inotify_read, 340 .read = inotify_read,
581 .fasync = inotify_fasync, 341 .fasync = inotify_fasync,
582 .release = inotify_release, 342 .release = inotify_release,
583 .unlocked_ioctl = inotify_ioctl, 343 .unlocked_ioctl = inotify_ioctl,
584 .compat_ioctl = inotify_ioctl, 344 .compat_ioctl = inotify_ioctl,
585}; 345};
586 346
587static const struct inotify_operations inotify_user_ops = {
588 .handle_event = inotify_dev_queue_event,
589 .destroy_watch = free_inotify_user_watch,
590};
591 347
348/*
349 * find_inode - resolve a user-given path to a specific inode
350 */
351static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags)
352{
353 int error;
354
355 error = user_path_at(AT_FDCWD, dirname, flags, path);
356 if (error)
357 return error;
358 /* you can only watch an inode if you have read permissions on it */
359 error = inode_permission(path->dentry->d_inode, MAY_READ);
360 if (error)
361 path_put(path);
362 return error;
363}
364
365/*
366 * When, for whatever reason, inotify is done with a mark (or what used to be a
367 * watch) we need to remove that watch from the idr and we need to send IN_IGNORED
368 * for the given wd.
369 *
370 * There is a bit of recursion here. The loop looks like:
371 * inotify_destroy_mark_entry -> fsnotify_destroy_mark_by_entry ->
372 * inotify_freeing_mark -> inotify_destory_mark_entry -> restart
373 * But the loop is broken in 2 places. fsnotify_destroy_mark_by_entry sets
374 * entry->group = NULL before the call to inotify_freeing_mark, so the if (egroup)
375 * test below will not call back to fsnotify again. But even if that test wasn't
376 * there this would still be safe since fsnotify_destroy_mark_by_entry() is
377 * safe from recursion.
378 */
379void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
380{
381 struct inotify_inode_mark_entry *ientry;
382 struct inotify_event_private_data *event_priv;
383 struct fsnotify_event_private_data *fsn_event_priv;
384 struct fsnotify_group *egroup;
385 struct idr *idr;
386
387 spin_lock(&entry->lock);
388 egroup = entry->group;
389
390 /* if egroup we aren't really done and something might still send events
391 * for this inode, on the callback we'll send the IN_IGNORED */
392 if (egroup) {
393 spin_unlock(&entry->lock);
394 fsnotify_destroy_mark_by_entry(entry);
395 return;
396 }
397 spin_unlock(&entry->lock);
398
399 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
400
401 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
402 if (unlikely(!event_priv))
403 goto skip_send_ignore;
404
405 fsn_event_priv = &event_priv->fsnotify_event_priv_data;
406
407 fsn_event_priv->group = group;
408 event_priv->wd = ientry->wd;
409
410 fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv);
411
412 /* did the private data get added? */
413 if (list_empty(&fsn_event_priv->event_list))
414 inotify_free_event_priv(fsn_event_priv);
415
416skip_send_ignore:
417
418 /* remove this entry from the idr */
419 spin_lock(&group->inotify_data.idr_lock);
420 idr = &group->inotify_data.idr;
421 idr_remove(idr, ientry->wd);
422 spin_unlock(&group->inotify_data.idr_lock);
423
424 /* removed from idr, drop that reference */
425 fsnotify_put_mark(entry);
426}
427
428/* ding dong the mark is dead */
429static void inotify_free_mark(struct fsnotify_mark_entry *entry)
430{
431 struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry;
432
433 kmem_cache_free(inotify_inode_mark_cachep, ientry);
434}
435
436static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
437{
438 struct fsnotify_mark_entry *entry = NULL;
439 struct inotify_inode_mark_entry *ientry;
440 int ret = 0;
441 int add = (arg & IN_MASK_ADD);
442 __u32 mask;
443 __u32 old_mask, new_mask;
444
445 /* don't allow invalid bits: we don't want flags set */
446 mask = inotify_arg_to_mask(arg);
447 if (unlikely(!mask))
448 return -EINVAL;
449
450 ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
451 if (unlikely(!ientry))
452 return -ENOMEM;
453 /* we set the mask at the end after attaching it */
454 fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark);
455 ientry->wd = 0;
456
457find_entry:
458 spin_lock(&inode->i_lock);
459 entry = fsnotify_find_mark_entry(group, inode);
460 spin_unlock(&inode->i_lock);
461 if (entry) {
462 kmem_cache_free(inotify_inode_mark_cachep, ientry);
463 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
464 } else {
465 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) {
466 ret = -ENOSPC;
467 goto out_err;
468 }
469
470 ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode);
471 if (ret == -EEXIST)
472 goto find_entry;
473 else if (ret)
474 goto out_err;
475
476 entry = &ientry->fsn_entry;
477retry:
478 ret = -ENOMEM;
479 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
480 goto out_err;
481
482 spin_lock(&group->inotify_data.idr_lock);
483 /* if entry is added to the idr we keep the reference obtained
484 * through fsnotify_mark_add. remember to drop this reference
485 * when entry is removed from idr */
486 ret = idr_get_new_above(&group->inotify_data.idr, entry,
487 ++group->inotify_data.last_wd,
488 &ientry->wd);
489 spin_unlock(&group->inotify_data.idr_lock);
490 if (ret) {
491 if (ret == -EAGAIN)
492 goto retry;
493 goto out_err;
494 }
495 atomic_inc(&group->inotify_data.user->inotify_watches);
496 }
497
498 spin_lock(&entry->lock);
499
500 old_mask = entry->mask;
501 if (add) {
502 entry->mask |= mask;
503 new_mask = entry->mask;
504 } else {
505 entry->mask = mask;
506 new_mask = entry->mask;
507 }
508
509 spin_unlock(&entry->lock);
510
511 if (old_mask != new_mask) {
512 /* more bits in old than in new? */
513 int dropped = (old_mask & ~new_mask);
514 /* more bits in this entry than the inode's mask? */
515 int do_inode = (new_mask & ~inode->i_fsnotify_mask);
516 /* more bits in this entry than the group? */
517 int do_group = (new_mask & ~group->mask);
518
519 /* update the inode with this new entry */
520 if (dropped || do_inode)
521 fsnotify_recalc_inode_mask(inode);
522
523 /* update the group mask with the new mask */
524 if (dropped || do_group)
525 fsnotify_recalc_group_mask(group);
526 }
527
528 return ientry->wd;
529
530out_err:
531 /* see this isn't supposed to happen, just kill the watch */
532 if (entry) {
533 fsnotify_destroy_mark_by_entry(entry);
534 fsnotify_put_mark(entry);
535 }
536 return ret;
537}
538
539static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events)
540{
541 struct fsnotify_group *group;
542 unsigned int grp_num;
543
544 /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */
545 grp_num = (INOTIFY_GROUP_NUM - atomic_inc_return(&inotify_grp_num));
546 group = fsnotify_obtain_group(grp_num, 0, &inotify_fsnotify_ops);
547 if (IS_ERR(group))
548 return group;
549
550 group->max_events = max_events;
551
552 spin_lock_init(&group->inotify_data.idr_lock);
553 idr_init(&group->inotify_data.idr);
554 group->inotify_data.last_wd = 0;
555 group->inotify_data.user = user;
556 group->inotify_data.fa = NULL;
557
558 return group;
559}
560
561
562/* inotify syscalls */
592SYSCALL_DEFINE1(inotify_init1, int, flags) 563SYSCALL_DEFINE1(inotify_init1, int, flags)
593{ 564{
594 struct inotify_device *dev; 565 struct fsnotify_group *group;
595 struct inotify_handle *ih;
596 struct user_struct *user; 566 struct user_struct *user;
597 struct file *filp; 567 struct file *filp;
598 int fd, ret; 568 int fd, ret;
@@ -621,45 +591,27 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
621 goto out_free_uid; 591 goto out_free_uid;
622 } 592 }
623 593
624 dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL); 594 /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */
625 if (unlikely(!dev)) { 595 group = inotify_new_group(user, inotify_max_queued_events);
626 ret = -ENOMEM; 596 if (IS_ERR(group)) {
597 ret = PTR_ERR(group);
627 goto out_free_uid; 598 goto out_free_uid;
628 } 599 }
629 600
630 ih = inotify_init(&inotify_user_ops);
631 if (IS_ERR(ih)) {
632 ret = PTR_ERR(ih);
633 goto out_free_dev;
634 }
635 dev->ih = ih;
636 dev->fa = NULL;
637
638 filp->f_op = &inotify_fops; 601 filp->f_op = &inotify_fops;
639 filp->f_path.mnt = mntget(inotify_mnt); 602 filp->f_path.mnt = mntget(inotify_mnt);
640 filp->f_path.dentry = dget(inotify_mnt->mnt_root); 603 filp->f_path.dentry = dget(inotify_mnt->mnt_root);
641 filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; 604 filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
642 filp->f_mode = FMODE_READ; 605 filp->f_mode = FMODE_READ;
643 filp->f_flags = O_RDONLY | (flags & O_NONBLOCK); 606 filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
644 filp->private_data = dev; 607 filp->private_data = group;
645 608
646 INIT_LIST_HEAD(&dev->events);
647 init_waitqueue_head(&dev->wq);
648 mutex_init(&dev->ev_mutex);
649 mutex_init(&dev->up_mutex);
650 dev->event_count = 0;
651 dev->queue_size = 0;
652 dev->max_events = inotify_max_queued_events;
653 dev->user = user;
654 atomic_set(&dev->count, 0);
655
656 get_inotify_dev(dev);
657 atomic_inc(&user->inotify_devs); 609 atomic_inc(&user->inotify_devs);
610
658 fd_install(fd, filp); 611 fd_install(fd, filp);
659 612
660 return fd; 613 return fd;
661out_free_dev: 614
662 kfree(dev);
663out_free_uid: 615out_free_uid:
664 free_uid(user); 616 free_uid(user);
665 put_filp(filp); 617 put_filp(filp);
@@ -676,8 +628,8 @@ SYSCALL_DEFINE0(inotify_init)
676SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, 628SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
677 u32, mask) 629 u32, mask)
678{ 630{
631 struct fsnotify_group *group;
679 struct inode *inode; 632 struct inode *inode;
680 struct inotify_device *dev;
681 struct path path; 633 struct path path;
682 struct file *filp; 634 struct file *filp;
683 int ret, fput_needed; 635 int ret, fput_needed;
@@ -698,20 +650,20 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
698 if (mask & IN_ONLYDIR) 650 if (mask & IN_ONLYDIR)
699 flags |= LOOKUP_DIRECTORY; 651 flags |= LOOKUP_DIRECTORY;
700 652
701 ret = find_inode(pathname, &path, flags); 653 ret = inotify_find_inode(pathname, &path, flags);
702 if (unlikely(ret)) 654 if (ret)
703 goto fput_and_out; 655 goto fput_and_out;
704 656
705 /* inode held in place by reference to path; dev by fget on fd */ 657 /* inode held in place by reference to path; group by fget on fd */
706 inode = path.dentry->d_inode; 658 inode = path.dentry->d_inode;
707 dev = filp->private_data; 659 group = filp->private_data;
708 660
709 mutex_lock(&dev->up_mutex); 661 /* create/update an inode mark */
710 ret = inotify_find_update_watch(dev->ih, inode, mask); 662 ret = inotify_update_watch(group, inode, mask);
711 if (ret == -ENOENT) 663 if (unlikely(ret))
712 ret = create_watch(dev, inode, mask); 664 goto path_put_and_out;
713 mutex_unlock(&dev->up_mutex);
714 665
666path_put_and_out:
715 path_put(&path); 667 path_put(&path);
716fput_and_out: 668fput_and_out:
717 fput_light(filp, fput_needed); 669 fput_light(filp, fput_needed);
@@ -720,9 +672,10 @@ fput_and_out:
720 672
721SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) 673SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
722{ 674{
675 struct fsnotify_group *group;
676 struct fsnotify_mark_entry *entry;
723 struct file *filp; 677 struct file *filp;
724 struct inotify_device *dev; 678 int ret = 0, fput_needed;
725 int ret, fput_needed;
726 679
727 filp = fget_light(fd, &fput_needed); 680 filp = fget_light(fd, &fput_needed);
728 if (unlikely(!filp)) 681 if (unlikely(!filp))
@@ -734,10 +687,20 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
734 goto out; 687 goto out;
735 } 688 }
736 689
737 dev = filp->private_data; 690 group = filp->private_data;
738 691
739 /* we free our watch data when we get IN_IGNORED */ 692 spin_lock(&group->inotify_data.idr_lock);
740 ret = inotify_rm_wd(dev->ih, wd); 693 entry = idr_find(&group->inotify_data.idr, wd);
694 if (unlikely(!entry)) {
695 spin_unlock(&group->inotify_data.idr_lock);
696 ret = -EINVAL;
697 goto out;
698 }
699 fsnotify_get_mark(entry);
700 spin_unlock(&group->inotify_data.idr_lock);
701
702 inotify_destroy_mark_entry(entry, group);
703 fsnotify_put_mark(entry);
741 704
742out: 705out:
743 fput_light(filp, fput_needed); 706 fput_light(filp, fput_needed);
@@ -753,9 +716,9 @@ inotify_get_sb(struct file_system_type *fs_type, int flags,
753} 716}
754 717
755static struct file_system_type inotify_fs_type = { 718static struct file_system_type inotify_fs_type = {
756 .name = "inotifyfs", 719 .name = "inotifyfs",
757 .get_sb = inotify_get_sb, 720 .get_sb = inotify_get_sb,
758 .kill_sb = kill_anon_super, 721 .kill_sb = kill_anon_super,
759}; 722};
760 723
761/* 724/*
@@ -775,18 +738,16 @@ static int __init inotify_user_setup(void)
775 if (IS_ERR(inotify_mnt)) 738 if (IS_ERR(inotify_mnt))
776 panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt)); 739 panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
777 740
741 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
742 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
743 inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0);
744 if (!inotify_ignored_event)
745 panic("unable to allocate the inotify ignored event\n");
746
778 inotify_max_queued_events = 16384; 747 inotify_max_queued_events = 16384;
779 inotify_max_user_instances = 128; 748 inotify_max_user_instances = 128;
780 inotify_max_user_watches = 8192; 749 inotify_max_user_watches = 8192;
781 750
782 watch_cachep = kmem_cache_create("inotify_watch_cache",
783 sizeof(struct inotify_user_watch),
784 0, SLAB_PANIC, NULL);
785 event_cachep = kmem_cache_create("inotify_event_cache",
786 sizeof(struct inotify_kernel_event),
787 0, SLAB_PANIC, NULL);
788
789 return 0; 751 return 0;
790} 752}
791
792module_init(inotify_user_setup); 753module_init(inotify_user_setup);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
new file mode 100644
index 000000000000..959b73e756fd
--- /dev/null
+++ b/fs/notify/notification.c
@@ -0,0 +1,411 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19/*
20 * Basic idea behind the notification queue: An fsnotify group (like inotify)
21 * sends the userspace notification about events asyncronously some time after
22 * the event happened. When inotify gets an event it will need to add that
23 * event to the group notify queue. Since a single event might need to be on
24 * multiple group's notification queues we can't add the event directly to each
25 * queue and instead add a small "event_holder" to each queue. This event_holder
26 * has a pointer back to the original event. Since the majority of events are
27 * going to end up on one, and only one, notification queue we embed one
28 * event_holder into each event. This means we have a single allocation instead
29 * of always needing two. If the embedded event_holder is already in use by
30 * another group a new event_holder (from fsnotify_event_holder_cachep) will be
31 * allocated and used.
32 */
33
34#include <linux/fs.h>
35#include <linux/init.h>
36#include <linux/kernel.h>
37#include <linux/list.h>
38#include <linux/module.h>
39#include <linux/mount.h>
40#include <linux/mutex.h>
41#include <linux/namei.h>
42#include <linux/path.h>
43#include <linux/slab.h>
44#include <linux/spinlock.h>
45
46#include <asm/atomic.h>
47
48#include <linux/fsnotify_backend.h>
49#include "fsnotify.h"
50
51static struct kmem_cache *fsnotify_event_cachep;
52static struct kmem_cache *fsnotify_event_holder_cachep;
53/*
54 * This is a magic event we send when the q is too full. Since it doesn't
55 * hold real event information we just keep one system wide and use it any time
56 * it is needed. It's refcnt is set 1 at kernel init time and will never
57 * get set to 0 so it will never get 'freed'
58 */
59static struct fsnotify_event q_overflow_event;
60static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
61
62/**
63 * fsnotify_get_cookie - return a unique cookie for use in synchronizing events.
64 * Called from fsnotify_move, which is inlined into filesystem modules.
65 */
66u32 fsnotify_get_cookie(void)
67{
68 return atomic_inc_return(&fsnotify_sync_cookie);
69}
70EXPORT_SYMBOL_GPL(fsnotify_get_cookie);
71
72/* return true if the notify queue is empty, false otherwise */
73bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
74{
75 BUG_ON(!mutex_is_locked(&group->notification_mutex));
76 return list_empty(&group->notification_list) ? true : false;
77}
78
79void fsnotify_get_event(struct fsnotify_event *event)
80{
81 atomic_inc(&event->refcnt);
82}
83
84void fsnotify_put_event(struct fsnotify_event *event)
85{
86 if (!event)
87 return;
88
89 if (atomic_dec_and_test(&event->refcnt)) {
90 if (event->data_type == FSNOTIFY_EVENT_PATH)
91 path_put(&event->path);
92
93 BUG_ON(!list_empty(&event->private_data_list));
94
95 kfree(event->file_name);
96 kmem_cache_free(fsnotify_event_cachep, event);
97 }
98}
99
100struct fsnotify_event_holder *fsnotify_alloc_event_holder(void)
101{
102 return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL);
103}
104
105void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
106{
107 kmem_cache_free(fsnotify_event_holder_cachep, holder);
108}
109
110/*
111 * Find the private data that the group previously attached to this event when
112 * the group added the event to the notification queue (fsnotify_add_notify_event)
113 */
114struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event)
115{
116 struct fsnotify_event_private_data *lpriv;
117 struct fsnotify_event_private_data *priv = NULL;
118
119 assert_spin_locked(&event->lock);
120
121 list_for_each_entry(lpriv, &event->private_data_list, event_list) {
122 if (lpriv->group == group) {
123 priv = lpriv;
124 list_del(&priv->event_list);
125 break;
126 }
127 }
128 return priv;
129}
130
131/*
132 * Check if 2 events contain the same information. We do not compare private data
133 * but at this moment that isn't a problem for any know fsnotify listeners.
134 */
135static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
136{
137 if ((old->mask == new->mask) &&
138 (old->to_tell == new->to_tell) &&
139 (old->data_type == new->data_type)) {
140 switch (old->data_type) {
141 case (FSNOTIFY_EVENT_INODE):
142 if (old->inode == new->inode)
143 return true;
144 break;
145 case (FSNOTIFY_EVENT_PATH):
146 if ((old->path.mnt == new->path.mnt) &&
147 (old->path.dentry == new->path.dentry))
148 return true;
149 case (FSNOTIFY_EVENT_NONE):
150 return true;
151 };
152 }
153 return false;
154}
155
156/*
157 * Add an event to the group notification queue. The group can later pull this
158 * event off the queue to deal with. If the event is successfully added to the
159 * group's notification queue, a reference is taken on event.
160 */
161int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
162 struct fsnotify_event_private_data *priv)
163{
164 struct fsnotify_event_holder *holder = NULL;
165 struct list_head *list = &group->notification_list;
166 struct fsnotify_event_holder *last_holder;
167 struct fsnotify_event *last_event;
168
169 /* easy to tell if priv was attached to the event */
170 INIT_LIST_HEAD(&priv->event_list);
171
172 /*
173 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
174 * Check if we expect to be able to use that holder. If not alloc a new
175 * holder.
176 * For the overflow event it's possible that something will use the in
177 * event holder before we get the lock so we may need to jump back and
178 * alloc a new holder, this can't happen for most events...
179 */
180 if (!list_empty(&event->holder.event_list)) {
181alloc_holder:
182 holder = fsnotify_alloc_event_holder();
183 if (!holder)
184 return -ENOMEM;
185 }
186
187 mutex_lock(&group->notification_mutex);
188
189 if (group->q_len >= group->max_events) {
190 event = &q_overflow_event;
191 /* sorry, no private data on the overflow event */
192 priv = NULL;
193 }
194
195 spin_lock(&event->lock);
196
197 if (list_empty(&event->holder.event_list)) {
198 if (unlikely(holder))
199 fsnotify_destroy_event_holder(holder);
200 holder = &event->holder;
201 } else if (unlikely(!holder)) {
202 /* between the time we checked above and got the lock the in
203 * event holder was used, go back and get a new one */
204 spin_unlock(&event->lock);
205 mutex_unlock(&group->notification_mutex);
206 goto alloc_holder;
207 }
208
209 if (!list_empty(list)) {
210 last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
211 last_event = last_holder->event;
212 if (event_compare(last_event, event)) {
213 spin_unlock(&event->lock);
214 mutex_unlock(&group->notification_mutex);
215 if (holder != &event->holder)
216 fsnotify_destroy_event_holder(holder);
217 return -EEXIST;
218 }
219 }
220
221 group->q_len++;
222 holder->event = event;
223
224 fsnotify_get_event(event);
225 list_add_tail(&holder->event_list, list);
226 if (priv)
227 list_add_tail(&priv->event_list, &event->private_data_list);
228 spin_unlock(&event->lock);
229 mutex_unlock(&group->notification_mutex);
230
231 wake_up(&group->notification_waitq);
232 return 0;
233}
234
235/*
236 * Remove and return the first event from the notification list. There is a
237 * reference held on this event since it was on the list. It is the responsibility
238 * of the caller to drop this reference.
239 */
240struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group)
241{
242 struct fsnotify_event *event;
243 struct fsnotify_event_holder *holder;
244
245 BUG_ON(!mutex_is_locked(&group->notification_mutex));
246
247 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
248
249 event = holder->event;
250
251 spin_lock(&event->lock);
252 holder->event = NULL;
253 list_del_init(&holder->event_list);
254 spin_unlock(&event->lock);
255
256 /* event == holder means we are referenced through the in event holder */
257 if (holder != &event->holder)
258 fsnotify_destroy_event_holder(holder);
259
260 group->q_len--;
261
262 return event;
263}
264
265/*
266 * This will not remove the event, that must be done with fsnotify_remove_notify_event()
267 */
268struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
269{
270 struct fsnotify_event *event;
271 struct fsnotify_event_holder *holder;
272
273 BUG_ON(!mutex_is_locked(&group->notification_mutex));
274
275 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
276 event = holder->event;
277
278 return event;
279}
280
281/*
282 * Called when a group is being torn down to clean up any outstanding
283 * event notifications.
284 */
285void fsnotify_flush_notify(struct fsnotify_group *group)
286{
287 struct fsnotify_event *event;
288 struct fsnotify_event_private_data *priv;
289
290 mutex_lock(&group->notification_mutex);
291 while (!fsnotify_notify_queue_is_empty(group)) {
292 event = fsnotify_remove_notify_event(group);
293 /* if they don't implement free_event_priv they better not have attached any */
294 if (group->ops->free_event_priv) {
295 spin_lock(&event->lock);
296 priv = fsnotify_remove_priv_from_event(group, event);
297 spin_unlock(&event->lock);
298 if (priv)
299 group->ops->free_event_priv(priv);
300 }
301 fsnotify_put_event(event); /* matches fsnotify_add_notify_event */
302 }
303 mutex_unlock(&group->notification_mutex);
304}
305
306static void initialize_event(struct fsnotify_event *event)
307{
308 event->holder.event = NULL;
309 INIT_LIST_HEAD(&event->holder.event_list);
310 atomic_set(&event->refcnt, 1);
311
312 spin_lock_init(&event->lock);
313
314 event->path.dentry = NULL;
315 event->path.mnt = NULL;
316 event->inode = NULL;
317 event->data_type = FSNOTIFY_EVENT_NONE;
318
319 INIT_LIST_HEAD(&event->private_data_list);
320
321 event->to_tell = NULL;
322
323 event->file_name = NULL;
324 event->name_len = 0;
325
326 event->sync_cookie = 0;
327}
328
329/*
330 * fsnotify_create_event - Allocate a new event which will be sent to each
331 * group's handle_event function if the group was interested in this
332 * particular event.
333 *
334 * @to_tell the inode which is supposed to receive the event (sometimes a
335 * parent of the inode to which the event happened.
336 * @mask what actually happened.
337 * @data pointer to the object which was actually affected
338 * @data_type flag indication if the data is a file, path, inode, nothing...
339 * @name the filename, if available
340 */
341struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
342 int data_type, const char *name, u32 cookie)
343{
344 struct fsnotify_event *event;
345
346 event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
347 if (!event)
348 return NULL;
349
350 initialize_event(event);
351
352 if (name) {
353 event->file_name = kstrdup(name, GFP_KERNEL);
354 if (!event->file_name) {
355 kmem_cache_free(fsnotify_event_cachep, event);
356 return NULL;
357 }
358 event->name_len = strlen(event->file_name);
359 }
360
361 event->sync_cookie = cookie;
362 event->to_tell = to_tell;
363
364 switch (data_type) {
365 case FSNOTIFY_EVENT_FILE: {
366 struct file *file = data;
367 struct path *path = &file->f_path;
368 event->path.dentry = path->dentry;
369 event->path.mnt = path->mnt;
370 path_get(&event->path);
371 event->data_type = FSNOTIFY_EVENT_PATH;
372 break;
373 }
374 case FSNOTIFY_EVENT_PATH: {
375 struct path *path = data;
376 event->path.dentry = path->dentry;
377 event->path.mnt = path->mnt;
378 path_get(&event->path);
379 event->data_type = FSNOTIFY_EVENT_PATH;
380 break;
381 }
382 case FSNOTIFY_EVENT_INODE:
383 event->inode = data;
384 event->data_type = FSNOTIFY_EVENT_INODE;
385 break;
386 case FSNOTIFY_EVENT_NONE:
387 event->inode = NULL;
388 event->path.dentry = NULL;
389 event->path.mnt = NULL;
390 break;
391 default:
392 BUG();
393 }
394
395 event->mask = mask;
396
397 return event;
398}
399
400__init int fsnotify_notification_init(void)
401{
402 fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
403 fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
404
405 initialize_event(&q_overflow_event);
406 q_overflow_event.mask = FS_Q_OVERFLOW;
407
408 return 0;
409}
410subsys_initcall(fsnotify_notification_init);
411
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index f76951dcd4a6..abaaa1cbf8de 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -25,7 +25,7 @@
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/string.h> 26#include <linux/string.h>
27#include <linux/spinlock.h> 27#include <linux/spinlock.h>
28#include <linux/blkdev.h> /* For bdev_hardsect_size(). */ 28#include <linux/blkdev.h> /* For bdev_logical_block_size(). */
29#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
30#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
31#include <linux/vfs.h> 31#include <linux/vfs.h>
@@ -443,6 +443,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
443 ntfs_volume *vol = NTFS_SB(sb); 443 ntfs_volume *vol = NTFS_SB(sb);
444 444
445 ntfs_debug("Entering with remount options string: %s", opt); 445 ntfs_debug("Entering with remount options string: %s", opt);
446
447 lock_kernel();
446#ifndef NTFS_RW 448#ifndef NTFS_RW
447 /* For read-only compiled driver, enforce read-only flag. */ 449 /* For read-only compiled driver, enforce read-only flag. */
448 *flags |= MS_RDONLY; 450 *flags |= MS_RDONLY;
@@ -466,15 +468,18 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
466 if (NVolErrors(vol)) { 468 if (NVolErrors(vol)) {
467 ntfs_error(sb, "Volume has errors and is read-only%s", 469 ntfs_error(sb, "Volume has errors and is read-only%s",
468 es); 470 es);
471 unlock_kernel();
469 return -EROFS; 472 return -EROFS;
470 } 473 }
471 if (vol->vol_flags & VOLUME_IS_DIRTY) { 474 if (vol->vol_flags & VOLUME_IS_DIRTY) {
472 ntfs_error(sb, "Volume is dirty and read-only%s", es); 475 ntfs_error(sb, "Volume is dirty and read-only%s", es);
476 unlock_kernel();
473 return -EROFS; 477 return -EROFS;
474 } 478 }
475 if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { 479 if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
476 ntfs_error(sb, "Volume has been modified by chkdsk " 480 ntfs_error(sb, "Volume has been modified by chkdsk "
477 "and is read-only%s", es); 481 "and is read-only%s", es);
482 unlock_kernel();
478 return -EROFS; 483 return -EROFS;
479 } 484 }
480 if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { 485 if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
@@ -482,11 +487,13 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
482 "(0x%x) and is read-only%s", 487 "(0x%x) and is read-only%s",
483 (unsigned)le16_to_cpu(vol->vol_flags), 488 (unsigned)le16_to_cpu(vol->vol_flags),
484 es); 489 es);
490 unlock_kernel();
485 return -EROFS; 491 return -EROFS;
486 } 492 }
487 if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { 493 if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
488 ntfs_error(sb, "Failed to set dirty bit in volume " 494 ntfs_error(sb, "Failed to set dirty bit in volume "
489 "information flags%s", es); 495 "information flags%s", es);
496 unlock_kernel();
490 return -EROFS; 497 return -EROFS;
491 } 498 }
492#if 0 499#if 0
@@ -506,18 +513,21 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
506 ntfs_error(sb, "Failed to empty journal $LogFile%s", 513 ntfs_error(sb, "Failed to empty journal $LogFile%s",
507 es); 514 es);
508 NVolSetErrors(vol); 515 NVolSetErrors(vol);
516 unlock_kernel();
509 return -EROFS; 517 return -EROFS;
510 } 518 }
511 if (!ntfs_mark_quotas_out_of_date(vol)) { 519 if (!ntfs_mark_quotas_out_of_date(vol)) {
512 ntfs_error(sb, "Failed to mark quotas out of date%s", 520 ntfs_error(sb, "Failed to mark quotas out of date%s",
513 es); 521 es);
514 NVolSetErrors(vol); 522 NVolSetErrors(vol);
523 unlock_kernel();
515 return -EROFS; 524 return -EROFS;
516 } 525 }
517 if (!ntfs_stamp_usnjrnl(vol)) { 526 if (!ntfs_stamp_usnjrnl(vol)) {
518 ntfs_error(sb, "Failed to stamp transation log " 527 ntfs_error(sb, "Failed to stamp transation log "
519 "($UsnJrnl)%s", es); 528 "($UsnJrnl)%s", es);
520 NVolSetErrors(vol); 529 NVolSetErrors(vol);
530 unlock_kernel();
521 return -EROFS; 531 return -EROFS;
522 } 532 }
523 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { 533 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
@@ -533,8 +543,11 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
533 543
534 // TODO: Deal with *flags. 544 // TODO: Deal with *flags.
535 545
536 if (!parse_options(vol, opt)) 546 if (!parse_options(vol, opt)) {
547 unlock_kernel();
537 return -EINVAL; 548 return -EINVAL;
549 }
550 unlock_kernel();
538 ntfs_debug("Done."); 551 ntfs_debug("Done.");
539 return 0; 552 return 0;
540} 553}
@@ -2246,6 +2259,9 @@ static void ntfs_put_super(struct super_block *sb)
2246 ntfs_volume *vol = NTFS_SB(sb); 2259 ntfs_volume *vol = NTFS_SB(sb);
2247 2260
2248 ntfs_debug("Entering."); 2261 ntfs_debug("Entering.");
2262
2263 lock_kernel();
2264
2249#ifdef NTFS_RW 2265#ifdef NTFS_RW
2250 /* 2266 /*
2251 * Commit all inodes while they are still open in case some of them 2267 * Commit all inodes while they are still open in case some of them
@@ -2373,39 +2389,12 @@ static void ntfs_put_super(struct super_block *sb)
2373 vol->mftmirr_ino = NULL; 2389 vol->mftmirr_ino = NULL;
2374 } 2390 }
2375 /* 2391 /*
2376 * If any dirty inodes are left, throw away all mft data page cache 2392 * We should have no dirty inodes left, due to
2377 * pages to allow a clean umount. This should never happen any more 2393 * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
2378 * due to mft.c::ntfs_mft_writepage() cleaning all the dirty pages as 2394 * the underlying mft records are written out and cleaned.
2379 * the underlying mft records are written out and cleaned. If it does,
2380 * happen anyway, we want to know...
2381 */ 2395 */
2382 ntfs_commit_inode(vol->mft_ino); 2396 ntfs_commit_inode(vol->mft_ino);
2383 write_inode_now(vol->mft_ino, 1); 2397 write_inode_now(vol->mft_ino, 1);
2384 if (sb_has_dirty_inodes(sb)) {
2385 const char *s1, *s2;
2386
2387 mutex_lock(&vol->mft_ino->i_mutex);
2388 truncate_inode_pages(vol->mft_ino->i_mapping, 0);
2389 mutex_unlock(&vol->mft_ino->i_mutex);
2390 write_inode_now(vol->mft_ino, 1);
2391 if (sb_has_dirty_inodes(sb)) {
2392 static const char *_s1 = "inodes";
2393 static const char *_s2 = "";
2394 s1 = _s1;
2395 s2 = _s2;
2396 } else {
2397 static const char *_s1 = "mft pages";
2398 static const char *_s2 = "They have been thrown "
2399 "away. ";
2400 s1 = _s1;
2401 s2 = _s2;
2402 }
2403 ntfs_error(sb, "Dirty %s found at umount time. %sYou should "
2404 "run chkdsk. Please email "
2405 "linux-ntfs-dev@lists.sourceforge.net and say "
2406 "that you saw this message. Thank you.", s1,
2407 s2);
2408 }
2409#endif /* NTFS_RW */ 2398#endif /* NTFS_RW */
2410 2399
2411 iput(vol->mft_ino); 2400 iput(vol->mft_ino);
@@ -2444,7 +2433,8 @@ static void ntfs_put_super(struct super_block *sb)
2444 } 2433 }
2445 sb->s_fs_info = NULL; 2434 sb->s_fs_info = NULL;
2446 kfree(vol); 2435 kfree(vol);
2447 return; 2436
2437 unlock_kernel();
2448} 2438}
2449 2439
2450/** 2440/**
@@ -2785,13 +2775,13 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
2785 goto err_out_now; 2775 goto err_out_now;
2786 2776
2787 /* We support sector sizes up to the PAGE_CACHE_SIZE. */ 2777 /* We support sector sizes up to the PAGE_CACHE_SIZE. */
2788 if (bdev_hardsect_size(sb->s_bdev) > PAGE_CACHE_SIZE) { 2778 if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) {
2789 if (!silent) 2779 if (!silent)
2790 ntfs_error(sb, "Device has unsupported sector size " 2780 ntfs_error(sb, "Device has unsupported sector size "
2791 "(%i). The maximum supported sector " 2781 "(%i). The maximum supported sector "
2792 "size on this architecture is %lu " 2782 "size on this architecture is %lu "
2793 "bytes.", 2783 "bytes.",
2794 bdev_hardsect_size(sb->s_bdev), 2784 bdev_logical_block_size(sb->s_bdev),
2795 PAGE_CACHE_SIZE); 2785 PAGE_CACHE_SIZE);
2796 goto err_out_now; 2786 goto err_out_now;
2797 } 2787 }
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 4f85eceab376..09cc25d04611 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1371,7 +1371,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1371 1371
1372 bdevname(reg->hr_bdev, reg->hr_dev_name); 1372 bdevname(reg->hr_bdev, reg->hr_dev_name);
1373 1373
1374 sectsize = bdev_hardsect_size(reg->hr_bdev); 1374 sectsize = bdev_logical_block_size(reg->hr_bdev);
1375 if (sectsize != reg->hr_block_bytes) { 1375 if (sectsize != reg->hr_block_bytes) {
1376 mlog(ML_ERROR, 1376 mlog(ML_ERROR,
1377 "blocksize %u incorrect for device, expected %d", 1377 "blocksize %u incorrect for device, expected %d",
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 79ff8d9d37e0..201b40a441fe 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -42,6 +42,7 @@
42#include <linux/mount.h> 42#include <linux/mount.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/quotaops.h> 44#include <linux/quotaops.h>
45#include <linux/smp_lock.h>
45 46
46#define MLOG_MASK_PREFIX ML_SUPER 47#define MLOG_MASK_PREFIX ML_SUPER
47#include <cluster/masklog.h> 48#include <cluster/masklog.h>
@@ -126,7 +127,6 @@ static int ocfs2_get_sector(struct super_block *sb,
126 struct buffer_head **bh, 127 struct buffer_head **bh,
127 int block, 128 int block,
128 int sect_size); 129 int sect_size);
129static void ocfs2_write_super(struct super_block *sb);
130static struct inode *ocfs2_alloc_inode(struct super_block *sb); 130static struct inode *ocfs2_alloc_inode(struct super_block *sb);
131static void ocfs2_destroy_inode(struct inode *inode); 131static void ocfs2_destroy_inode(struct inode *inode);
132static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); 132static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
@@ -141,7 +141,6 @@ static const struct super_operations ocfs2_sops = {
141 .clear_inode = ocfs2_clear_inode, 141 .clear_inode = ocfs2_clear_inode,
142 .delete_inode = ocfs2_delete_inode, 142 .delete_inode = ocfs2_delete_inode,
143 .sync_fs = ocfs2_sync_fs, 143 .sync_fs = ocfs2_sync_fs,
144 .write_super = ocfs2_write_super,
145 .put_super = ocfs2_put_super, 144 .put_super = ocfs2_put_super,
146 .remount_fs = ocfs2_remount, 145 .remount_fs = ocfs2_remount,
147 .show_options = ocfs2_show_options, 146 .show_options = ocfs2_show_options,
@@ -365,24 +364,12 @@ static struct file_operations ocfs2_osb_debug_fops = {
365 .llseek = generic_file_llseek, 364 .llseek = generic_file_llseek,
366}; 365};
367 366
368/*
369 * write_super and sync_fs ripped right out of ext3.
370 */
371static void ocfs2_write_super(struct super_block *sb)
372{
373 if (mutex_trylock(&sb->s_lock) != 0)
374 BUG();
375 sb->s_dirt = 0;
376}
377
378static int ocfs2_sync_fs(struct super_block *sb, int wait) 367static int ocfs2_sync_fs(struct super_block *sb, int wait)
379{ 368{
380 int status; 369 int status;
381 tid_t target; 370 tid_t target;
382 struct ocfs2_super *osb = OCFS2_SB(sb); 371 struct ocfs2_super *osb = OCFS2_SB(sb);
383 372
384 sb->s_dirt = 0;
385
386 if (ocfs2_is_hard_readonly(osb)) 373 if (ocfs2_is_hard_readonly(osb))
387 return -EROFS; 374 return -EROFS;
388 375
@@ -595,6 +582,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
595 struct mount_options parsed_options; 582 struct mount_options parsed_options;
596 struct ocfs2_super *osb = OCFS2_SB(sb); 583 struct ocfs2_super *osb = OCFS2_SB(sb);
597 584
585 lock_kernel();
586
598 if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { 587 if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
599 ret = -EINVAL; 588 ret = -EINVAL;
600 goto out; 589 goto out;
@@ -698,6 +687,7 @@ unlock_osb:
698 ocfs2_set_journal_params(osb); 687 ocfs2_set_journal_params(osb);
699 } 688 }
700out: 689out:
690 unlock_kernel();
701 return ret; 691 return ret;
702} 692}
703 693
@@ -713,7 +703,7 @@ static int ocfs2_sb_probe(struct super_block *sb,
713 *bh = NULL; 703 *bh = NULL;
714 704
715 /* may be > 512 */ 705 /* may be > 512 */
716 *sector_size = bdev_hardsect_size(sb->s_bdev); 706 *sector_size = bdev_logical_block_size(sb->s_bdev);
717 if (*sector_size > OCFS2_MAX_BLOCKSIZE) { 707 if (*sector_size > OCFS2_MAX_BLOCKSIZE) {
718 mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", 708 mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n",
719 *sector_size, OCFS2_MAX_BLOCKSIZE); 709 *sector_size, OCFS2_MAX_BLOCKSIZE);
@@ -1550,9 +1540,13 @@ static void ocfs2_put_super(struct super_block *sb)
1550{ 1540{
1551 mlog_entry("(0x%p)\n", sb); 1541 mlog_entry("(0x%p)\n", sb);
1552 1542
1543 lock_kernel();
1544
1553 ocfs2_sync_blockdev(sb); 1545 ocfs2_sync_blockdev(sb);
1554 ocfs2_dismount_volume(sb, 0); 1546 ocfs2_dismount_volume(sb, 0);
1555 1547
1548 unlock_kernel();
1549
1556 mlog_exit_void(); 1550 mlog_exit_void();
1557} 1551}
1558 1552
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 834b2331f6b3..d17e774eaf45 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -11,21 +11,6 @@
11#include <linux/mpage.h> 11#include <linux/mpage.h>
12#include "omfs.h" 12#include "omfs.h"
13 13
14static int omfs_sync_file(struct file *file, struct dentry *dentry,
15 int datasync)
16{
17 struct inode *inode = dentry->d_inode;
18 int err;
19
20 err = sync_mapping_buffers(inode->i_mapping);
21 if (!(inode->i_state & I_DIRTY))
22 return err;
23 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
24 return err;
25 err |= omfs_sync_inode(inode);
26 return err ? -EIO : 0;
27}
28
29static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset) 14static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset)
30{ 15{
31 return (sbi->s_sys_blocksize - offset - 16 return (sbi->s_sys_blocksize - offset -
@@ -344,7 +329,7 @@ struct file_operations omfs_file_operations = {
344 .aio_read = generic_file_aio_read, 329 .aio_read = generic_file_aio_read,
345 .aio_write = generic_file_aio_write, 330 .aio_write = generic_file_aio_write,
346 .mmap = generic_file_mmap, 331 .mmap = generic_file_mmap,
347 .fsync = omfs_sync_file, 332 .fsync = simple_fsync,
348 .splice_read = generic_file_splice_read, 333 .splice_read = generic_file_splice_read,
349}; 334};
350 335
diff --git a/fs/open.c b/fs/open.c
index bdfbf03615a4..7200e23d9258 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -612,7 +612,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
612 612
613 audit_inode(NULL, dentry); 613 audit_inode(NULL, dentry);
614 614
615 err = mnt_want_write(file->f_path.mnt); 615 err = mnt_want_write_file(file);
616 if (err) 616 if (err)
617 goto out_putf; 617 goto out_putf;
618 mutex_lock(&inode->i_mutex); 618 mutex_lock(&inode->i_mutex);
@@ -761,7 +761,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
761 if (!file) 761 if (!file)
762 goto out; 762 goto out;
763 763
764 error = mnt_want_write(file->f_path.mnt); 764 error = mnt_want_write_file(file);
765 if (error) 765 if (error)
766 goto out_fput; 766 goto out_fput;
767 dentry = file->f_path.dentry; 767 dentry = file->f_path.dentry;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 99e33ef40be4..1a9c7878f864 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -219,6 +219,13 @@ ssize_t part_size_show(struct device *dev,
219 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); 219 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
220} 220}
221 221
222ssize_t part_alignment_offset_show(struct device *dev,
223 struct device_attribute *attr, char *buf)
224{
225 struct hd_struct *p = dev_to_part(dev);
226 return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
227}
228
222ssize_t part_stat_show(struct device *dev, 229ssize_t part_stat_show(struct device *dev,
223 struct device_attribute *attr, char *buf) 230 struct device_attribute *attr, char *buf)
224{ 231{
@@ -272,6 +279,7 @@ ssize_t part_fail_store(struct device *dev,
272static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); 279static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
273static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); 280static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
274static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 281static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
282static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
275static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); 283static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
276#ifdef CONFIG_FAIL_MAKE_REQUEST 284#ifdef CONFIG_FAIL_MAKE_REQUEST
277static struct device_attribute dev_attr_fail = 285static struct device_attribute dev_attr_fail =
@@ -282,6 +290,7 @@ static struct attribute *part_attrs[] = {
282 &dev_attr_partition.attr, 290 &dev_attr_partition.attr,
283 &dev_attr_start.attr, 291 &dev_attr_start.attr,
284 &dev_attr_size.attr, 292 &dev_attr_size.attr,
293 &dev_attr_alignment_offset.attr,
285 &dev_attr_stat.attr, 294 &dev_attr_stat.attr,
286#ifdef CONFIG_FAIL_MAKE_REQUEST 295#ifdef CONFIG_FAIL_MAKE_REQUEST
287 &dev_attr_fail.attr, 296 &dev_attr_fail.attr,
@@ -383,6 +392,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
383 pdev = part_to_dev(p); 392 pdev = part_to_dev(p);
384 393
385 p->start_sect = start; 394 p->start_sect = start;
395 p->alignment_offset = queue_sector_alignment_offset(disk->queue, start);
386 p->nr_sects = len; 396 p->nr_sects = len;
387 p->partno = partno; 397 p->partno = partno;
388 p->policy = get_disk_ro(disk); 398 p->policy = get_disk_ro(disk);
@@ -546,27 +556,49 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
546 556
547 /* add partitions */ 557 /* add partitions */
548 for (p = 1; p < state->limit; p++) { 558 for (p = 1; p < state->limit; p++) {
549 sector_t size = state->parts[p].size; 559 sector_t size, from;
550 sector_t from = state->parts[p].from; 560try_scan:
561 size = state->parts[p].size;
551 if (!size) 562 if (!size)
552 continue; 563 continue;
564
565 from = state->parts[p].from;
553 if (from >= get_capacity(disk)) { 566 if (from >= get_capacity(disk)) {
554 printk(KERN_WARNING 567 printk(KERN_WARNING
555 "%s: p%d ignored, start %llu is behind the end of the disk\n", 568 "%s: p%d ignored, start %llu is behind the end of the disk\n",
556 disk->disk_name, p, (unsigned long long) from); 569 disk->disk_name, p, (unsigned long long) from);
557 continue; 570 continue;
558 } 571 }
572
559 if (from + size > get_capacity(disk)) { 573 if (from + size > get_capacity(disk)) {
560 /* 574 struct block_device_operations *bdops = disk->fops;
561 * we can not ignore partitions of broken tables 575 unsigned long long capacity;
562 * created by for example camera firmware, but we 576
563 * limit them to the end of the disk to avoid
564 * creating invalid block devices
565 */
566 printk(KERN_WARNING 577 printk(KERN_WARNING
567 "%s: p%d size %llu limited to end of disk\n", 578 "%s: p%d size %llu exceeds device capacity, ",
568 disk->disk_name, p, (unsigned long long) size); 579 disk->disk_name, p, (unsigned long long) size);
569 size = get_capacity(disk) - from; 580
581 if (bdops->set_capacity &&
582 (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) {
583 printk(KERN_CONT "enabling native capacity\n");
584 capacity = bdops->set_capacity(disk, ~0ULL);
585 disk->flags |= GENHD_FL_NATIVE_CAPACITY;
586 if (capacity > get_capacity(disk)) {
587 set_capacity(disk, capacity);
588 check_disk_size_change(disk, bdev);
589 bdev->bd_invalidated = 0;
590 }
591 goto try_scan;
592 } else {
593 /*
594 * we can not ignore partitions of broken tables
595 * created by for example camera firmware, but
596 * we limit them to the end of the disk to avoid
597 * creating invalid block devices
598 */
599 printk(KERN_CONT "limited to end of disk\n");
600 size = get_capacity(disk) - from;
601 }
570 } 602 }
571 part = add_partition(disk, p, from, size, 603 part = add_partition(disk, p, from, size,
572 state->parts[p].flags); 604 state->parts[p].flags);
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 46297683cd34..fc71aab08460 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -76,7 +76,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
76 Sector sect; 76 Sector sect;
77 77
78 res = 0; 78 res = 0;
79 blocksize = bdev_hardsect_size(bdev); 79 blocksize = bdev_logical_block_size(bdev);
80 if (blocksize <= 0) 80 if (blocksize <= 0)
81 goto out_exit; 81 goto out_exit;
82 i_size = i_size_read(bdev->bd_inode); 82 i_size = i_size_read(bdev->bd_inode);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 796511886f28..0028d2ef0662 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -110,7 +110,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
110 Sector sect; 110 Sector sect;
111 unsigned char *data; 111 unsigned char *data;
112 u32 this_sector, this_size; 112 u32 this_sector, this_size;
113 int sector_size = bdev_hardsect_size(bdev) / 512; 113 int sector_size = bdev_logical_block_size(bdev) / 512;
114 int loopct = 0; /* number of links followed 114 int loopct = 0; /* number of links followed
115 without finding a data partition */ 115 without finding a data partition */
116 int i; 116 int i;
@@ -415,7 +415,7 @@ static struct {
415 415
416int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) 416int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
417{ 417{
418 int sector_size = bdev_hardsect_size(bdev) / 512; 418 int sector_size = bdev_logical_block_size(bdev) / 512;
419 Sector sect; 419 Sector sect;
420 unsigned char *data; 420 unsigned char *data;
421 struct partition *p; 421 struct partition *p;
diff --git a/fs/pipe.c b/fs/pipe.c
index 13414ec45b8d..f7dd21ad85a6 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -302,6 +302,20 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info,
302 return 0; 302 return 0;
303} 303}
304 304
305/**
306 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
307 * @pipe: the pipe that the buffer belongs to
308 * @buf: the buffer to put a reference to
309 *
310 * Description:
311 * This function releases a reference to @buf.
312 */
313void generic_pipe_buf_release(struct pipe_inode_info *pipe,
314 struct pipe_buffer *buf)
315{
316 page_cache_release(buf->page);
317}
318
305static const struct pipe_buf_operations anon_pipe_buf_ops = { 319static const struct pipe_buf_operations anon_pipe_buf_ops = {
306 .can_merge = 1, 320 .can_merge = 1,
307 .map = generic_pipe_buf_map, 321 .map = generic_pipe_buf_map,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3326bbf9ab95..1539e630c47d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2128,9 +2128,15 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
2128 if (copy_from_user(page, buf, count)) 2128 if (copy_from_user(page, buf, count))
2129 goto out_free; 2129 goto out_free;
2130 2130
2131 /* Guard against adverse ptrace interaction */
2132 length = mutex_lock_interruptible(&task->cred_guard_mutex);
2133 if (length < 0)
2134 goto out_free;
2135
2131 length = security_setprocattr(task, 2136 length = security_setprocattr(task,
2132 (char*)file->f_path.dentry->d_name.name, 2137 (char*)file->f_path.dentry->d_name.name,
2133 (void*)page, count); 2138 (void*)page, count);
2139 mutex_unlock(&task->cred_guard_mutex);
2134out_free: 2140out_free:
2135 free_page((unsigned long) page); 2141 free_page((unsigned long) page);
2136out: 2142out:
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index f6db9618a888..753ca37002c8 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -92,3 +92,28 @@ struct pde_opener {
92 struct list_head lh; 92 struct list_head lh;
93}; 93};
94void pde_users_dec(struct proc_dir_entry *pde); 94void pde_users_dec(struct proc_dir_entry *pde);
95
96extern spinlock_t proc_subdir_lock;
97
98struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
99int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
100unsigned long task_vsize(struct mm_struct *);
101int task_statm(struct mm_struct *, int *, int *, int *, int *);
102void task_mem(struct seq_file *, struct mm_struct *);
103
104struct proc_dir_entry *de_get(struct proc_dir_entry *de);
105void de_put(struct proc_dir_entry *de);
106
107extern struct vfsmount *proc_mnt;
108int proc_fill_super(struct super_block *);
109struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *);
110
111/*
112 * These are generic /proc routines that use the internal
113 * "struct proc_dir_entry" tree to traverse the filesystem.
114 *
115 * The /proc root directory has extended versions to take care
116 * of the /proc/<pid> subdirectories.
117 */
118int proc_readdir(struct file *, void *, filldir_t);
119struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 9bca39cf99ee..1afa4dd4cae2 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -12,20 +12,14 @@
12 12
13static int loadavg_proc_show(struct seq_file *m, void *v) 13static int loadavg_proc_show(struct seq_file *m, void *v)
14{ 14{
15 int a, b, c; 15 unsigned long avnrun[3];
16 unsigned long seq;
17 16
18 do { 17 get_avenrun(avnrun, FIXED_1/200, 0);
19 seq = read_seqbegin(&xtime_lock);
20 a = avenrun[0] + (FIXED_1/200);
21 b = avenrun[1] + (FIXED_1/200);
22 c = avenrun[2] + (FIXED_1/200);
23 } while (read_seqretry(&xtime_lock, seq));
24 18
25 seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n", 19 seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
26 LOAD_INT(a), LOAD_FRAC(a), 20 LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
27 LOAD_INT(b), LOAD_FRAC(b), 21 LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
28 LOAD_INT(c), LOAD_FRAC(c), 22 LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
29 nr_running(), nr_threads, 23 nr_running(), nr_threads,
30 task_active_pid_ns(current)->last_pid); 24 task_active_pid_ns(current)->last_pid);
31 return 0; 25 return 0;
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index de2bba5a3440..fc6c3025befd 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -11,6 +11,7 @@
11#include <linux/string.h> 11#include <linux/string.h>
12#include <asm/prom.h> 12#include <asm/prom.h>
13#include <asm/uaccess.h> 13#include <asm/uaccess.h>
14#include "internal.h"
14 15
15#ifndef HAVE_ARCH_DEVTREE_FIXUPS 16#ifndef HAVE_ARCH_DEVTREE_FIXUPS
16static inline void set_node_proc_entry(struct device_node *np, 17static inline void set_node_proc_entry(struct device_node *np,
diff --git a/fs/qnx4/Makefile b/fs/qnx4/Makefile
index 502d7fe98bab..e4d408cc5473 100644
--- a/fs/qnx4/Makefile
+++ b/fs/qnx4/Makefile
@@ -4,4 +4,4 @@
4 4
5obj-$(CONFIG_QNX4FS_FS) += qnx4.o 5obj-$(CONFIG_QNX4FS_FS) += qnx4.o
6 6
7qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o fsync.o 7qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o
diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index 8425cf6e9624..e1cd061a25f7 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -13,14 +13,9 @@
13 * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) . 13 * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) .
14 */ 14 */
15 15
16#include <linux/time.h>
17#include <linux/fs.h>
18#include <linux/qnx4_fs.h>
19#include <linux/stat.h>
20#include <linux/kernel.h>
21#include <linux/string.h>
22#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
23#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include "qnx4.h"
24 19
25#if 0 20#if 0
26int qnx4_new_block(struct super_block *sb) 21int qnx4_new_block(struct super_block *sb)
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index ea9ffefb48ad..003c68f3238b 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -11,14 +11,9 @@
11 * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support. 11 * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support.
12 */ 12 */
13 13
14#include <linux/string.h>
15#include <linux/errno.h>
16#include <linux/fs.h>
17#include <linux/qnx4_fs.h>
18#include <linux/stat.h>
19#include <linux/smp_lock.h> 14#include <linux/smp_lock.h>
20#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
21 16#include "qnx4.h"
22 17
23static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) 18static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
24{ 19{
@@ -84,7 +79,7 @@ const struct file_operations qnx4_dir_operations =
84{ 79{
85 .read = generic_read_dir, 80 .read = generic_read_dir,
86 .readdir = qnx4_readdir, 81 .readdir = qnx4_readdir,
87 .fsync = file_fsync, 82 .fsync = simple_fsync,
88}; 83};
89 84
90const struct inode_operations qnx4_dir_inode_operations = 85const struct inode_operations qnx4_dir_inode_operations =
diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c
index 867f42b02035..09b170ac936c 100644
--- a/fs/qnx4/file.c
+++ b/fs/qnx4/file.c
@@ -12,8 +12,7 @@
12 * 27-06-1998 by Frank Denis : file overwriting. 12 * 27-06-1998 by Frank Denis : file overwriting.
13 */ 13 */
14 14
15#include <linux/fs.h> 15#include "qnx4.h"
16#include <linux/qnx4_fs.h>
17 16
18/* 17/*
19 * We have mostly NULL's here: the current defaults are ok for 18 * We have mostly NULL's here: the current defaults are ok for
@@ -29,7 +28,7 @@ const struct file_operations qnx4_file_operations =
29#ifdef CONFIG_QNX4FS_RW 28#ifdef CONFIG_QNX4FS_RW
30 .write = do_sync_write, 29 .write = do_sync_write,
31 .aio_write = generic_file_aio_write, 30 .aio_write = generic_file_aio_write,
32 .fsync = qnx4_sync_file, 31 .fsync = simple_fsync,
33#endif 32#endif
34}; 33};
35 34
diff --git a/fs/qnx4/fsync.c b/fs/qnx4/fsync.c
deleted file mode 100644
index aa3b19544bee..000000000000
--- a/fs/qnx4/fsync.c
+++ /dev/null
@@ -1,169 +0,0 @@
1/*
2 * QNX4 file system, Linux implementation.
3 *
4 * Version : 0.1
5 *
6 * Using parts of the xiafs filesystem.
7 *
8 * History :
9 *
10 * 24-03-1998 by Richard Frowijn : first release.
11 */
12
13#include <linux/errno.h>
14#include <linux/time.h>
15#include <linux/stat.h>
16#include <linux/fcntl.h>
17#include <linux/smp_lock.h>
18#include <linux/buffer_head.h>
19
20#include <linux/fs.h>
21#include <linux/qnx4_fs.h>
22
23#include <asm/system.h>
24
25/*
26 * The functions for qnx4 fs file synchronization.
27 */
28
29#ifdef CONFIG_QNX4FS_RW
30
31static int sync_block(struct inode *inode, unsigned short *block, int wait)
32{
33 struct buffer_head *bh;
34 unsigned short tmp;
35
36 if (!*block)
37 return 0;
38 tmp = *block;
39 bh = sb_find_get_block(inode->i_sb, *block);
40 if (!bh)
41 return 0;
42 if (*block != tmp) {
43 brelse(bh);
44 return 1;
45 }
46 if (wait && buffer_req(bh) && !buffer_uptodate(bh)) {
47 brelse(bh);
48 return -1;
49 }
50 if (wait || !buffer_uptodate(bh) || !buffer_dirty(bh)) {
51 brelse(bh);
52 return 0;
53 }
54 ll_rw_block(WRITE, 1, &bh);
55 atomic_dec(&bh->b_count);
56 return 0;
57}
58
59#ifdef WTF
60static int sync_iblock(struct inode *inode, unsigned short *iblock,
61 struct buffer_head **bh, int wait)
62{
63 int rc;
64 unsigned short tmp;
65
66 *bh = NULL;
67 tmp = *iblock;
68 if (!tmp)
69 return 0;
70 rc = sync_block(inode, iblock, wait);
71 if (rc)
72 return rc;
73 *bh = sb_bread(inode->i_sb, tmp);
74 if (tmp != *iblock) {
75 brelse(*bh);
76 *bh = NULL;
77 return 1;
78 }
79 if (!*bh)
80 return -1;
81 return 0;
82}
83#endif
84
85static int sync_direct(struct inode *inode, int wait)
86{
87 int i;
88 int rc, err = 0;
89
90 for (i = 0; i < 7; i++) {
91 rc = sync_block(inode,
92 (unsigned short *) qnx4_raw_inode(inode)->di_first_xtnt.xtnt_blk + i, wait);
93 if (rc > 0)
94 break;
95 if (rc)
96 err = rc;
97 }
98 return err;
99}
100
101#ifdef WTF
102static int sync_indirect(struct inode *inode, unsigned short *iblock, int wait)
103{
104 int i;
105 struct buffer_head *ind_bh;
106 int rc, err = 0;
107
108 rc = sync_iblock(inode, iblock, &ind_bh, wait);
109 if (rc || !ind_bh)
110 return rc;
111
112 for (i = 0; i < 512; i++) {
113 rc = sync_block(inode,
114 ((unsigned short *) ind_bh->b_data) + i,
115 wait);
116 if (rc > 0)
117 break;
118 if (rc)
119 err = rc;
120 }
121 brelse(ind_bh);
122 return err;
123}
124
125static int sync_dindirect(struct inode *inode, unsigned short *diblock,
126 int wait)
127{
128 int i;
129 struct buffer_head *dind_bh;
130 int rc, err = 0;
131
132 rc = sync_iblock(inode, diblock, &dind_bh, wait);
133 if (rc || !dind_bh)
134 return rc;
135
136 for (i = 0; i < 512; i++) {
137 rc = sync_indirect(inode,
138 ((unsigned short *) dind_bh->b_data) + i,
139 wait);
140 if (rc > 0)
141 break;
142 if (rc)
143 err = rc;
144 }
145 brelse(dind_bh);
146 return err;
147}
148#endif
149
150int qnx4_sync_file(struct file *file, struct dentry *dentry, int unused)
151{
152 struct inode *inode = dentry->d_inode;
153 int wait, err = 0;
154
155 (void) file;
156 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
157 S_ISLNK(inode->i_mode)))
158 return -EINVAL;
159
160 lock_kernel();
161 for (wait = 0; wait <= 1; wait++) {
162 err |= sync_direct(inode, wait);
163 }
164 err |= qnx4_sync_inode(inode);
165 unlock_kernel();
166 return (err < 0) ? -EIO : 0;
167}
168
169#endif
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index fe1f0f31d11c..681df5fcd161 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -13,19 +13,15 @@
13 */ 13 */
14 14
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/string.h>
18#include <linux/errno.h>
19#include <linux/slab.h>
20#include <linux/fs.h>
21#include <linux/qnx4_fs.h>
22#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/slab.h>
23#include <linux/highuid.h> 18#include <linux/highuid.h>
24#include <linux/smp_lock.h> 19#include <linux/smp_lock.h>
25#include <linux/pagemap.h> 20#include <linux/pagemap.h>
26#include <linux/buffer_head.h> 21#include <linux/buffer_head.h>
27#include <linux/vfs.h> 22#include <linux/writeback.h>
28#include <asm/uaccess.h> 23#include <linux/statfs.h>
24#include "qnx4.h"
29 25
30#define QNX4_VERSION 4 26#define QNX4_VERSION 4
31#define QNX4_BMNAME ".bitmap" 27#define QNX4_BMNAME ".bitmap"
@@ -34,31 +30,6 @@ static const struct super_operations qnx4_sops;
34 30
35#ifdef CONFIG_QNX4FS_RW 31#ifdef CONFIG_QNX4FS_RW
36 32
37int qnx4_sync_inode(struct inode *inode)
38{
39 int err = 0;
40# if 0
41 struct buffer_head *bh;
42
43 bh = qnx4_update_inode(inode);
44 if (bh && buffer_dirty(bh))
45 {
46 sync_dirty_buffer(bh);
47 if (buffer_req(bh) && !buffer_uptodate(bh))
48 {
49 printk ("IO error syncing qnx4 inode [%s:%08lx]\n",
50 inode->i_sb->s_id, inode->i_ino);
51 err = -1;
52 }
53 brelse (bh);
54 } else if (!bh) {
55 err = -1;
56 }
57# endif
58
59 return err;
60}
61
62static void qnx4_delete_inode(struct inode *inode) 33static void qnx4_delete_inode(struct inode *inode)
63{ 34{
64 QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino)); 35 QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino));
@@ -70,15 +41,7 @@ static void qnx4_delete_inode(struct inode *inode)
70 unlock_kernel(); 41 unlock_kernel();
71} 42}
72 43
73static void qnx4_write_super(struct super_block *sb) 44static int qnx4_write_inode(struct inode *inode, int do_sync)
74{
75 lock_kernel();
76 QNX4DEBUG(("qnx4: write_super\n"));
77 sb->s_dirt = 0;
78 unlock_kernel();
79}
80
81static int qnx4_write_inode(struct inode *inode, int unused)
82{ 45{
83 struct qnx4_inode_entry *raw_inode; 46 struct qnx4_inode_entry *raw_inode;
84 int block, ino; 47 int block, ino;
@@ -115,6 +78,16 @@ static int qnx4_write_inode(struct inode *inode, int unused)
115 raw_inode->di_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 78 raw_inode->di_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
116 raw_inode->di_first_xtnt.xtnt_size = cpu_to_le32(inode->i_blocks); 79 raw_inode->di_first_xtnt.xtnt_size = cpu_to_le32(inode->i_blocks);
117 mark_buffer_dirty(bh); 80 mark_buffer_dirty(bh);
81 if (do_sync) {
82 sync_dirty_buffer(bh);
83 if (buffer_req(bh) && !buffer_uptodate(bh)) {
84 printk("qnx4: IO error syncing inode [%s:%08x]\n",
85 inode->i_sb->s_id, ino);
86 brelse(bh);
87 unlock_kernel();
88 return -EIO;
89 }
90 }
118 brelse(bh); 91 brelse(bh);
119 unlock_kernel(); 92 unlock_kernel();
120 return 0; 93 return 0;
@@ -138,7 +111,6 @@ static const struct super_operations qnx4_sops =
138#ifdef CONFIG_QNX4FS_RW 111#ifdef CONFIG_QNX4FS_RW
139 .write_inode = qnx4_write_inode, 112 .write_inode = qnx4_write_inode,
140 .delete_inode = qnx4_delete_inode, 113 .delete_inode = qnx4_delete_inode,
141 .write_super = qnx4_write_super,
142#endif 114#endif
143}; 115};
144 116
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 775eed3a4085..5972ed214937 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -12,16 +12,9 @@
12 * 04-07-1998 by Frank Denis : first step for rmdir/unlink. 12 * 04-07-1998 by Frank Denis : first step for rmdir/unlink.
13 */ 13 */
14 14
15#include <linux/time.h>
16#include <linux/fs.h>
17#include <linux/qnx4_fs.h>
18#include <linux/kernel.h>
19#include <linux/string.h>
20#include <linux/stat.h>
21#include <linux/fcntl.h>
22#include <linux/errno.h>
23#include <linux/smp_lock.h> 15#include <linux/smp_lock.h>
24#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
17#include "qnx4.h"
25 18
26 19
27/* 20/*
@@ -187,7 +180,7 @@ int qnx4_rmdir(struct inode *dir, struct dentry *dentry)
187 de->di_status = 0; 180 de->di_status = 0;
188 memset(de->di_fname, 0, sizeof de->di_fname); 181 memset(de->di_fname, 0, sizeof de->di_fname);
189 de->di_mode = 0; 182 de->di_mode = 0;
190 mark_buffer_dirty(bh); 183 mark_buffer_dirty_inode(bh, dir);
191 clear_nlink(inode); 184 clear_nlink(inode);
192 mark_inode_dirty(inode); 185 mark_inode_dirty(inode);
193 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; 186 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
@@ -232,7 +225,7 @@ int qnx4_unlink(struct inode *dir, struct dentry *dentry)
232 de->di_status = 0; 225 de->di_status = 0;
233 memset(de->di_fname, 0, sizeof de->di_fname); 226 memset(de->di_fname, 0, sizeof de->di_fname);
234 de->di_mode = 0; 227 de->di_mode = 0;
235 mark_buffer_dirty(bh); 228 mark_buffer_dirty_inode(bh, dir);
236 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; 229 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
237 mark_inode_dirty(dir); 230 mark_inode_dirty(dir);
238 inode->i_ctime = dir->i_ctime; 231 inode->i_ctime = dir->i_ctime;
diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h
new file mode 100644
index 000000000000..9efc089454f6
--- /dev/null
+++ b/fs/qnx4/qnx4.h
@@ -0,0 +1,57 @@
1#include <linux/fs.h>
2#include <linux/qnx4_fs.h>
3
4#define QNX4_DEBUG 0
5
6#if QNX4_DEBUG
7#define QNX4DEBUG(X) printk X
8#else
9#define QNX4DEBUG(X) (void) 0
10#endif
11
12struct qnx4_sb_info {
13 struct buffer_head *sb_buf; /* superblock buffer */
14 struct qnx4_super_block *sb; /* our superblock */
15 unsigned int Version; /* may be useful */
16 struct qnx4_inode_entry *BitMap; /* useful */
17};
18
19struct qnx4_inode_info {
20 struct qnx4_inode_entry raw;
21 loff_t mmu_private;
22 struct inode vfs_inode;
23};
24
25extern struct inode *qnx4_iget(struct super_block *, unsigned long);
26extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd);
27extern unsigned long qnx4_count_free_blocks(struct super_block *sb);
28extern unsigned long qnx4_block_map(struct inode *inode, long iblock);
29
30extern struct buffer_head *qnx4_bread(struct inode *, int, int);
31
32extern const struct inode_operations qnx4_file_inode_operations;
33extern const struct inode_operations qnx4_dir_inode_operations;
34extern const struct file_operations qnx4_file_operations;
35extern const struct file_operations qnx4_dir_operations;
36extern int qnx4_is_free(struct super_block *sb, long block);
37extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy);
38extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd);
39extern void qnx4_truncate(struct inode *inode);
40extern void qnx4_free_inode(struct inode *inode);
41extern int qnx4_unlink(struct inode *dir, struct dentry *dentry);
42extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry);
43
44static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb)
45{
46 return sb->s_fs_info;
47}
48
49static inline struct qnx4_inode_info *qnx4_i(struct inode *inode)
50{
51 return container_of(inode, struct qnx4_inode_info, vfs_inode);
52}
53
54static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode)
55{
56 return &qnx4_i(inode)->raw;
57}
diff --git a/fs/qnx4/truncate.c b/fs/qnx4/truncate.c
index 6437c1c3d1dd..d94d9ee241fe 100644
--- a/fs/qnx4/truncate.c
+++ b/fs/qnx4/truncate.c
@@ -10,12 +10,8 @@
10 * 30-06-1998 by Frank DENIS : ugly filler. 10 * 30-06-1998 by Frank DENIS : ugly filler.
11 */ 11 */
12 12
13#include <linux/types.h>
14#include <linux/errno.h>
15#include <linux/fs.h>
16#include <linux/qnx4_fs.h>
17#include <linux/smp_lock.h> 13#include <linux/smp_lock.h>
18#include <asm/uaccess.h> 14#include "qnx4.h"
19 15
20#ifdef CONFIG_QNX4FS_RW 16#ifdef CONFIG_QNX4FS_RW
21 17
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index b7f5a468f076..95c5b42384b2 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -159,10 +159,14 @@ static int check_quotactl_valid(struct super_block *sb, int type, int cmd,
159 return error; 159 return error;
160} 160}
161 161
162static void quota_sync_sb(struct super_block *sb, int type) 162#ifdef CONFIG_QUOTA
163void sync_quota_sb(struct super_block *sb, int type)
163{ 164{
164 int cnt; 165 int cnt;
165 166
167 if (!sb->s_qcop->quota_sync)
168 return;
169
166 sb->s_qcop->quota_sync(sb, type); 170 sb->s_qcop->quota_sync(sb, type);
167 171
168 if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE) 172 if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)
@@ -191,17 +195,13 @@ static void quota_sync_sb(struct super_block *sb, int type)
191 } 195 }
192 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 196 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
193} 197}
198#endif
194 199
195void sync_dquots(struct super_block *sb, int type) 200static void sync_dquots(int type)
196{ 201{
202 struct super_block *sb;
197 int cnt; 203 int cnt;
198 204
199 if (sb) {
200 if (sb->s_qcop->quota_sync)
201 quota_sync_sb(sb, type);
202 return;
203 }
204
205 spin_lock(&sb_lock); 205 spin_lock(&sb_lock);
206restart: 206restart:
207 list_for_each_entry(sb, &super_blocks, s_list) { 207 list_for_each_entry(sb, &super_blocks, s_list) {
@@ -222,8 +222,8 @@ restart:
222 sb->s_count++; 222 sb->s_count++;
223 spin_unlock(&sb_lock); 223 spin_unlock(&sb_lock);
224 down_read(&sb->s_umount); 224 down_read(&sb->s_umount);
225 if (sb->s_root && sb->s_qcop->quota_sync) 225 if (sb->s_root)
226 quota_sync_sb(sb, type); 226 sync_quota_sb(sb, type);
227 up_read(&sb->s_umount); 227 up_read(&sb->s_umount);
228 spin_lock(&sb_lock); 228 spin_lock(&sb_lock);
229 if (__put_super_and_need_restart(sb)) 229 if (__put_super_and_need_restart(sb))
@@ -301,7 +301,10 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
301 return sb->s_qcop->set_dqblk(sb, type, id, &idq); 301 return sb->s_qcop->set_dqblk(sb, type, id, &idq);
302 } 302 }
303 case Q_SYNC: 303 case Q_SYNC:
304 sync_dquots(sb, type); 304 if (sb)
305 sync_quota_sb(sb, type);
306 else
307 sync_dquots(type);
305 return 0; 308 return 0;
306 309
307 case Q_XQUOTAON: 310 case Q_XQUOTAON:
diff --git a/fs/read_write.c b/fs/read_write.c
index 9d1e76bb9ee1..6c8c55dec2bc 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -805,12 +805,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
805 goto out; 805 goto out;
806 if (!(in_file->f_mode & FMODE_READ)) 806 if (!(in_file->f_mode & FMODE_READ))
807 goto fput_in; 807 goto fput_in;
808 retval = -EINVAL;
809 in_inode = in_file->f_path.dentry->d_inode;
810 if (!in_inode)
811 goto fput_in;
812 if (!in_file->f_op || !in_file->f_op->splice_read)
813 goto fput_in;
814 retval = -ESPIPE; 808 retval = -ESPIPE;
815 if (!ppos) 809 if (!ppos)
816 ppos = &in_file->f_pos; 810 ppos = &in_file->f_pos;
@@ -834,6 +828,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
834 retval = -EINVAL; 828 retval = -EINVAL;
835 if (!out_file->f_op || !out_file->f_op->sendpage) 829 if (!out_file->f_op || !out_file->f_op->sendpage)
836 goto fput_out; 830 goto fput_out;
831 in_inode = in_file->f_path.dentry->d_inode;
837 out_inode = out_file->f_path.dentry->d_inode; 832 out_inode = out_file->f_path.dentry->d_inode;
838 retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); 833 retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
839 if (retval < 0) 834 if (retval < 0)
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 45ee3d357c70..6d2668fdc384 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -44,13 +44,11 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
44static inline bool is_privroot_deh(struct dentry *dir, 44static inline bool is_privroot_deh(struct dentry *dir,
45 struct reiserfs_de_head *deh) 45 struct reiserfs_de_head *deh)
46{ 46{
47 int ret = 0;
48#ifdef CONFIG_REISERFS_FS_XATTR
49 struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root; 47 struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
50 ret = (dir == dir->d_parent && privroot->d_inode && 48 if (reiserfs_expose_privroot(dir->d_sb))
51 deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); 49 return 0;
52#endif 50 return (dir == dir->d_parent && privroot->d_inode &&
53 return ret; 51 deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
54} 52}
55 53
56int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, 54int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3567fb9e3fb1..2969773cfc22 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -28,6 +28,7 @@
28#include <linux/mount.h> 28#include <linux/mount.h>
29#include <linux/namei.h> 29#include <linux/namei.h>
30#include <linux/crc32.h> 30#include <linux/crc32.h>
31#include <linux/smp_lock.h>
31 32
32struct file_system_type reiserfs_fs_type; 33struct file_system_type reiserfs_fs_type;
33 34
@@ -64,18 +65,15 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
64 65
65static int reiserfs_sync_fs(struct super_block *s, int wait) 66static int reiserfs_sync_fs(struct super_block *s, int wait)
66{ 67{
67 if (!(s->s_flags & MS_RDONLY)) { 68 struct reiserfs_transaction_handle th;
68 struct reiserfs_transaction_handle th; 69
69 reiserfs_write_lock(s); 70 reiserfs_write_lock(s);
70 if (!journal_begin(&th, s, 1)) 71 if (!journal_begin(&th, s, 1))
71 if (!journal_end_sync(&th, s, 1)) 72 if (!journal_end_sync(&th, s, 1))
72 reiserfs_flush_old_commits(s); 73 reiserfs_flush_old_commits(s);
73 s->s_dirt = 0; /* Even if it's not true. 74 s->s_dirt = 0; /* Even if it's not true.
74 * We'll loop forever in sync_supers otherwise */ 75 * We'll loop forever in sync_supers otherwise */
75 reiserfs_write_unlock(s); 76 reiserfs_write_unlock(s);
76 } else {
77 s->s_dirt = 0;
78 }
79 return 0; 77 return 0;
80} 78}
81 79
@@ -468,6 +466,11 @@ static void reiserfs_put_super(struct super_block *s)
468 struct reiserfs_transaction_handle th; 466 struct reiserfs_transaction_handle th;
469 th.t_trans_id = 0; 467 th.t_trans_id = 0;
470 468
469 lock_kernel();
470
471 if (s->s_dirt)
472 reiserfs_write_super(s);
473
471 /* change file system state to current state if it was mounted with read-write permissions */ 474 /* change file system state to current state if it was mounted with read-write permissions */
472 if (!(s->s_flags & MS_RDONLY)) { 475 if (!(s->s_flags & MS_RDONLY)) {
473 if (!journal_begin(&th, s, 10)) { 476 if (!journal_begin(&th, s, 10)) {
@@ -500,7 +503,7 @@ static void reiserfs_put_super(struct super_block *s)
500 kfree(s->s_fs_info); 503 kfree(s->s_fs_info);
501 s->s_fs_info = NULL; 504 s->s_fs_info = NULL;
502 505
503 return; 506 unlock_kernel();
504} 507}
505 508
506static struct kmem_cache *reiserfs_inode_cachep; 509static struct kmem_cache *reiserfs_inode_cachep;
@@ -898,6 +901,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
898 {"conv",.setmask = 1 << REISERFS_CONVERT}, 901 {"conv",.setmask = 1 << REISERFS_CONVERT},
899 {"attrs",.setmask = 1 << REISERFS_ATTRS}, 902 {"attrs",.setmask = 1 << REISERFS_ATTRS},
900 {"noattrs",.clrmask = 1 << REISERFS_ATTRS}, 903 {"noattrs",.clrmask = 1 << REISERFS_ATTRS},
904 {"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT},
901#ifdef CONFIG_REISERFS_FS_XATTR 905#ifdef CONFIG_REISERFS_FS_XATTR
902 {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER}, 906 {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER},
903 {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER}, 907 {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER},
@@ -1193,6 +1197,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1193 memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); 1197 memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
1194#endif 1198#endif
1195 1199
1200 lock_kernel();
1196 rs = SB_DISK_SUPER_BLOCK(s); 1201 rs = SB_DISK_SUPER_BLOCK(s);
1197 1202
1198 if (!reiserfs_parse_options 1203 if (!reiserfs_parse_options
@@ -1315,10 +1320,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1315 1320
1316out_ok: 1321out_ok:
1317 replace_mount_options(s, new_opts); 1322 replace_mount_options(s, new_opts);
1323 unlock_kernel();
1318 return 0; 1324 return 0;
1319 1325
1320out_err: 1326out_err:
1321 kfree(new_opts); 1327 kfree(new_opts);
1328 unlock_kernel();
1322 return err; 1329 return err;
1323} 1330}
1324 1331
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8e7deb0e6964..f3d47d856848 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -981,7 +981,8 @@ int reiserfs_lookup_privroot(struct super_block *s)
981 strlen(PRIVROOT_NAME)); 981 strlen(PRIVROOT_NAME));
982 if (!IS_ERR(dentry)) { 982 if (!IS_ERR(dentry)) {
983 REISERFS_SB(s)->priv_root = dentry; 983 REISERFS_SB(s)->priv_root = dentry;
984 s->s_root->d_op = &xattr_lookup_poison_ops; 984 if (!reiserfs_expose_privroot(s))
985 s->s_root->d_op = &xattr_lookup_poison_ops;
985 if (dentry->d_inode) 986 if (dentry->d_inode)
986 dentry->d_inode->i_flags |= S_PRIVATE; 987 dentry->d_inode->i_flags |= S_PRIVATE;
987 } else 988 } else
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index fc27fbfc5397..1402d2d54f52 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -474,6 +474,8 @@ smb_put_super(struct super_block *sb)
474{ 474{
475 struct smb_sb_info *server = SMB_SB(sb); 475 struct smb_sb_info *server = SMB_SB(sb);
476 476
477 lock_kernel();
478
477 smb_lock_server(server); 479 smb_lock_server(server);
478 server->state = CONN_INVALID; 480 server->state = CONN_INVALID;
479 smbiod_unregister_server(server); 481 smbiod_unregister_server(server);
@@ -489,6 +491,8 @@ smb_put_super(struct super_block *sb)
489 smb_unlock_server(server); 491 smb_unlock_server(server);
490 put_pid(server->conn_pid); 492 put_pid(server->conn_pid);
491 kfree(server); 493 kfree(server);
494
495 unlock_kernel();
492} 496}
493 497
494static int smb_fill_super(struct super_block *sb, void *raw_data, int silent) 498static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
diff --git a/fs/splice.c b/fs/splice.c
index 666953d59a35..73766d24f97b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -507,9 +507,131 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
507 507
508 return ret; 508 return ret;
509} 509}
510
511EXPORT_SYMBOL(generic_file_splice_read); 510EXPORT_SYMBOL(generic_file_splice_read);
512 511
512static const struct pipe_buf_operations default_pipe_buf_ops = {
513 .can_merge = 0,
514 .map = generic_pipe_buf_map,
515 .unmap = generic_pipe_buf_unmap,
516 .confirm = generic_pipe_buf_confirm,
517 .release = generic_pipe_buf_release,
518 .steal = generic_pipe_buf_steal,
519 .get = generic_pipe_buf_get,
520};
521
522static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
523 unsigned long vlen, loff_t offset)
524{
525 mm_segment_t old_fs;
526 loff_t pos = offset;
527 ssize_t res;
528
529 old_fs = get_fs();
530 set_fs(get_ds());
531 /* The cast to a user pointer is valid due to the set_fs() */
532 res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
533 set_fs(old_fs);
534
535 return res;
536}
537
538static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
539 loff_t pos)
540{
541 mm_segment_t old_fs;
542 ssize_t res;
543
544 old_fs = get_fs();
545 set_fs(get_ds());
546 /* The cast to a user pointer is valid due to the set_fs() */
547 res = vfs_write(file, (const char __user *)buf, count, &pos);
548 set_fs(old_fs);
549
550 return res;
551}
552
553ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
554 struct pipe_inode_info *pipe, size_t len,
555 unsigned int flags)
556{
557 unsigned int nr_pages;
558 unsigned int nr_freed;
559 size_t offset;
560 struct page *pages[PIPE_BUFFERS];
561 struct partial_page partial[PIPE_BUFFERS];
562 struct iovec vec[PIPE_BUFFERS];
563 pgoff_t index;
564 ssize_t res;
565 size_t this_len;
566 int error;
567 int i;
568 struct splice_pipe_desc spd = {
569 .pages = pages,
570 .partial = partial,
571 .flags = flags,
572 .ops = &default_pipe_buf_ops,
573 .spd_release = spd_release_page,
574 };
575
576 index = *ppos >> PAGE_CACHE_SHIFT;
577 offset = *ppos & ~PAGE_CACHE_MASK;
578 nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
579
580 for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
581 struct page *page;
582
583 page = alloc_page(GFP_USER);
584 error = -ENOMEM;
585 if (!page)
586 goto err;
587
588 this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
589 vec[i].iov_base = (void __user *) page_address(page);
590 vec[i].iov_len = this_len;
591 pages[i] = page;
592 spd.nr_pages++;
593 len -= this_len;
594 offset = 0;
595 }
596
597 res = kernel_readv(in, vec, spd.nr_pages, *ppos);
598 if (res < 0) {
599 error = res;
600 goto err;
601 }
602
603 error = 0;
604 if (!res)
605 goto err;
606
607 nr_freed = 0;
608 for (i = 0; i < spd.nr_pages; i++) {
609 this_len = min_t(size_t, vec[i].iov_len, res);
610 partial[i].offset = 0;
611 partial[i].len = this_len;
612 if (!this_len) {
613 __free_page(pages[i]);
614 pages[i] = NULL;
615 nr_freed++;
616 }
617 res -= this_len;
618 }
619 spd.nr_pages -= nr_freed;
620
621 res = splice_to_pipe(pipe, &spd);
622 if (res > 0)
623 *ppos += res;
624
625 return res;
626
627err:
628 for (i = 0; i < spd.nr_pages; i++)
629 __free_page(pages[i]);
630
631 return error;
632}
633EXPORT_SYMBOL(default_file_splice_read);
634
513/* 635/*
514 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' 636 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
515 * using sendpage(). Return the number of bytes sent. 637 * using sendpage(). Return the number of bytes sent.
@@ -881,6 +1003,36 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
881 1003
882EXPORT_SYMBOL(generic_file_splice_write); 1004EXPORT_SYMBOL(generic_file_splice_write);
883 1005
1006static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1007 struct splice_desc *sd)
1008{
1009 int ret;
1010 void *data;
1011
1012 ret = buf->ops->confirm(pipe, buf);
1013 if (ret)
1014 return ret;
1015
1016 data = buf->ops->map(pipe, buf, 0);
1017 ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
1018 buf->ops->unmap(pipe, buf, data);
1019
1020 return ret;
1021}
1022
1023static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
1024 struct file *out, loff_t *ppos,
1025 size_t len, unsigned int flags)
1026{
1027 ssize_t ret;
1028
1029 ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
1030 if (ret > 0)
1031 *ppos += ret;
1032
1033 return ret;
1034}
1035
884/** 1036/**
885 * generic_splice_sendpage - splice data from a pipe to a socket 1037 * generic_splice_sendpage - splice data from a pipe to a socket
886 * @pipe: pipe to splice from 1038 * @pipe: pipe to splice from
@@ -908,11 +1060,10 @@ EXPORT_SYMBOL(generic_splice_sendpage);
908static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, 1060static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
909 loff_t *ppos, size_t len, unsigned int flags) 1061 loff_t *ppos, size_t len, unsigned int flags)
910{ 1062{
1063 ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
1064 loff_t *, size_t, unsigned int);
911 int ret; 1065 int ret;
912 1066
913 if (unlikely(!out->f_op || !out->f_op->splice_write))
914 return -EINVAL;
915
916 if (unlikely(!(out->f_mode & FMODE_WRITE))) 1067 if (unlikely(!(out->f_mode & FMODE_WRITE)))
917 return -EBADF; 1068 return -EBADF;
918 1069
@@ -923,7 +1074,11 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
923 if (unlikely(ret < 0)) 1074 if (unlikely(ret < 0))
924 return ret; 1075 return ret;
925 1076
926 return out->f_op->splice_write(pipe, out, ppos, len, flags); 1077 splice_write = out->f_op->splice_write;
1078 if (!splice_write)
1079 splice_write = default_file_splice_write;
1080
1081 return splice_write(pipe, out, ppos, len, flags);
927} 1082}
928 1083
929/* 1084/*
@@ -933,11 +1088,10 @@ static long do_splice_to(struct file *in, loff_t *ppos,
933 struct pipe_inode_info *pipe, size_t len, 1088 struct pipe_inode_info *pipe, size_t len,
934 unsigned int flags) 1089 unsigned int flags)
935{ 1090{
1091 ssize_t (*splice_read)(struct file *, loff_t *,
1092 struct pipe_inode_info *, size_t, unsigned int);
936 int ret; 1093 int ret;
937 1094
938 if (unlikely(!in->f_op || !in->f_op->splice_read))
939 return -EINVAL;
940
941 if (unlikely(!(in->f_mode & FMODE_READ))) 1095 if (unlikely(!(in->f_mode & FMODE_READ)))
942 return -EBADF; 1096 return -EBADF;
943 1097
@@ -945,7 +1099,11 @@ static long do_splice_to(struct file *in, loff_t *ppos,
945 if (unlikely(ret < 0)) 1099 if (unlikely(ret < 0))
946 return ret; 1100 return ret;
947 1101
948 return in->f_op->splice_read(in, ppos, pipe, len, flags); 1102 splice_read = in->f_op->splice_read;
1103 if (!splice_read)
1104 splice_read = default_file_splice_read;
1105
1106 return splice_read(in, ppos, pipe, len, flags);
949} 1107}
950 1108
951/** 1109/**
@@ -1112,6 +1270,9 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1112 return ret; 1270 return ret;
1113} 1271}
1114 1272
1273static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1274 struct pipe_inode_info *opipe,
1275 size_t len, unsigned int flags);
1115/* 1276/*
1116 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same 1277 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1117 * location, so checking ->i_pipe is not enough to verify that this is a 1278 * location, so checking ->i_pipe is not enough to verify that this is a
@@ -1132,12 +1293,32 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1132 struct file *out, loff_t __user *off_out, 1293 struct file *out, loff_t __user *off_out,
1133 size_t len, unsigned int flags) 1294 size_t len, unsigned int flags)
1134{ 1295{
1135 struct pipe_inode_info *pipe; 1296 struct pipe_inode_info *ipipe;
1297 struct pipe_inode_info *opipe;
1136 loff_t offset, *off; 1298 loff_t offset, *off;
1137 long ret; 1299 long ret;
1138 1300
1139 pipe = pipe_info(in->f_path.dentry->d_inode); 1301 ipipe = pipe_info(in->f_path.dentry->d_inode);
1140 if (pipe) { 1302 opipe = pipe_info(out->f_path.dentry->d_inode);
1303
1304 if (ipipe && opipe) {
1305 if (off_in || off_out)
1306 return -ESPIPE;
1307
1308 if (!(in->f_mode & FMODE_READ))
1309 return -EBADF;
1310
1311 if (!(out->f_mode & FMODE_WRITE))
1312 return -EBADF;
1313
1314 /* Splicing to self would be fun, but... */
1315 if (ipipe == opipe)
1316 return -EINVAL;
1317
1318 return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1319 }
1320
1321 if (ipipe) {
1141 if (off_in) 1322 if (off_in)
1142 return -ESPIPE; 1323 return -ESPIPE;
1143 if (off_out) { 1324 if (off_out) {
@@ -1149,7 +1330,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1149 } else 1330 } else
1150 off = &out->f_pos; 1331 off = &out->f_pos;
1151 1332
1152 ret = do_splice_from(pipe, out, off, len, flags); 1333 ret = do_splice_from(ipipe, out, off, len, flags);
1153 1334
1154 if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) 1335 if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
1155 ret = -EFAULT; 1336 ret = -EFAULT;
@@ -1157,8 +1338,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1157 return ret; 1338 return ret;
1158 } 1339 }
1159 1340
1160 pipe = pipe_info(out->f_path.dentry->d_inode); 1341 if (opipe) {
1161 if (pipe) {
1162 if (off_out) 1342 if (off_out)
1163 return -ESPIPE; 1343 return -ESPIPE;
1164 if (off_in) { 1344 if (off_in) {
@@ -1170,7 +1350,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1170 } else 1350 } else
1171 off = &in->f_pos; 1351 off = &in->f_pos;
1172 1352
1173 ret = do_splice_to(in, off, pipe, len, flags); 1353 ret = do_splice_to(in, off, opipe, len, flags);
1174 1354
1175 if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) 1355 if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
1176 ret = -EFAULT; 1356 ret = -EFAULT;
@@ -1511,7 +1691,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1511 * Make sure there's data to read. Wait for input if we can, otherwise 1691 * Make sure there's data to read. Wait for input if we can, otherwise
1512 * return an appropriate error. 1692 * return an appropriate error.
1513 */ 1693 */
1514static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1694static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1515{ 1695{
1516 int ret; 1696 int ret;
1517 1697
@@ -1549,7 +1729,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1549 * Make sure there's writeable room. Wait for room if we can, otherwise 1729 * Make sure there's writeable room. Wait for room if we can, otherwise
1550 * return an appropriate error. 1730 * return an appropriate error.
1551 */ 1731 */
1552static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1732static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1553{ 1733{
1554 int ret; 1734 int ret;
1555 1735
@@ -1587,6 +1767,124 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1587} 1767}
1588 1768
1589/* 1769/*
1770 * Splice contents of ipipe to opipe.
1771 */
1772static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1773 struct pipe_inode_info *opipe,
1774 size_t len, unsigned int flags)
1775{
1776 struct pipe_buffer *ibuf, *obuf;
1777 int ret = 0, nbuf;
1778 bool input_wakeup = false;
1779
1780
1781retry:
1782 ret = ipipe_prep(ipipe, flags);
1783 if (ret)
1784 return ret;
1785
1786 ret = opipe_prep(opipe, flags);
1787 if (ret)
1788 return ret;
1789
1790 /*
1791 * Potential ABBA deadlock, work around it by ordering lock
1792 * grabbing by pipe info address. Otherwise two different processes
1793 * could deadlock (one doing tee from A -> B, the other from B -> A).
1794 */
1795 pipe_double_lock(ipipe, opipe);
1796
1797 do {
1798 if (!opipe->readers) {
1799 send_sig(SIGPIPE, current, 0);
1800 if (!ret)
1801 ret = -EPIPE;
1802 break;
1803 }
1804
1805 if (!ipipe->nrbufs && !ipipe->writers)
1806 break;
1807
1808 /*
1809 * Cannot make any progress, because either the input
1810 * pipe is empty or the output pipe is full.
1811 */
1812 if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) {
1813 /* Already processed some buffers, break */
1814 if (ret)
1815 break;
1816
1817 if (flags & SPLICE_F_NONBLOCK) {
1818 ret = -EAGAIN;
1819 break;
1820 }
1821
1822 /*
1823 * We raced with another reader/writer and haven't
1824 * managed to process any buffers. A zero return
1825 * value means EOF, so retry instead.
1826 */
1827 pipe_unlock(ipipe);
1828 pipe_unlock(opipe);
1829 goto retry;
1830 }
1831
1832 ibuf = ipipe->bufs + ipipe->curbuf;
1833 nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS;
1834 obuf = opipe->bufs + nbuf;
1835
1836 if (len >= ibuf->len) {
1837 /*
1838 * Simply move the whole buffer from ipipe to opipe
1839 */
1840 *obuf = *ibuf;
1841 ibuf->ops = NULL;
1842 opipe->nrbufs++;
1843 ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS;
1844 ipipe->nrbufs--;
1845 input_wakeup = true;
1846 } else {
1847 /*
1848 * Get a reference to this pipe buffer,
1849 * so we can copy the contents over.
1850 */
1851 ibuf->ops->get(ipipe, ibuf);
1852 *obuf = *ibuf;
1853
1854 /*
1855 * Don't inherit the gift flag, we need to
1856 * prevent multiple steals of this page.
1857 */
1858 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1859
1860 obuf->len = len;
1861 opipe->nrbufs++;
1862 ibuf->offset += obuf->len;
1863 ibuf->len -= obuf->len;
1864 }
1865 ret += obuf->len;
1866 len -= obuf->len;
1867 } while (len);
1868
1869 pipe_unlock(ipipe);
1870 pipe_unlock(opipe);
1871
1872 /*
1873 * If we put data in the output pipe, wakeup any potential readers.
1874 */
1875 if (ret > 0) {
1876 smp_mb();
1877 if (waitqueue_active(&opipe->wait))
1878 wake_up_interruptible(&opipe->wait);
1879 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1880 }
1881 if (input_wakeup)
1882 wakeup_pipe_writers(ipipe);
1883
1884 return ret;
1885}
1886
1887/*
1590 * Link contents of ipipe to opipe. 1888 * Link contents of ipipe to opipe.
1591 */ 1889 */
1592static int link_pipe(struct pipe_inode_info *ipipe, 1890static int link_pipe(struct pipe_inode_info *ipipe,
@@ -1690,9 +1988,9 @@ static long do_tee(struct file *in, struct file *out, size_t len,
1690 * Keep going, unless we encounter an error. The ipipe/opipe 1988 * Keep going, unless we encounter an error. The ipipe/opipe
1691 * ordering doesn't really matter. 1989 * ordering doesn't really matter.
1692 */ 1990 */
1693 ret = link_ipipe_prep(ipipe, flags); 1991 ret = ipipe_prep(ipipe, flags);
1694 if (!ret) { 1992 if (!ret) {
1695 ret = link_opipe_prep(opipe, flags); 1993 ret = opipe_prep(opipe, flags);
1696 if (!ret) 1994 if (!ret)
1697 ret = link_pipe(ipipe, opipe, len, flags); 1995 ret = link_pipe(ipipe, opipe, len, flags);
1698 } 1996 }
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 0adc624c956f..3b52770f46ff 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -338,6 +338,8 @@ static int squashfs_remount(struct super_block *sb, int *flags, char *data)
338 338
339static void squashfs_put_super(struct super_block *sb) 339static void squashfs_put_super(struct super_block *sb)
340{ 340{
341 lock_kernel();
342
341 if (sb->s_fs_info) { 343 if (sb->s_fs_info) {
342 struct squashfs_sb_info *sbi = sb->s_fs_info; 344 struct squashfs_sb_info *sbi = sb->s_fs_info;
343 squashfs_cache_delete(sbi->block_cache); 345 squashfs_cache_delete(sbi->block_cache);
@@ -350,6 +352,8 @@ static void squashfs_put_super(struct super_block *sb)
350 kfree(sb->s_fs_info); 352 kfree(sb->s_fs_info);
351 sb->s_fs_info = NULL; 353 sb->s_fs_info = NULL;
352 } 354 }
355
356 unlock_kernel();
353} 357}
354 358
355 359
diff --git a/fs/super.c b/fs/super.c
index 1943fdf655fa..83b47416d006 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -28,7 +28,6 @@
28#include <linux/blkdev.h> 28#include <linux/blkdev.h>
29#include <linux/quotaops.h> 29#include <linux/quotaops.h>
30#include <linux/namei.h> 30#include <linux/namei.h>
31#include <linux/buffer_head.h> /* for fsync_super() */
32#include <linux/mount.h> 31#include <linux/mount.h>
33#include <linux/security.h> 32#include <linux/security.h>
34#include <linux/syscalls.h> 33#include <linux/syscalls.h>
@@ -38,7 +37,6 @@
38#include <linux/kobject.h> 37#include <linux/kobject.h>
39#include <linux/mutex.h> 38#include <linux/mutex.h>
40#include <linux/file.h> 39#include <linux/file.h>
41#include <linux/async.h>
42#include <asm/uaccess.h> 40#include <asm/uaccess.h>
43#include "internal.h" 41#include "internal.h"
44 42
@@ -72,7 +70,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
72 INIT_HLIST_HEAD(&s->s_anon); 70 INIT_HLIST_HEAD(&s->s_anon);
73 INIT_LIST_HEAD(&s->s_inodes); 71 INIT_LIST_HEAD(&s->s_inodes);
74 INIT_LIST_HEAD(&s->s_dentry_lru); 72 INIT_LIST_HEAD(&s->s_dentry_lru);
75 INIT_LIST_HEAD(&s->s_async_list);
76 init_rwsem(&s->s_umount); 73 init_rwsem(&s->s_umount);
77 mutex_init(&s->s_lock); 74 mutex_init(&s->s_lock);
78 lockdep_set_class(&s->s_umount, &type->s_umount_key); 75 lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -285,38 +282,6 @@ void unlock_super(struct super_block * sb)
285EXPORT_SYMBOL(lock_super); 282EXPORT_SYMBOL(lock_super);
286EXPORT_SYMBOL(unlock_super); 283EXPORT_SYMBOL(unlock_super);
287 284
288/*
289 * Write out and wait upon all dirty data associated with this
290 * superblock. Filesystem data as well as the underlying block
291 * device. Takes the superblock lock. Requires a second blkdev
292 * flush by the caller to complete the operation.
293 */
294void __fsync_super(struct super_block *sb)
295{
296 sync_inodes_sb(sb, 0);
297 vfs_dq_sync(sb);
298 lock_super(sb);
299 if (sb->s_dirt && sb->s_op->write_super)
300 sb->s_op->write_super(sb);
301 unlock_super(sb);
302 if (sb->s_op->sync_fs)
303 sb->s_op->sync_fs(sb, 1);
304 sync_blockdev(sb->s_bdev);
305 sync_inodes_sb(sb, 1);
306}
307
308/*
309 * Write out and wait upon all dirty data associated with this
310 * superblock. Filesystem data as well as the underlying block
311 * device. Takes the superblock lock.
312 */
313int fsync_super(struct super_block *sb)
314{
315 __fsync_super(sb);
316 return sync_blockdev(sb->s_bdev);
317}
318EXPORT_SYMBOL_GPL(fsync_super);
319
320/** 285/**
321 * generic_shutdown_super - common helper for ->kill_sb() 286 * generic_shutdown_super - common helper for ->kill_sb()
322 * @sb: superblock to kill 287 * @sb: superblock to kill
@@ -338,21 +303,13 @@ void generic_shutdown_super(struct super_block *sb)
338 303
339 if (sb->s_root) { 304 if (sb->s_root) {
340 shrink_dcache_for_umount(sb); 305 shrink_dcache_for_umount(sb);
341 fsync_super(sb); 306 sync_filesystem(sb);
342 lock_super(sb); 307 get_fs_excl();
343 sb->s_flags &= ~MS_ACTIVE; 308 sb->s_flags &= ~MS_ACTIVE;
344 309
345 /*
346 * wait for asynchronous fs operations to finish before going further
347 */
348 async_synchronize_full_domain(&sb->s_async_list);
349
350 /* bad name - it should be evict_inodes() */ 310 /* bad name - it should be evict_inodes() */
351 invalidate_inodes(sb); 311 invalidate_inodes(sb);
352 lock_kernel();
353 312
354 if (sop->write_super && sb->s_dirt)
355 sop->write_super(sb);
356 if (sop->put_super) 313 if (sop->put_super)
357 sop->put_super(sb); 314 sop->put_super(sb);
358 315
@@ -362,9 +319,7 @@ void generic_shutdown_super(struct super_block *sb)
362 "Self-destruct in 5 seconds. Have a nice day...\n", 319 "Self-destruct in 5 seconds. Have a nice day...\n",
363 sb->s_id); 320 sb->s_id);
364 } 321 }
365 322 put_fs_excl();
366 unlock_kernel();
367 unlock_super(sb);
368 } 323 }
369 spin_lock(&sb_lock); 324 spin_lock(&sb_lock);
370 /* should be initialized for __put_super_and_need_restart() */ 325 /* should be initialized for __put_super_and_need_restart() */
@@ -441,16 +396,14 @@ void drop_super(struct super_block *sb)
441 396
442EXPORT_SYMBOL(drop_super); 397EXPORT_SYMBOL(drop_super);
443 398
444static inline void write_super(struct super_block *sb) 399/**
445{ 400 * sync_supers - helper for periodic superblock writeback
446 lock_super(sb); 401 *
447 if (sb->s_root && sb->s_dirt) 402 * Call the write_super method if present on all dirty superblocks in
448 if (sb->s_op->write_super) 403 * the system. This is for the periodic writeback used by most older
449 sb->s_op->write_super(sb); 404 * filesystems. For data integrity superblock writeback use
450 unlock_super(sb); 405 * sync_filesystems() instead.
451} 406 *
452
453/*
454 * Note: check the dirty flag before waiting, so we don't 407 * Note: check the dirty flag before waiting, so we don't
455 * hold up the sync while mounting a device. (The newly 408 * hold up the sync while mounting a device. (The newly
456 * mounted device won't need syncing.) 409 * mounted device won't need syncing.)
@@ -462,12 +415,15 @@ void sync_supers(void)
462 spin_lock(&sb_lock); 415 spin_lock(&sb_lock);
463restart: 416restart:
464 list_for_each_entry(sb, &super_blocks, s_list) { 417 list_for_each_entry(sb, &super_blocks, s_list) {
465 if (sb->s_dirt) { 418 if (sb->s_op->write_super && sb->s_dirt) {
466 sb->s_count++; 419 sb->s_count++;
467 spin_unlock(&sb_lock); 420 spin_unlock(&sb_lock);
421
468 down_read(&sb->s_umount); 422 down_read(&sb->s_umount);
469 write_super(sb); 423 if (sb->s_root && sb->s_dirt)
424 sb->s_op->write_super(sb);
470 up_read(&sb->s_umount); 425 up_read(&sb->s_umount);
426
471 spin_lock(&sb_lock); 427 spin_lock(&sb_lock);
472 if (__put_super_and_need_restart(sb)) 428 if (__put_super_and_need_restart(sb))
473 goto restart; 429 goto restart;
@@ -476,60 +432,6 @@ restart:
476 spin_unlock(&sb_lock); 432 spin_unlock(&sb_lock);
477} 433}
478 434
479/*
480 * Call the ->sync_fs super_op against all filesystems which are r/w and
481 * which implement it.
482 *
483 * This operation is careful to avoid the livelock which could easily happen
484 * if two or more filesystems are being continuously dirtied. s_need_sync_fs
485 * is used only here. We set it against all filesystems and then clear it as
486 * we sync them. So redirtied filesystems are skipped.
487 *
488 * But if process A is currently running sync_filesystems and then process B
489 * calls sync_filesystems as well, process B will set all the s_need_sync_fs
490 * flags again, which will cause process A to resync everything. Fix that with
491 * a local mutex.
492 *
493 * (Fabian) Avoid sync_fs with clean fs & wait mode 0
494 */
495void sync_filesystems(int wait)
496{
497 struct super_block *sb;
498 static DEFINE_MUTEX(mutex);
499
500 mutex_lock(&mutex); /* Could be down_interruptible */
501 spin_lock(&sb_lock);
502 list_for_each_entry(sb, &super_blocks, s_list) {
503 if (!sb->s_op->sync_fs)
504 continue;
505 if (sb->s_flags & MS_RDONLY)
506 continue;
507 sb->s_need_sync_fs = 1;
508 }
509
510restart:
511 list_for_each_entry(sb, &super_blocks, s_list) {
512 if (!sb->s_need_sync_fs)
513 continue;
514 sb->s_need_sync_fs = 0;
515 if (sb->s_flags & MS_RDONLY)
516 continue; /* hm. Was remounted r/o meanwhile */
517 sb->s_count++;
518 spin_unlock(&sb_lock);
519 down_read(&sb->s_umount);
520 async_synchronize_full_domain(&sb->s_async_list);
521 if (sb->s_root && (wait || sb->s_dirt))
522 sb->s_op->sync_fs(sb, wait);
523 up_read(&sb->s_umount);
524 /* restart only when sb is no longer on the list */
525 spin_lock(&sb_lock);
526 if (__put_super_and_need_restart(sb))
527 goto restart;
528 }
529 spin_unlock(&sb_lock);
530 mutex_unlock(&mutex);
531}
532
533/** 435/**
534 * get_super - get the superblock of a device 436 * get_super - get the superblock of a device
535 * @bdev: device to get the superblock for 437 * @bdev: device to get the superblock for
@@ -616,45 +518,6 @@ out:
616} 518}
617 519
618/** 520/**
619 * mark_files_ro - mark all files read-only
620 * @sb: superblock in question
621 *
622 * All files are marked read-only. We don't care about pending
623 * delete files so this should be used in 'force' mode only.
624 */
625
626static void mark_files_ro(struct super_block *sb)
627{
628 struct file *f;
629
630retry:
631 file_list_lock();
632 list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
633 struct vfsmount *mnt;
634 if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
635 continue;
636 if (!file_count(f))
637 continue;
638 if (!(f->f_mode & FMODE_WRITE))
639 continue;
640 f->f_mode &= ~FMODE_WRITE;
641 if (file_check_writeable(f) != 0)
642 continue;
643 file_release_write(f);
644 mnt = mntget(f->f_path.mnt);
645 file_list_unlock();
646 /*
647 * This can sleep, so we can't hold
648 * the file_list_lock() spinlock.
649 */
650 mnt_drop_write(mnt);
651 mntput(mnt);
652 goto retry;
653 }
654 file_list_unlock();
655}
656
657/**
658 * do_remount_sb - asks filesystem to change mount options. 521 * do_remount_sb - asks filesystem to change mount options.
659 * @sb: superblock in question 522 * @sb: superblock in question
660 * @flags: numeric part of options 523 * @flags: numeric part of options
@@ -675,27 +538,31 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
675 if (flags & MS_RDONLY) 538 if (flags & MS_RDONLY)
676 acct_auto_close(sb); 539 acct_auto_close(sb);
677 shrink_dcache_sb(sb); 540 shrink_dcache_sb(sb);
678 fsync_super(sb); 541 sync_filesystem(sb);
679 542
680 /* If we are remounting RDONLY and current sb is read/write, 543 /* If we are remounting RDONLY and current sb is read/write,
681 make sure there are no rw files opened */ 544 make sure there are no rw files opened */
682 if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) { 545 if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
683 if (force) 546 if (force)
684 mark_files_ro(sb); 547 mark_files_ro(sb);
685 else if (!fs_may_remount_ro(sb)) 548 else if (!fs_may_remount_ro(sb)) {
549 unlock_kernel();
686 return -EBUSY; 550 return -EBUSY;
551 }
687 retval = vfs_dq_off(sb, 1); 552 retval = vfs_dq_off(sb, 1);
688 if (retval < 0 && retval != -ENOSYS) 553 if (retval < 0 && retval != -ENOSYS) {
554 unlock_kernel();
689 return -EBUSY; 555 return -EBUSY;
556 }
690 } 557 }
691 remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY); 558 remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
692 559
693 if (sb->s_op->remount_fs) { 560 if (sb->s_op->remount_fs) {
694 lock_super(sb);
695 retval = sb->s_op->remount_fs(sb, &flags, data); 561 retval = sb->s_op->remount_fs(sb, &flags, data);
696 unlock_super(sb); 562 if (retval) {
697 if (retval) 563 unlock_kernel();
698 return retval; 564 return retval;
565 }
699 } 566 }
700 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); 567 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
701 if (remount_rw) 568 if (remount_rw)
@@ -711,18 +578,17 @@ static void do_emergency_remount(struct work_struct *work)
711 list_for_each_entry(sb, &super_blocks, s_list) { 578 list_for_each_entry(sb, &super_blocks, s_list) {
712 sb->s_count++; 579 sb->s_count++;
713 spin_unlock(&sb_lock); 580 spin_unlock(&sb_lock);
714 down_read(&sb->s_umount); 581 down_write(&sb->s_umount);
715 if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { 582 if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) {
716 /* 583 /*
717 * ->remount_fs needs lock_kernel(). 584 * ->remount_fs needs lock_kernel().
718 * 585 *
719 * What lock protects sb->s_flags?? 586 * What lock protects sb->s_flags??
720 */ 587 */
721 lock_kernel();
722 do_remount_sb(sb, MS_RDONLY, NULL, 1); 588 do_remount_sb(sb, MS_RDONLY, NULL, 1);
723 unlock_kernel();
724 } 589 }
725 drop_super(sb); 590 up_write(&sb->s_umount);
591 put_super(sb);
726 spin_lock(&sb_lock); 592 spin_lock(&sb_lock);
727 } 593 }
728 spin_unlock(&sb_lock); 594 spin_unlock(&sb_lock);
diff --git a/fs/sync.c b/fs/sync.c
index 7abc65fbf21d..dd200025af85 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -13,38 +13,123 @@
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/quotaops.h> 14#include <linux/quotaops.h>
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include "internal.h"
16 17
17#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ 18#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
18 SYNC_FILE_RANGE_WAIT_AFTER) 19 SYNC_FILE_RANGE_WAIT_AFTER)
19 20
20/* 21/*
21 * sync everything. Start out by waking pdflush, because that writes back 22 * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0)
22 * all queues in parallel. 23 * just dirties buffers with inodes so we have to submit IO for these buffers
24 * via __sync_blockdev(). This also speeds up the wait == 1 case since in that
25 * case write_inode() functions do sync_dirty_buffer() and thus effectively
26 * write one block at a time.
23 */ 27 */
24static void do_sync(unsigned long wait) 28static int __sync_filesystem(struct super_block *sb, int wait)
25{ 29{
26 wakeup_pdflush(0); 30 /* Avoid doing twice syncing and cache pruning for quota sync */
27 sync_inodes(0); /* All mappings, inodes and their blockdevs */
28 vfs_dq_sync(NULL);
29 sync_supers(); /* Write the superblocks */
30 sync_filesystems(0); /* Start syncing the filesystems */
31 sync_filesystems(wait); /* Waitingly sync the filesystems */
32 sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */
33 if (!wait) 31 if (!wait)
34 printk("Emergency Sync complete\n"); 32 writeout_quota_sb(sb, -1);
35 if (unlikely(laptop_mode)) 33 else
36 laptop_sync_completion(); 34 sync_quota_sb(sb, -1);
35 sync_inodes_sb(sb, wait);
36 if (sb->s_op->sync_fs)
37 sb->s_op->sync_fs(sb, wait);
38 return __sync_blockdev(sb->s_bdev, wait);
39}
40
41/*
42 * Write out and wait upon all dirty data associated with this
43 * superblock. Filesystem data as well as the underlying block
44 * device. Takes the superblock lock.
45 */
46int sync_filesystem(struct super_block *sb)
47{
48 int ret;
49
50 /*
51 * We need to be protected against the filesystem going from
52 * r/o to r/w or vice versa.
53 */
54 WARN_ON(!rwsem_is_locked(&sb->s_umount));
55
56 /*
57 * No point in syncing out anything if the filesystem is read-only.
58 */
59 if (sb->s_flags & MS_RDONLY)
60 return 0;
61
62 ret = __sync_filesystem(sb, 0);
63 if (ret < 0)
64 return ret;
65 return __sync_filesystem(sb, 1);
66}
67EXPORT_SYMBOL_GPL(sync_filesystem);
68
69/*
70 * Sync all the data for all the filesystems (called by sys_sync() and
71 * emergency sync)
72 *
73 * This operation is careful to avoid the livelock which could easily happen
74 * if two or more filesystems are being continuously dirtied. s_need_sync
75 * is used only here. We set it against all filesystems and then clear it as
76 * we sync them. So redirtied filesystems are skipped.
77 *
78 * But if process A is currently running sync_filesystems and then process B
79 * calls sync_filesystems as well, process B will set all the s_need_sync
80 * flags again, which will cause process A to resync everything. Fix that with
81 * a local mutex.
82 */
83static void sync_filesystems(int wait)
84{
85 struct super_block *sb;
86 static DEFINE_MUTEX(mutex);
87
88 mutex_lock(&mutex); /* Could be down_interruptible */
89 spin_lock(&sb_lock);
90 list_for_each_entry(sb, &super_blocks, s_list)
91 sb->s_need_sync = 1;
92
93restart:
94 list_for_each_entry(sb, &super_blocks, s_list) {
95 if (!sb->s_need_sync)
96 continue;
97 sb->s_need_sync = 0;
98 sb->s_count++;
99 spin_unlock(&sb_lock);
100
101 down_read(&sb->s_umount);
102 if (!(sb->s_flags & MS_RDONLY) && sb->s_root)
103 __sync_filesystem(sb, wait);
104 up_read(&sb->s_umount);
105
106 /* restart only when sb is no longer on the list */
107 spin_lock(&sb_lock);
108 if (__put_super_and_need_restart(sb))
109 goto restart;
110 }
111 spin_unlock(&sb_lock);
112 mutex_unlock(&mutex);
37} 113}
38 114
39SYSCALL_DEFINE0(sync) 115SYSCALL_DEFINE0(sync)
40{ 116{
41 do_sync(1); 117 sync_filesystems(0);
118 sync_filesystems(1);
119 if (unlikely(laptop_mode))
120 laptop_sync_completion();
42 return 0; 121 return 0;
43} 122}
44 123
45static void do_sync_work(struct work_struct *work) 124static void do_sync_work(struct work_struct *work)
46{ 125{
47 do_sync(0); 126 /*
127 * Sync twice to reduce the possibility we skipped some inodes / pages
128 * because they were temporarily locked
129 */
130 sync_filesystems(0);
131 sync_filesystems(0);
132 printk("Emergency Sync complete\n");
48 kfree(work); 133 kfree(work);
49} 134}
50 135
@@ -75,10 +160,8 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
75 160
76 /* sync the superblock to buffers */ 161 /* sync the superblock to buffers */
77 sb = inode->i_sb; 162 sb = inode->i_sb;
78 lock_super(sb);
79 if (sb->s_dirt && sb->s_op->write_super) 163 if (sb->s_dirt && sb->s_op->write_super)
80 sb->s_op->write_super(sb); 164 sb->s_op->write_super(sb);
81 unlock_super(sb);
82 165
83 /* .. finally sync the buffers to disk */ 166 /* .. finally sync the buffers to disk */
84 err = sync_blockdev(sb->s_bdev); 167 err = sync_blockdev(sb->s_bdev);
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 56f655254bfe..c7798079e644 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -24,7 +24,7 @@ static int sysv_readdir(struct file *, void *, filldir_t);
24const struct file_operations sysv_dir_operations = { 24const struct file_operations sysv_dir_operations = {
25 .read = generic_read_dir, 25 .read = generic_read_dir,
26 .readdir = sysv_readdir, 26 .readdir = sysv_readdir,
27 .fsync = sysv_sync_file, 27 .fsync = simple_fsync,
28}; 28};
29 29
30static inline void dir_put_page(struct page *page) 30static inline void dir_put_page(struct page *page)
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 589be21d884e..96340c01f4a7 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = {
26 .write = do_sync_write, 26 .write = do_sync_write,
27 .aio_write = generic_file_aio_write, 27 .aio_write = generic_file_aio_write,
28 .mmap = generic_file_mmap, 28 .mmap = generic_file_mmap,
29 .fsync = sysv_sync_file, 29 .fsync = simple_fsync,
30 .splice_read = generic_file_splice_read, 30 .splice_read = generic_file_splice_read,
31}; 31};
32 32
@@ -34,18 +34,3 @@ const struct inode_operations sysv_file_inode_operations = {
34 .truncate = sysv_truncate, 34 .truncate = sysv_truncate,
35 .getattr = sysv_getattr, 35 .getattr = sysv_getattr,
36}; 36};
37
38int sysv_sync_file(struct file * file, struct dentry *dentry, int datasync)
39{
40 struct inode *inode = dentry->d_inode;
41 int err;
42
43 err = sync_mapping_buffers(inode->i_mapping);
44 if (!(inode->i_state & I_DIRTY))
45 return err;
46 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
47 return err;
48
49 err |= sysv_sync_inode(inode);
50 return err ? -EIO : 0;
51}
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index da20b48d350f..479923456a54 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -31,15 +31,13 @@
31#include <asm/byteorder.h> 31#include <asm/byteorder.h>
32#include "sysv.h" 32#include "sysv.h"
33 33
34/* This is only called on sync() and umount(), when s_dirt=1. */ 34static int sysv_sync_fs(struct super_block *sb, int wait)
35static void sysv_write_super(struct super_block *sb)
36{ 35{
37 struct sysv_sb_info *sbi = SYSV_SB(sb); 36 struct sysv_sb_info *sbi = SYSV_SB(sb);
38 unsigned long time = get_seconds(), old_time; 37 unsigned long time = get_seconds(), old_time;
39 38
39 lock_super(sb);
40 lock_kernel(); 40 lock_kernel();
41 if (sb->s_flags & MS_RDONLY)
42 goto clean;
43 41
44 /* 42 /*
45 * If we are going to write out the super block, 43 * If we are going to write out the super block,
@@ -53,18 +51,30 @@ static void sysv_write_super(struct super_block *sb)
53 *sbi->s_sb_time = cpu_to_fs32(sbi, time); 51 *sbi->s_sb_time = cpu_to_fs32(sbi, time);
54 mark_buffer_dirty(sbi->s_bh2); 52 mark_buffer_dirty(sbi->s_bh2);
55 } 53 }
56clean: 54
57 sb->s_dirt = 0;
58 unlock_kernel(); 55 unlock_kernel();
56 unlock_super(sb);
57
58 return 0;
59}
60
61static void sysv_write_super(struct super_block *sb)
62{
63 if (!(sb->s_flags & MS_RDONLY))
64 sysv_sync_fs(sb, 1);
65 else
66 sb->s_dirt = 0;
59} 67}
60 68
61static int sysv_remount(struct super_block *sb, int *flags, char *data) 69static int sysv_remount(struct super_block *sb, int *flags, char *data)
62{ 70{
63 struct sysv_sb_info *sbi = SYSV_SB(sb); 71 struct sysv_sb_info *sbi = SYSV_SB(sb);
72 lock_super(sb);
64 if (sbi->s_forced_ro) 73 if (sbi->s_forced_ro)
65 *flags |= MS_RDONLY; 74 *flags |= MS_RDONLY;
66 if (!(*flags & MS_RDONLY)) 75 if (!(*flags & MS_RDONLY))
67 sb->s_dirt = 1; 76 sb->s_dirt = 1;
77 unlock_super(sb);
68 return 0; 78 return 0;
69} 79}
70 80
@@ -72,6 +82,11 @@ static void sysv_put_super(struct super_block *sb)
72{ 82{
73 struct sysv_sb_info *sbi = SYSV_SB(sb); 83 struct sysv_sb_info *sbi = SYSV_SB(sb);
74 84
85 lock_kernel();
86
87 if (sb->s_dirt)
88 sysv_write_super(sb);
89
75 if (!(sb->s_flags & MS_RDONLY)) { 90 if (!(sb->s_flags & MS_RDONLY)) {
76 /* XXX ext2 also updates the state here */ 91 /* XXX ext2 also updates the state here */
77 mark_buffer_dirty(sbi->s_bh1); 92 mark_buffer_dirty(sbi->s_bh1);
@@ -84,6 +99,8 @@ static void sysv_put_super(struct super_block *sb)
84 brelse(sbi->s_bh2); 99 brelse(sbi->s_bh2);
85 100
86 kfree(sbi); 101 kfree(sbi);
102
103 unlock_kernel();
87} 104}
88 105
89static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) 106static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -236,7 +253,7 @@ bad_inode:
236 return ERR_PTR(-EIO); 253 return ERR_PTR(-EIO);
237} 254}
238 255
239static struct buffer_head * sysv_update_inode(struct inode * inode) 256int sysv_write_inode(struct inode *inode, int wait)
240{ 257{
241 struct super_block * sb = inode->i_sb; 258 struct super_block * sb = inode->i_sb;
242 struct sysv_sb_info * sbi = SYSV_SB(sb); 259 struct sysv_sb_info * sbi = SYSV_SB(sb);
@@ -244,19 +261,21 @@ static struct buffer_head * sysv_update_inode(struct inode * inode)
244 struct sysv_inode * raw_inode; 261 struct sysv_inode * raw_inode;
245 struct sysv_inode_info * si; 262 struct sysv_inode_info * si;
246 unsigned int ino, block; 263 unsigned int ino, block;
264 int err = 0;
247 265
248 ino = inode->i_ino; 266 ino = inode->i_ino;
249 if (!ino || ino > sbi->s_ninodes) { 267 if (!ino || ino > sbi->s_ninodes) {
250 printk("Bad inode number on dev %s: %d is out of range\n", 268 printk("Bad inode number on dev %s: %d is out of range\n",
251 inode->i_sb->s_id, ino); 269 inode->i_sb->s_id, ino);
252 return NULL; 270 return -EIO;
253 } 271 }
254 raw_inode = sysv_raw_inode(sb, ino, &bh); 272 raw_inode = sysv_raw_inode(sb, ino, &bh);
255 if (!raw_inode) { 273 if (!raw_inode) {
256 printk("unable to read i-node block\n"); 274 printk("unable to read i-node block\n");
257 return NULL; 275 return -EIO;
258 } 276 }
259 277
278 lock_kernel();
260 raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode); 279 raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode);
261 raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(inode->i_uid)); 280 raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(inode->i_uid));
262 raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(inode->i_gid)); 281 raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(inode->i_gid));
@@ -272,38 +291,23 @@ static struct buffer_head * sysv_update_inode(struct inode * inode)
272 for (block = 0; block < 10+1+1+1; block++) 291 for (block = 0; block < 10+1+1+1; block++)
273 write3byte(sbi, (u8 *)&si->i_data[block], 292 write3byte(sbi, (u8 *)&si->i_data[block],
274 &raw_inode->i_data[3*block]); 293 &raw_inode->i_data[3*block]);
294 unlock_kernel();
275 mark_buffer_dirty(bh); 295 mark_buffer_dirty(bh);
276 return bh; 296 if (wait) {
277} 297 sync_dirty_buffer(bh);
278 298 if (buffer_req(bh) && !buffer_uptodate(bh)) {
279int sysv_write_inode(struct inode * inode, int wait) 299 printk ("IO error syncing sysv inode [%s:%08x]\n",
280{ 300 sb->s_id, ino);
281 struct buffer_head *bh; 301 err = -EIO;
282 lock_kernel(); 302 }
283 bh = sysv_update_inode(inode); 303 }
284 brelse(bh); 304 brelse(bh);
285 unlock_kernel();
286 return 0; 305 return 0;
287} 306}
288 307
289int sysv_sync_inode(struct inode * inode) 308int sysv_sync_inode(struct inode *inode)
290{ 309{
291 int err = 0; 310 return sysv_write_inode(inode, 1);
292 struct buffer_head *bh;
293
294 bh = sysv_update_inode(inode);
295 if (bh && buffer_dirty(bh)) {
296 sync_dirty_buffer(bh);
297 if (buffer_req(bh) && !buffer_uptodate(bh)) {
298 printk ("IO error syncing sysv inode [%s:%08lx]\n",
299 inode->i_sb->s_id, inode->i_ino);
300 err = -1;
301 }
302 }
303 else if (!bh)
304 err = -1;
305 brelse (bh);
306 return err;
307} 311}
308 312
309static void sysv_delete_inode(struct inode *inode) 313static void sysv_delete_inode(struct inode *inode)
@@ -347,6 +351,7 @@ const struct super_operations sysv_sops = {
347 .delete_inode = sysv_delete_inode, 351 .delete_inode = sysv_delete_inode,
348 .put_super = sysv_put_super, 352 .put_super = sysv_put_super,
349 .write_super = sysv_write_super, 353 .write_super = sysv_write_super,
354 .sync_fs = sysv_sync_fs,
350 .remount_fs = sysv_remount, 355 .remount_fs = sysv_remount,
351 .statfs = sysv_statfs, 356 .statfs = sysv_statfs,
352}; 357};
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 5784a318c883..53786eb5cf60 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -144,7 +144,6 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping,
144extern struct inode *sysv_iget(struct super_block *, unsigned int); 144extern struct inode *sysv_iget(struct super_block *, unsigned int);
145extern int sysv_write_inode(struct inode *, int); 145extern int sysv_write_inode(struct inode *, int);
146extern int sysv_sync_inode(struct inode *); 146extern int sysv_sync_inode(struct inode *);
147extern int sysv_sync_file(struct file *, struct dentry *, int);
148extern void sysv_set_inode(struct inode *, dev_t); 147extern void sysv_set_inode(struct inode *, dev_t);
149extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *); 148extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *);
150extern int sysv_init_icache(void); 149extern int sysv_init_icache(void);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index e9f7a754c4f7..3589eab02a2f 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -36,6 +36,7 @@
36#include <linux/mount.h> 36#include <linux/mount.h>
37#include <linux/math64.h> 37#include <linux/math64.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/smp_lock.h>
39#include "ubifs.h" 40#include "ubifs.h"
40 41
41/* 42/*
@@ -447,9 +448,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
447 if (!wait) 448 if (!wait)
448 return 0; 449 return 0;
449 450
450 if (sb->s_flags & MS_RDONLY)
451 return 0;
452
453 /* 451 /*
454 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and 452 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
455 * pages, so synchronize them first, then commit the journal. Strictly 453 * pages, so synchronize them first, then commit the journal. Strictly
@@ -1687,6 +1685,9 @@ static void ubifs_put_super(struct super_block *sb)
1687 1685
1688 ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num, 1686 ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num,
1689 c->vi.vol_id); 1687 c->vi.vol_id);
1688
1689 lock_kernel();
1690
1690 /* 1691 /*
1691 * The following asserts are only valid if there has not been a failure 1692 * The following asserts are only valid if there has not been a failure
1692 * of the media. For example, there will be dirty inodes if we failed 1693 * of the media. For example, there will be dirty inodes if we failed
@@ -1753,6 +1754,8 @@ static void ubifs_put_super(struct super_block *sb)
1753 ubi_close_volume(c->ubi); 1754 ubi_close_volume(c->ubi);
1754 mutex_unlock(&c->umount_mutex); 1755 mutex_unlock(&c->umount_mutex);
1755 kfree(c); 1756 kfree(c);
1757
1758 unlock_kernel();
1756} 1759}
1757 1760
1758static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) 1761static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
@@ -1768,17 +1771,22 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1768 return err; 1771 return err;
1769 } 1772 }
1770 1773
1774 lock_kernel();
1771 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 1775 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
1772 if (c->ro_media) { 1776 if (c->ro_media) {
1773 ubifs_msg("cannot re-mount due to prior errors"); 1777 ubifs_msg("cannot re-mount due to prior errors");
1778 unlock_kernel();
1774 return -EROFS; 1779 return -EROFS;
1775 } 1780 }
1776 err = ubifs_remount_rw(c); 1781 err = ubifs_remount_rw(c);
1777 if (err) 1782 if (err) {
1783 unlock_kernel();
1778 return err; 1784 return err;
1785 }
1779 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { 1786 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
1780 if (c->ro_media) { 1787 if (c->ro_media) {
1781 ubifs_msg("cannot re-mount due to prior errors"); 1788 ubifs_msg("cannot re-mount due to prior errors");
1789 unlock_kernel();
1782 return -EROFS; 1790 return -EROFS;
1783 } 1791 }
1784 ubifs_remount_ro(c); 1792 ubifs_remount_ro(c);
@@ -1793,6 +1801,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1793 } 1801 }
1794 1802
1795 ubifs_assert(c->lst.taken_empty_lebs > 0); 1803 ubifs_assert(c->lst.taken_empty_lebs > 0);
1804 unlock_kernel();
1796 return 0; 1805 return 0;
1797} 1806}
1798 1807
diff --git a/fs/udf/Makefile b/fs/udf/Makefile
index 0d4503f7446d..eb880f66c23a 100644
--- a/fs/udf/Makefile
+++ b/fs/udf/Makefile
@@ -5,5 +5,5 @@
5obj-$(CONFIG_UDF_FS) += udf.o 5obj-$(CONFIG_UDF_FS) += udf.o
6 6
7udf-objs := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \ 7udf-objs := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \
8 partition.o super.o truncate.o symlink.o fsync.o \ 8 partition.o super.o truncate.o symlink.o \
9 directory.o misc.o udftime.o unicode.o 9 directory.o misc.o udftime.o unicode.o
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 2efd4d5291b6..61d9a76a3a69 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -210,5 +210,5 @@ const struct file_operations udf_dir_operations = {
210 .read = generic_read_dir, 210 .read = generic_read_dir,
211 .readdir = udf_readdir, 211 .readdir = udf_readdir,
212 .ioctl = udf_ioctl, 212 .ioctl = udf_ioctl,
213 .fsync = udf_fsync_file, 213 .fsync = simple_fsync,
214}; 214};
diff --git a/fs/udf/file.c b/fs/udf/file.c
index eb91f3b70320..7464305382b5 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -209,7 +209,7 @@ const struct file_operations udf_file_operations = {
209 .write = do_sync_write, 209 .write = do_sync_write,
210 .aio_write = udf_file_aio_write, 210 .aio_write = udf_file_aio_write,
211 .release = udf_release_file, 211 .release = udf_release_file,
212 .fsync = udf_fsync_file, 212 .fsync = simple_fsync,
213 .splice_read = generic_file_splice_read, 213 .splice_read = generic_file_splice_read,
214 .llseek = generic_file_llseek, 214 .llseek = generic_file_llseek,
215}; 215};
diff --git a/fs/udf/fsync.c b/fs/udf/fsync.c
deleted file mode 100644
index b2c472b733b8..000000000000
--- a/fs/udf/fsync.c
+++ /dev/null
@@ -1,52 +0,0 @@
1/*
2 * fsync.c
3 *
4 * PURPOSE
5 * Fsync handling routines for the OSTA-UDF(tm) filesystem.
6 *
7 * COPYRIGHT
8 * This file is distributed under the terms of the GNU General Public
9 * License (GPL). Copies of the GPL can be obtained from:
10 * ftp://prep.ai.mit.edu/pub/gnu/GPL
11 * Each contributing author retains all rights to their own work.
12 *
13 * (C) 1999-2001 Ben Fennema
14 * (C) 1999-2000 Stelias Computing Inc
15 *
16 * HISTORY
17 *
18 * 05/22/99 blf Created.
19 */
20
21#include "udfdecl.h"
22
23#include <linux/fs.h>
24
25static int udf_fsync_inode(struct inode *, int);
26
27/*
28 * File may be NULL when we are called. Perhaps we shouldn't
29 * even pass file to fsync ?
30 */
31
32int udf_fsync_file(struct file *file, struct dentry *dentry, int datasync)
33{
34 struct inode *inode = dentry->d_inode;
35
36 return udf_fsync_inode(inode, datasync);
37}
38
39static int udf_fsync_inode(struct inode *inode, int datasync)
40{
41 int err;
42
43 err = sync_mapping_buffers(inode->i_mapping);
44 if (!(inode->i_state & I_DIRTY))
45 return err;
46 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
47 return err;
48
49 err |= udf_sync_inode(inode);
50
51 return err ? -EIO : 0;
52}
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 72348cc855a4..6832135159b6 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -568,6 +568,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
568 if (!udf_parse_options(options, &uopt, true)) 568 if (!udf_parse_options(options, &uopt, true))
569 return -EINVAL; 569 return -EINVAL;
570 570
571 lock_kernel();
571 sbi->s_flags = uopt.flags; 572 sbi->s_flags = uopt.flags;
572 sbi->s_uid = uopt.uid; 573 sbi->s_uid = uopt.uid;
573 sbi->s_gid = uopt.gid; 574 sbi->s_gid = uopt.gid;
@@ -581,13 +582,16 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
581 *flags |= MS_RDONLY; 582 *flags |= MS_RDONLY;
582 } 583 }
583 584
584 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 585 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
586 unlock_kernel();
585 return 0; 587 return 0;
588 }
586 if (*flags & MS_RDONLY) 589 if (*flags & MS_RDONLY)
587 udf_close_lvid(sb); 590 udf_close_lvid(sb);
588 else 591 else
589 udf_open_lvid(sb); 592 udf_open_lvid(sb);
590 593
594 unlock_kernel();
591 return 0; 595 return 0;
592} 596}
593 597
@@ -1915,7 +1919,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1915 if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) { 1919 if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
1916 ret = udf_load_vrs(sb, &uopt, silent, &fileset); 1920 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1917 } else { 1921 } else {
1918 uopt.blocksize = bdev_hardsect_size(sb->s_bdev); 1922 uopt.blocksize = bdev_logical_block_size(sb->s_bdev);
1919 ret = udf_load_vrs(sb, &uopt, silent, &fileset); 1923 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1920 if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { 1924 if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
1921 if (!silent) 1925 if (!silent)
@@ -2062,6 +2066,9 @@ static void udf_put_super(struct super_block *sb)
2062 struct udf_sb_info *sbi; 2066 struct udf_sb_info *sbi;
2063 2067
2064 sbi = UDF_SB(sb); 2068 sbi = UDF_SB(sb);
2069
2070 lock_kernel();
2071
2065 if (sbi->s_vat_inode) 2072 if (sbi->s_vat_inode)
2066 iput(sbi->s_vat_inode); 2073 iput(sbi->s_vat_inode);
2067 if (sbi->s_partitions) 2074 if (sbi->s_partitions)
@@ -2077,6 +2084,8 @@ static void udf_put_super(struct super_block *sb)
2077 kfree(sbi->s_partmaps); 2084 kfree(sbi->s_partmaps);
2078 kfree(sb->s_fs_info); 2085 kfree(sb->s_fs_info);
2079 sb->s_fs_info = NULL; 2086 sb->s_fs_info = NULL;
2087
2088 unlock_kernel();
2080} 2089}
2081 2090
2082static int udf_sync_fs(struct super_block *sb, int wait) 2091static int udf_sync_fs(struct super_block *sb, int wait)
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index cac51b77a5d1..8d46f4294ee7 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -223,9 +223,6 @@ extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t,
223extern int udf_new_block(struct super_block *, struct inode *, uint16_t, 223extern int udf_new_block(struct super_block *, struct inode *, uint16_t,
224 uint32_t, int *); 224 uint32_t, int *);
225 225
226/* fsync.c */
227extern int udf_fsync_file(struct file *, struct dentry *, int);
228
229/* directory.c */ 226/* directory.c */
230extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *, 227extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *,
231 struct udf_fileident_bh *, 228 struct udf_fileident_bh *,
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 6321b797061b..6f671f1ac271 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -666,6 +666,6 @@ not_empty:
666const struct file_operations ufs_dir_operations = { 666const struct file_operations ufs_dir_operations = {
667 .read = generic_read_dir, 667 .read = generic_read_dir,
668 .readdir = ufs_readdir, 668 .readdir = ufs_readdir,
669 .fsync = ufs_sync_file, 669 .fsync = simple_fsync,
670 .llseek = generic_file_llseek, 670 .llseek = generic_file_llseek,
671}; 671};
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 2bd3a1615714..73655c61240a 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,31 +24,10 @@
24 */ 24 */
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/buffer_head.h> /* for sync_mapping_buffers() */
28 27
29#include "ufs_fs.h" 28#include "ufs_fs.h"
30#include "ufs.h" 29#include "ufs.h"
31 30
32
33int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync)
34{
35 struct inode *inode = dentry->d_inode;
36 int err;
37 int ret;
38
39 ret = sync_mapping_buffers(inode->i_mapping);
40 if (!(inode->i_state & I_DIRTY))
41 return ret;
42 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
43 return ret;
44
45 err = ufs_sync_inode(inode);
46 if (ret == 0)
47 ret = err;
48 return ret;
49}
50
51
52/* 31/*
53 * We have mostly NULL's here: the current defaults are ok for 32 * We have mostly NULL's here: the current defaults are ok for
54 * the ufs filesystem. 33 * the ufs filesystem.
@@ -62,6 +41,6 @@ const struct file_operations ufs_file_operations = {
62 .aio_write = generic_file_aio_write, 41 .aio_write = generic_file_aio_write,
63 .mmap = generic_file_mmap, 42 .mmap = generic_file_mmap,
64 .open = generic_file_open, 43 .open = generic_file_open,
65 .fsync = ufs_sync_file, 44 .fsync = simple_fsync,
66 .splice_read = generic_file_splice_read, 45 .splice_read = generic_file_splice_read,
67}; 46};
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 60359291761f..5faed7954d0a 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -263,6 +263,7 @@ void ufs_panic (struct super_block * sb, const char * function,
263 struct ufs_super_block_first * usb1; 263 struct ufs_super_block_first * usb1;
264 va_list args; 264 va_list args;
265 265
266 lock_kernel();
266 uspi = UFS_SB(sb)->s_uspi; 267 uspi = UFS_SB(sb)->s_uspi;
267 usb1 = ubh_get_usb_first(uspi); 268 usb1 = ubh_get_usb_first(uspi);
268 269
@@ -594,6 +595,9 @@ static void ufs_put_super_internal(struct super_block *sb)
594 595
595 596
596 UFSD("ENTER\n"); 597 UFSD("ENTER\n");
598
599 lock_kernel();
600
597 ufs_put_cstotal(sb); 601 ufs_put_cstotal(sb);
598 size = uspi->s_cssize; 602 size = uspi->s_cssize;
599 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; 603 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
@@ -621,6 +625,9 @@ static void ufs_put_super_internal(struct super_block *sb)
621 brelse (sbi->s_ucg[i]); 625 brelse (sbi->s_ucg[i]);
622 kfree (sbi->s_ucg); 626 kfree (sbi->s_ucg);
623 kfree (base); 627 kfree (base);
628
629 unlock_kernel();
630
624 UFSD("EXIT\n"); 631 UFSD("EXIT\n");
625} 632}
626 633
@@ -1118,32 +1125,45 @@ failed_nomem:
1118 return -ENOMEM; 1125 return -ENOMEM;
1119} 1126}
1120 1127
1121static void ufs_write_super(struct super_block *sb) 1128static int ufs_sync_fs(struct super_block *sb, int wait)
1122{ 1129{
1123 struct ufs_sb_private_info * uspi; 1130 struct ufs_sb_private_info * uspi;
1124 struct ufs_super_block_first * usb1; 1131 struct ufs_super_block_first * usb1;
1125 struct ufs_super_block_third * usb3; 1132 struct ufs_super_block_third * usb3;
1126 unsigned flags; 1133 unsigned flags;
1127 1134
1135 lock_super(sb);
1128 lock_kernel(); 1136 lock_kernel();
1137
1129 UFSD("ENTER\n"); 1138 UFSD("ENTER\n");
1139
1130 flags = UFS_SB(sb)->s_flags; 1140 flags = UFS_SB(sb)->s_flags;
1131 uspi = UFS_SB(sb)->s_uspi; 1141 uspi = UFS_SB(sb)->s_uspi;
1132 usb1 = ubh_get_usb_first(uspi); 1142 usb1 = ubh_get_usb_first(uspi);
1133 usb3 = ubh_get_usb_third(uspi); 1143 usb3 = ubh_get_usb_third(uspi);
1134 1144
1135 if (!(sb->s_flags & MS_RDONLY)) { 1145 usb1->fs_time = cpu_to_fs32(sb, get_seconds());
1136 usb1->fs_time = cpu_to_fs32(sb, get_seconds()); 1146 if ((flags & UFS_ST_MASK) == UFS_ST_SUN ||
1137 if ((flags & UFS_ST_MASK) == UFS_ST_SUN 1147 (flags & UFS_ST_MASK) == UFS_ST_SUNOS ||
1138 || (flags & UFS_ST_MASK) == UFS_ST_SUNOS 1148 (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
1139 || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) 1149 ufs_set_fs_state(sb, usb1, usb3,
1140 ufs_set_fs_state(sb, usb1, usb3, 1150 UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
1141 UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); 1151 ufs_put_cstotal(sb);
1142 ufs_put_cstotal(sb);
1143 }
1144 sb->s_dirt = 0; 1152 sb->s_dirt = 0;
1153
1145 UFSD("EXIT\n"); 1154 UFSD("EXIT\n");
1146 unlock_kernel(); 1155 unlock_kernel();
1156 unlock_super(sb);
1157
1158 return 0;
1159}
1160
1161static void ufs_write_super(struct super_block *sb)
1162{
1163 if (!(sb->s_flags & MS_RDONLY))
1164 ufs_sync_fs(sb, 1);
1165 else
1166 sb->s_dirt = 0;
1147} 1167}
1148 1168
1149static void ufs_put_super(struct super_block *sb) 1169static void ufs_put_super(struct super_block *sb)
@@ -1152,6 +1172,9 @@ static void ufs_put_super(struct super_block *sb)
1152 1172
1153 UFSD("ENTER\n"); 1173 UFSD("ENTER\n");
1154 1174
1175 if (sb->s_dirt)
1176 ufs_write_super(sb);
1177
1155 if (!(sb->s_flags & MS_RDONLY)) 1178 if (!(sb->s_flags & MS_RDONLY))
1156 ufs_put_super_internal(sb); 1179 ufs_put_super_internal(sb);
1157 1180
@@ -1171,7 +1194,9 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1171 struct ufs_super_block_third * usb3; 1194 struct ufs_super_block_third * usb3;
1172 unsigned new_mount_opt, ufstype; 1195 unsigned new_mount_opt, ufstype;
1173 unsigned flags; 1196 unsigned flags;
1174 1197
1198 lock_kernel();
1199 lock_super(sb);
1175 uspi = UFS_SB(sb)->s_uspi; 1200 uspi = UFS_SB(sb)->s_uspi;
1176 flags = UFS_SB(sb)->s_flags; 1201 flags = UFS_SB(sb)->s_flags;
1177 usb1 = ubh_get_usb_first(uspi); 1202 usb1 = ubh_get_usb_first(uspi);
@@ -1184,17 +1209,24 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1184 ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; 1209 ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE;
1185 new_mount_opt = 0; 1210 new_mount_opt = 0;
1186 ufs_set_opt (new_mount_opt, ONERROR_LOCK); 1211 ufs_set_opt (new_mount_opt, ONERROR_LOCK);
1187 if (!ufs_parse_options (data, &new_mount_opt)) 1212 if (!ufs_parse_options (data, &new_mount_opt)) {
1213 unlock_super(sb);
1214 unlock_kernel();
1188 return -EINVAL; 1215 return -EINVAL;
1216 }
1189 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { 1217 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
1190 new_mount_opt |= ufstype; 1218 new_mount_opt |= ufstype;
1191 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { 1219 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
1192 printk("ufstype can't be changed during remount\n"); 1220 printk("ufstype can't be changed during remount\n");
1221 unlock_super(sb);
1222 unlock_kernel();
1193 return -EINVAL; 1223 return -EINVAL;
1194 } 1224 }
1195 1225
1196 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 1226 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
1197 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1227 UFS_SB(sb)->s_mount_opt = new_mount_opt;
1228 unlock_super(sb);
1229 unlock_kernel();
1198 return 0; 1230 return 0;
1199 } 1231 }
1200 1232
@@ -1219,6 +1251,8 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1219#ifndef CONFIG_UFS_FS_WRITE 1251#ifndef CONFIG_UFS_FS_WRITE
1220 printk("ufs was compiled with read-only support, " 1252 printk("ufs was compiled with read-only support, "
1221 "can't be mounted as read-write\n"); 1253 "can't be mounted as read-write\n");
1254 unlock_super(sb);
1255 unlock_kernel();
1222 return -EINVAL; 1256 return -EINVAL;
1223#else 1257#else
1224 if (ufstype != UFS_MOUNT_UFSTYPE_SUN && 1258 if (ufstype != UFS_MOUNT_UFSTYPE_SUN &&
@@ -1227,16 +1261,22 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1227 ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && 1261 ufstype != UFS_MOUNT_UFSTYPE_SUNx86 &&
1228 ufstype != UFS_MOUNT_UFSTYPE_UFS2) { 1262 ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
1229 printk("this ufstype is read-only supported\n"); 1263 printk("this ufstype is read-only supported\n");
1264 unlock_super(sb);
1265 unlock_kernel();
1230 return -EINVAL; 1266 return -EINVAL;
1231 } 1267 }
1232 if (!ufs_read_cylinder_structures(sb)) { 1268 if (!ufs_read_cylinder_structures(sb)) {
1233 printk("failed during remounting\n"); 1269 printk("failed during remounting\n");
1270 unlock_super(sb);
1271 unlock_kernel();
1234 return -EPERM; 1272 return -EPERM;
1235 } 1273 }
1236 sb->s_flags &= ~MS_RDONLY; 1274 sb->s_flags &= ~MS_RDONLY;
1237#endif 1275#endif
1238 } 1276 }
1239 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1277 UFS_SB(sb)->s_mount_opt = new_mount_opt;
1278 unlock_super(sb);
1279 unlock_kernel();
1240 return 0; 1280 return 0;
1241} 1281}
1242 1282
@@ -1352,6 +1392,7 @@ static const struct super_operations ufs_super_ops = {
1352 .delete_inode = ufs_delete_inode, 1392 .delete_inode = ufs_delete_inode,
1353 .put_super = ufs_put_super, 1393 .put_super = ufs_put_super,
1354 .write_super = ufs_write_super, 1394 .write_super = ufs_write_super,
1395 .sync_fs = ufs_sync_fs,
1355 .statfs = ufs_statfs, 1396 .statfs = ufs_statfs,
1356 .remount_fs = ufs_remount, 1397 .remount_fs = ufs_remount,
1357 .show_options = ufs_show_options, 1398 .show_options = ufs_show_options,
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index d0c4acd4f1f3..644e77e13599 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -99,7 +99,6 @@ extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
99extern const struct inode_operations ufs_file_inode_operations; 99extern const struct inode_operations ufs_file_inode_operations;
100extern const struct file_operations ufs_file_operations; 100extern const struct file_operations ufs_file_operations;
101extern const struct address_space_operations ufs_aops; 101extern const struct address_space_operations ufs_aops;
102extern int ufs_sync_file(struct file *, struct dentry *, int);
103 102
104/* ialloc.c */ 103/* ialloc.c */
105extern void ufs_free_inode (struct inode *inode); 104extern void ufs_free_inode (struct inode *inode);
diff --git a/fs/xattr.c b/fs/xattr.c
index d51b8f9db921..1c3d0af59ddf 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -297,7 +297,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
297 return error; 297 return error;
298 dentry = f->f_path.dentry; 298 dentry = f->f_path.dentry;
299 audit_inode(NULL, dentry); 299 audit_inode(NULL, dentry);
300 error = mnt_want_write(f->f_path.mnt); 300 error = mnt_want_write_file(f);
301 if (!error) { 301 if (!error) {
302 error = setxattr(dentry, name, value, size, flags); 302 error = setxattr(dentry, name, value, size, flags);
303 mnt_drop_write(f->f_path.mnt); 303 mnt_drop_write(f->f_path.mnt);
@@ -524,7 +524,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
524 return error; 524 return error;
525 dentry = f->f_path.dentry; 525 dentry = f->f_path.dentry;
526 audit_inode(NULL, dentry); 526 audit_inode(NULL, dentry);
527 error = mnt_want_write(f->f_path.mnt); 527 error = mnt_want_write_file(f);
528 if (!error) { 528 if (!error) {
529 error = removexattr(dentry, name); 529 error = removexattr(dentry, name);
530 mnt_drop_write(f->f_path.mnt); 530 mnt_drop_write(f->f_path.mnt);
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index e28800a9f2b5..1418b916fc27 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1501,7 +1501,7 @@ xfs_setsize_buftarg_early(
1501 struct block_device *bdev) 1501 struct block_device *bdev)
1502{ 1502{
1503 return xfs_setsize_buftarg_flags(btp, 1503 return xfs_setsize_buftarg_flags(btp,
1504 PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0); 1504 PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0);
1505} 1505}
1506 1506
1507int 1507int
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 36fb8a657c55..2e09efbca8db 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1121,15 +1121,6 @@ xfs_fs_put_super(
1121 kfree(mp); 1121 kfree(mp);
1122} 1122}
1123 1123
1124STATIC void
1125xfs_fs_write_super(
1126 struct super_block *sb)
1127{
1128 if (!(sb->s_flags & MS_RDONLY))
1129 xfs_sync_fsdata(XFS_M(sb), 0);
1130 sb->s_dirt = 0;
1131}
1132
1133STATIC int 1124STATIC int
1134xfs_fs_sync_super( 1125xfs_fs_sync_super(
1135 struct super_block *sb, 1126 struct super_block *sb,
@@ -1154,7 +1145,6 @@ xfs_fs_sync_super(
1154 error = xfs_quiesce_data(mp); 1145 error = xfs_quiesce_data(mp);
1155 else 1146 else
1156 error = xfs_sync_fsdata(mp, 0); 1147 error = xfs_sync_fsdata(mp, 0);
1157 sb->s_dirt = 0;
1158 1148
1159 if (unlikely(laptop_mode)) { 1149 if (unlikely(laptop_mode)) {
1160 int prev_sync_seq = mp->m_sync_seq; 1150 int prev_sync_seq = mp->m_sync_seq;
@@ -1461,7 +1451,6 @@ xfs_fs_fill_super(
1461 1451
1462 XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname); 1452 XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname);
1463 1453
1464 sb->s_dirt = 1;
1465 sb->s_magic = XFS_SB_MAGIC; 1454 sb->s_magic = XFS_SB_MAGIC;
1466 sb->s_blocksize = mp->m_sb.sb_blocksize; 1455 sb->s_blocksize = mp->m_sb.sb_blocksize;
1467 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; 1456 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
@@ -1549,7 +1538,6 @@ static struct super_operations xfs_super_operations = {
1549 .write_inode = xfs_fs_write_inode, 1538 .write_inode = xfs_fs_write_inode,
1550 .clear_inode = xfs_fs_clear_inode, 1539 .clear_inode = xfs_fs_clear_inode,
1551 .put_super = xfs_fs_put_super, 1540 .put_super = xfs_fs_put_super,
1552 .write_super = xfs_fs_write_super,
1553 .sync_fs = xfs_fs_sync_super, 1541 .sync_fs = xfs_fs_sync_super,
1554 .freeze_fs = xfs_fs_freeze, 1542 .freeze_fs = xfs_fs_freeze,
1555 .statfs = xfs_fs_statfs, 1543 .statfs = xfs_fs_statfs,
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index fffabc0aefcb..66b849358e62 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -628,8 +628,6 @@ xfs_trans_apply_sb_deltas(
628 xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount), 628 xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount),
629 offsetof(xfs_dsb_t, sb_frextents) + 629 offsetof(xfs_dsb_t, sb_frextents) +
630 sizeof(sbp->sb_frextents) - 1); 630 sizeof(sbp->sb_frextents) - 1);
631
632 tp->t_mountp->m_super->s_dirt = 1;
633} 631}
634 632
635/* 633/*