path: root/fs
author		David S. Miller <davem@davemloft.net>	2010-06-26 13:27:00 -0400
committer	David S. Miller <davem@davemloft.net>	2010-06-26 13:27:00 -0400
commit		c67dda14389205f0a223c5089307495290939b3b (patch)
tree		fad0bb26b28703d02a22ebdd44d94eabac4a2ade /fs
parent		43bc2db47292a824152145253b1dd2847e7312a3 (diff)
parent		7e27d6e778cd87b6f2415515d7127eba53fe5d02 (diff)
Merge branch 'master' of /home/davem/src/GIT/linux-2.6/
Diffstat (limited to 'fs')
-rw-r--r--	fs/9p/vfs_file.c	6
-rw-r--r--	fs/adfs/dir.c	2
-rw-r--r--	fs/adfs/file.c	2
-rw-r--r--	fs/adfs/inode.c	3
-rw-r--r--	fs/affs/affs.h	2
-rw-r--r--	fs/affs/file.c	4
-rw-r--r--	fs/affs/namei.c	2
-rw-r--r--	fs/afs/internal.h	2
-rw-r--r--	fs/afs/server.c	5
-rw-r--r--	fs/afs/write.c	3
-rw-r--r--	fs/aio.c	71
-rw-r--r--	fs/anon_inodes.c	2
-rw-r--r--	fs/attr.c	50
-rw-r--r--	fs/autofs/root.c	1
-rw-r--r--	fs/autofs4/dev-ioctl.c	13
-rw-r--r--	fs/bad_inode.c	3
-rw-r--r--	fs/bfs/dir.c	2
-rw-r--r--	fs/binfmt_elf_fdpic.c	26
-rw-r--r--	fs/binfmt_flat.c	25
-rw-r--r--	fs/block_dev.c	88
-rw-r--r--	fs/btrfs/acl.c	8
-rw-r--r--	fs/btrfs/async-thread.c	1
-rw-r--r--	fs/btrfs/btrfs_inode.h	3
-rw-r--r--	fs/btrfs/ctree.c	109
-rw-r--r--	fs/btrfs/ctree.h	165
-rw-r--r--	fs/btrfs/delayed-ref.c	101
-rw-r--r--	fs/btrfs/delayed-ref.h	3
-rw-r--r--	fs/btrfs/disk-io.c	180
-rw-r--r--	fs/btrfs/disk-io.h	4
-rw-r--r--	fs/btrfs/extent-tree.c	2256
-rw-r--r--	fs/btrfs/extent_io.c	85
-rw-r--r--	fs/btrfs/extent_io.h	14
-rw-r--r--	fs/btrfs/file-item.c	28
-rw-r--r--	fs/btrfs/file.c	181
-rw-r--r--	fs/btrfs/inode-item.c	27
-rw-r--r--	fs/btrfs/inode.c	1713
-rw-r--r--	fs/btrfs/ioctl.c	208
-rw-r--r--	fs/btrfs/ordered-data.c	82
-rw-r--r--	fs/btrfs/ordered-data.h	9
-rw-r--r--	fs/btrfs/relocation.c	1974
-rw-r--r--	fs/btrfs/root-tree.c	26
-rw-r--r--	fs/btrfs/super.c	36
-rw-r--r--	fs/btrfs/transaction.c	232
-rw-r--r--	fs/btrfs/transaction.h	24
-rw-r--r--	fs/btrfs/tree-defrag.c	7
-rw-r--r--	fs/btrfs/tree-log.c	241
-rw-r--r--	fs/btrfs/tree-log.h	2
-rw-r--r--	fs/btrfs/volumes.c	17
-rw-r--r--	fs/btrfs/xattr.c	12
-rw-r--r--	fs/buffer.c	123
-rw-r--r--	fs/ceph/auth.c	7
-rw-r--r--	fs/ceph/auth.h	6
-rw-r--r--	fs/ceph/auth_none.c	8
-rw-r--r--	fs/ceph/auth_x.c	12
-rw-r--r--	fs/ceph/caps.c	97
-rw-r--r--	fs/ceph/ceph_fs.h	21
-rw-r--r--	fs/ceph/dir.c	7
-rw-r--r--	fs/ceph/export.c	2
-rw-r--r--	fs/ceph/file.c	2
-rw-r--r--	fs/ceph/inode.c	4
-rw-r--r--	fs/ceph/mds_client.c	49
-rw-r--r--	fs/ceph/mds_client.h	6
-rw-r--r--	fs/ceph/messenger.c	6
-rw-r--r--	fs/ceph/messenger.h	1
-rw-r--r--	fs/ceph/mon_client.c	7
-rw-r--r--	fs/ceph/osd_client.c	7
-rw-r--r--	fs/ceph/osdmap.c	2
-rw-r--r--	fs/ceph/super.c	16
-rw-r--r--	fs/ceph/super.h	3
-rw-r--r--	fs/cifs/cifsfs.h	2
-rw-r--r--	fs/cifs/file.c	5
-rw-r--r--	fs/coda/coda_int.h	3
-rw-r--r--	fs/coda/file.c	4
-rw-r--r--	fs/compat.c	132
-rw-r--r--	fs/configfs/inode.c	14
-rw-r--r--	fs/debugfs/file.c	21
-rw-r--r--	fs/direct-io.c	123
-rw-r--r--	fs/ecryptfs/file.c	2
-rw-r--r--	fs/ecryptfs/inode.c	4
-rw-r--r--	fs/exec.c	195
-rw-r--r--	fs/exofs/file.c	7
-rw-r--r--	fs/ext2/ext2.h	3
-rw-r--r--	fs/ext2/file.c	7
-rw-r--r--	fs/ext2/inode.c	153
-rw-r--r--	fs/ext2/super.c	20
-rw-r--r--	fs/ext3/dir.c	2
-rw-r--r--	fs/ext3/fsync.c	4
-rw-r--r--	fs/ext3/super.c	38
-rw-r--r--	fs/ext4/balloc.c	5
-rw-r--r--	fs/ext4/block_validity.c	4
-rw-r--r--	fs/ext4/dir.c	26
-rw-r--r--	fs/ext4/ext4.h	169
-rw-r--r--	fs/ext4/ext4_jbd2.h	8
-rw-r--r--	fs/ext4/extents.c	417
-rw-r--r--	fs/ext4/file.c	2
-rw-r--r--	fs/ext4/fsync.c	41
-rw-r--r--	fs/ext4/ialloc.c	89
-rw-r--r--	fs/ext4/inode.c	763
-rw-r--r--	fs/ext4/ioctl.c	27
-rw-r--r--	fs/ext4/mballoc.c	120
-rw-r--r--	fs/ext4/migrate.c	2
-rw-r--r--	fs/ext4/move_extent.c	16
-rw-r--r--	fs/ext4/namei.c	61
-rw-r--r--	fs/ext4/resize.c	3
-rw-r--r--	fs/ext4/super.c	117
-rw-r--r--	fs/ext4/symlink.c	2
-rw-r--r--	fs/ext4/xattr.c	39
-rw-r--r--	fs/fat/fat.h	6
-rw-r--r--	fs/fat/file.c	40
-rw-r--r--	fs/fat/inode.c	35
-rw-r--r--	fs/fcntl.c	7
-rw-r--r--	fs/file_table.c	21
-rw-r--r--	fs/freevxfs/vxfs_lookup.c	2
-rw-r--r--	fs/fs-writeback.c	64
-rw-r--r--	fs/fscache/object-list.c	2
-rw-r--r--	fs/fscache/page.c	36
-rw-r--r--	fs/fuse/dev.c	527
-rw-r--r--	fs/fuse/dir.c	5
-rw-r--r--	fs/fuse/file.c	48
-rw-r--r--	fs/fuse/fuse_i.h	6
-rw-r--r--	fs/gfs2/aops.c	8
-rw-r--r--	fs/gfs2/file.c	4
-rw-r--r--	fs/gfs2/ops_inode.c	5
-rw-r--r--	fs/hostfs/hostfs_kern.c	4
-rw-r--r--	fs/hpfs/file.c	4
-rw-r--r--	fs/hpfs/hpfs_fn.h	2
-rw-r--r--	fs/hppfs/hppfs.c	2
-rw-r--r--	fs/hugetlbfs/inode.c	2
-rw-r--r--	fs/isofs/dir.c	1
-rw-r--r--	fs/jbd2/transaction.c	5
-rw-r--r--	fs/jffs2/acl.c	3
-rw-r--r--	fs/jffs2/dir.c	127
-rw-r--r--	fs/jffs2/file.c	4
-rw-r--r--	fs/jffs2/fs.c	11
-rw-r--r--	fs/jffs2/os-linux.h	2
-rw-r--r--	fs/jfs/file.c	4
-rw-r--r--	fs/jfs/jfs_inode.h	2
-rw-r--r--	fs/jfs/super.c	16
-rw-r--r--	fs/libfs.c	111
-rw-r--r--	fs/logfs/file.c	4
-rw-r--r--	fs/logfs/logfs.h	2
-rw-r--r--	fs/minix/dir.c	11
-rw-r--r--	fs/minix/file.c	2
-rw-r--r--	fs/minix/itree_v2.c	27
-rw-r--r--	fs/namei.c	2
-rw-r--r--	fs/ncpfs/dir.c	1
-rw-r--r--	fs/ncpfs/file.c	2
-rw-r--r--	fs/nfs/dir.c	7
-rw-r--r--	fs/nfs/file.c	5
-rw-r--r--	fs/nfs/write.c	20
-rw-r--r--	fs/nfsd/nfs4state.c	2
-rw-r--r--	fs/nfsd/vfs.c	3
-rw-r--r--	fs/nilfs2/btree.h	2
-rw-r--r--	fs/nilfs2/file.c	4
-rw-r--r--	fs/nilfs2/nilfs.h	2
-rw-r--r--	fs/nilfs2/segbuf.h	2
-rw-r--r--	fs/nilfs2/segment.h	2
-rw-r--r--	fs/nilfs2/super.c	8
-rw-r--r--	fs/ntfs/dir.c	5
-rw-r--r--	fs/ntfs/file.c	9
-rw-r--r--	fs/ocfs2/file.c	15
-rw-r--r--	fs/ocfs2/super.c	50
-rw-r--r--	fs/omfs/file.c	2
-rw-r--r--	fs/pipe.c	102
-rw-r--r--	fs/proc/array.c	4
-rw-r--r--	fs/proc/base.c	16
-rw-r--r--	fs/proc/generic.c	15
-rw-r--r--	fs/proc/kcore.c	2
-rw-r--r--	fs/proc/root.c	1
-rw-r--r--	fs/qnx4/dir.c	3
-rw-r--r--	fs/quota/dquot.c	201
-rw-r--r--	fs/quota/quota.c	4
-rw-r--r--	fs/ramfs/file-mmu.c	3
-rw-r--r--	fs/ramfs/file-nommu.c	9
-rw-r--r--	fs/read_write.c	17
-rw-r--r--	fs/reiserfs/dir.c	9
-rw-r--r--	fs/reiserfs/file.c	5
-rw-r--r--	fs/reiserfs/super.c	48
-rw-r--r--	fs/smbfs/dir.c	1
-rw-r--r--	fs/smbfs/file.c	3
-rw-r--r--	fs/smbfs/inode.c	2
-rw-r--r--	fs/splice.c	2
-rw-r--r--	fs/squashfs/Kconfig	11
-rw-r--r--	fs/squashfs/Makefile	2
-rw-r--r--	fs/squashfs/inode.c	92
-rw-r--r--	fs/squashfs/namei.c	6
-rw-r--r--	fs/squashfs/squashfs.h	12
-rw-r--r--	fs/squashfs/squashfs_fs.h	76
-rw-r--r--	fs/squashfs/squashfs_fs_i.h	3
-rw-r--r--	fs/squashfs/squashfs_fs_sb.h	3
-rw-r--r--	fs/squashfs/super.c	30
-rw-r--r--	fs/squashfs/symlink.c	11
-rw-r--r--	fs/squashfs/xattr.c	323
-rw-r--r--	fs/squashfs/xattr.h	46
-rw-r--r--	fs/squashfs/xattr_id.c	100
-rw-r--r--	fs/super.c	17
-rw-r--r--	fs/sync.c	10
-rw-r--r--	fs/sysfs/inode.c	8
-rw-r--r--	fs/sysv/dir.c	2
-rw-r--r--	fs/sysv/file.c	2
-rw-r--r--	fs/sysv/inode.c	1
-rw-r--r--	fs/ubifs/file.c	17
-rw-r--r--	fs/ubifs/ubifs.h	4
-rw-r--r--	fs/udf/balloc.c	43
-rw-r--r--	fs/udf/dir.c	3
-rw-r--r--	fs/udf/file.c	28
-rw-r--r--	fs/udf/ialloc.c	21
-rw-r--r--	fs/udf/inode.c	5
-rw-r--r--	fs/udf/namei.c	20
-rw-r--r--	fs/udf/super.c	13
-rw-r--r--	fs/udf/udfdecl.h	1
-rw-r--r--	fs/ufs/balloc.c	24
-rw-r--r--	fs/ufs/dir.c	2
-rw-r--r--	fs/ufs/file.c	5
-rw-r--r--	fs/ufs/ialloc.c	13
-rw-r--r--	fs/ufs/inode.c	4
-rw-r--r--	fs/ufs/namei.c	16
-rw-r--r--	fs/ufs/super.c	112
-rw-r--r--	fs/ufs/truncate.c	20
-rw-r--r--	fs/ufs/ufs_fs.h	1
-rw-r--r--	fs/xfs/linux-2.6/xfs_aops.c	23
-rw-r--r--	fs/xfs/linux-2.6/xfs_file.c	10
-rw-r--r--	fs/xfs/linux-2.6/xfs_iops.c	16
-rw-r--r--	fs/xfs/linux-2.6/xfs_quotaops.c	1
-rw-r--r--	fs/xfs/linux-2.6/xfs_sync.c	9
-rw-r--r--	fs/xfs/linux-2.6/xfs_trace.c	1
-rw-r--r--	fs/xfs/linux-2.6/xfs_trace.h	356
-rw-r--r--	fs/xfs/quota/xfs_qm.c	4
-rw-r--r--	fs/xfs/xfs_ag.h	1
-rw-r--r--	fs/xfs/xfs_iget.c	29
-rw-r--r--	fs/xfs/xfs_inode.c	144
-rw-r--r--	fs/xfs/xfs_log_recover.c	11
-rw-r--r--	fs/xfs/xfs_mount.c	68
-rw-r--r--	fs/xfs/xfs_rtalloc.c	4
-rw-r--r--	fs/xfs/xfs_rtalloc.h	11
-rw-r--r--	fs/xfs/xfs_trans.c	446
-rw-r--r--	fs/xfs/xfs_trans.h	411
-rw-r--r--	fs/xfs/xfs_vnodeops.c	2
238 files changed, 9711 insertions, 5791 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 25b300e1c9d7..2bedc6c94fc2 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -257,15 +257,13 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	return total;
 }
 
-static int v9fs_file_fsync(struct file *filp, struct dentry *dentry,
-			   int datasync)
+static int v9fs_file_fsync(struct file *filp, int datasync)
 {
 	struct p9_fid *fid;
 	struct p9_wstat wstat;
 	int retval;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "filp %p dentry %p datasync %x\n", filp,
-		   dentry, datasync);
+	P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
 
 	fid = filp->private_data;
 	v9fs_blank_wstat(&wstat);
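
Note on the API change above: this merge switches the ->fsync() file operation from taking (file, dentry, datasync) to just (file, datasync). A filesystem that still needs the inode or dentry can recover both from the struct file, as the affs and afs hunks below do. A minimal sketch of the new shape (foofs_fsync is a hypothetical example, not part of this patch):

	static int foofs_fsync(struct file *file, int datasync)
	{
		/* the inode is reachable through the file's address space ... */
		struct inode *inode = file->f_mapping->host;
		/* ... and the dentry through f_path, if it is still needed */
		struct dentry *dentry = file->f_path.dentry;

		/* filesystem-specific writeback, honouring datasync, goes here */
		return 0;
	}
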
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 23aa52f548a0..f4287e4de744 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -197,7 +197,7 @@ const struct file_operations adfs_dir_operations = {
 	.read		= generic_read_dir,
 	.llseek		= generic_file_llseek,
 	.readdir	= adfs_readdir,
-	.fsync		= simple_fsync,
+	.fsync		= generic_file_fsync,
 };
 
 static int
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 005ea34d1758..a36da5382b40 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -26,7 +26,7 @@ const struct file_operations adfs_file_operations = {
 	.read		= do_sync_read,
 	.aio_read	= generic_file_aio_read,
 	.mmap		= generic_file_mmap,
-	.fsync		= simple_fsync,
+	.fsync		= generic_file_fsync,
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.splice_read	= generic_file_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 0f5e30978135..6f850b06ab62 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -322,8 +322,9 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
 	if (error)
 		goto out;
 
+	/* XXX: this is missing some actual on-disk truncation.. */
 	if (ia_valid & ATTR_SIZE)
-		error = vmtruncate(inode, attr->ia_size);
+		error = simple_setsize(inode, attr->ia_size);
 
 	if (error)
 		goto out;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 861dae68ac12..f05b6155ccc8 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -183,7 +183,7 @@ extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dent
 
 void	affs_free_prealloc(struct inode *inode);
 extern void	affs_truncate(struct inode *);
-int	affs_file_fsync(struct file *, struct dentry *, int);
+int	affs_file_fsync(struct file *, int);
 
 /* dir.c */
 
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 184e55c1c9ba..322710c3eedf 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -916,9 +916,9 @@ affs_truncate(struct inode *inode)
 	affs_free_prealloc(inode);
 }
 
-int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int affs_file_fsync(struct file *filp, int datasync)
 {
-	struct inode * inode = dentry->d_inode;
+	struct inode *inode = filp->f_mapping->host;
 	int ret, err;
 
 	ret = write_inode_now(inode, 0);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d70bbbac6b7b..914d1c0bc07a 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -224,7 +224,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 		affs_brelse(bh);
 		inode = affs_iget(sb, ino);
 		if (IS_ERR(inode))
-			return ERR_PTR(PTR_ERR(inode));
+			return ERR_CAST(inode);
 	}
 	dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
 	d_add(dentry, inode);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 807f284cc75e..5f679b77ce24 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -740,7 +740,7 @@ extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
 extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
 			      unsigned long, loff_t);
 extern int afs_writeback_all(struct afs_vnode *);
-extern int afs_fsync(struct file *, struct dentry *, int);
+extern int afs_fsync(struct file *, int);
 
 
 /*****************************************************************************/
diff --git a/fs/afs/server.c b/fs/afs/server.c
index f49099516675..9fdc7fe3a7bc 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -91,9 +91,10 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
 
 		memcpy(&server->addr, addr, sizeof(struct in_addr));
 		server->addr.s_addr = addr->s_addr;
+		_leave(" = %p{%d}", server, atomic_read(&server->usage));
+	} else {
+		_leave(" = NULL [nomem]");
 	}
-
-	_leave(" = %p{%d}", server, atomic_read(&server->usage));
 	return server;
 }
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3bed54a294d4..3dab9e9948d0 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -701,8 +701,9 @@ int afs_writeback_all(struct afs_vnode *vnode)
  * - the return status from this call provides a reliable indication of
  *   whether any write errors occurred for this process.
  */
-int afs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int afs_fsync(struct file *file, int datasync)
 {
+	struct dentry *dentry = file->f_path.dentry;
 	struct afs_writeback *wb, *xwb;
 	struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
 	int ret;
diff --git a/fs/aio.c b/fs/aio.c
index 1cf12b3dd83a..1ccf25cef1f0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -36,6 +36,7 @@
 #include <linux/blkdev.h>
 #include <linux/mempool.h>
 #include <linux/hash.h>
+#include <linux/compat.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -526,7 +527,7 @@ static void aio_fput_routine(struct work_struct *data)
 
 	/* Complete the fput(s) */
 	if (req->ki_filp != NULL)
-		__fput(req->ki_filp);
+		fput(req->ki_filp);
 
 	/* Link the iocb into the context's free list */
 	spin_lock_irq(&ctx->ctx_lock);
@@ -559,11 +560,11 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 
 	/*
 	 * Try to optimize the aio and eventfd file* puts, by avoiding to
-	 * schedule work in case it is not __fput() time. In normal cases,
+	 * schedule work in case it is not final fput() time. In normal cases,
 	 * we would not be holding the last reference to the file*, so
 	 * this function will be executed w/out any aio kthread wakeup.
 	 */
-	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
+	if (unlikely(!fput_atomic(req->ki_filp))) {
 		get_ioctx(ctx);
 		spin_lock(&fput_lock);
 		list_add(&req->ki_list, &fput_head);
@@ -1384,13 +1385,22 @@ static ssize_t aio_fsync(struct kiocb *iocb)
 	return ret;
 }
 
-static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb)
+static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
 {
 	ssize_t ret;
 
-	ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf,
-				    kiocb->ki_nbytes, 1,
-				    &kiocb->ki_inline_vec, &kiocb->ki_iovec);
+#ifdef CONFIG_COMPAT
+	if (compat)
+		ret = compat_rw_copy_check_uvector(type,
+				(struct compat_iovec __user *)kiocb->ki_buf,
+				kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+				&kiocb->ki_iovec);
+	else
+#endif
+		ret = rw_copy_check_uvector(type,
+				(struct iovec __user *)kiocb->ki_buf,
+				kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+				&kiocb->ki_iovec);
 	if (ret < 0)
 		goto out;
 
@@ -1420,7 +1430,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
  * Performs the initial checks and aio retry method
  * setup for the kiocb at the time of io submission.
  */
-static ssize_t aio_setup_iocb(struct kiocb *kiocb)
+static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
 {
 	struct file *file = kiocb->ki_filp;
 	ssize_t ret = 0;
@@ -1469,7 +1479,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
 		ret = security_file_permission(file, MAY_READ);
 		if (unlikely(ret))
 			break;
-		ret = aio_setup_vectored_rw(READ, kiocb);
+		ret = aio_setup_vectored_rw(READ, kiocb, compat);
 		if (ret)
 			break;
 		ret = -EINVAL;
@@ -1483,7 +1493,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
 		ret = security_file_permission(file, MAY_WRITE);
 		if (unlikely(ret))
 			break;
-		ret = aio_setup_vectored_rw(WRITE, kiocb);
+		ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
 		if (ret)
 			break;
 		ret = -EINVAL;
@@ -1548,7 +1558,8 @@ static void aio_batch_free(struct hlist_head *batch_hash)
 }
 
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-			 struct iocb *iocb, struct hlist_head *batch_hash)
+			 struct iocb *iocb, struct hlist_head *batch_hash,
+			 bool compat)
 {
 	struct kiocb *req;
 	struct file *file;
@@ -1609,7 +1620,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
 	req->ki_opcode = iocb->aio_lio_opcode;
 
-	ret = aio_setup_iocb(req);
+	ret = aio_setup_iocb(req, compat);
 
 	if (ret)
 		goto out_put_req;
@@ -1637,20 +1648,8 @@ out_put_req:
 	return ret;
 }
 
-/* sys_io_submit:
- *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns
- *	the number of iocbs queued.  May return -EINVAL if the aio_context
- *	specified by ctx_id is invalid, if nr is < 0, if the iocb at
- *	*iocbpp[0] is not properly initialized, if the operation specified
- *	is invalid for the file descriptor in the iocb.  May fail with
- *	-EFAULT if any of the data structures point to invalid data.  May
- *	fail with -EBADF if the file descriptor specified in the first
- *	iocb is invalid.  May fail with -EAGAIN if insufficient resources
- *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
- *	fail with -ENOSYS if not implemented.
- */
-SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
-		struct iocb __user * __user *, iocbpp)
+long do_io_submit(aio_context_t ctx_id, long nr,
+		  struct iocb __user *__user *iocbpp, bool compat)
 {
 	struct kioctx *ctx;
 	long ret = 0;
@@ -1687,7 +1686,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
 			break;
 		}
 
-		ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
+		ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat);
 		if (ret)
 			break;
 	}
@@ -1697,6 +1696,24 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
 	return i ? i : ret;
 }
 
+/* sys_io_submit:
+ *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns
+ *	the number of iocbs queued.  May return -EINVAL if the aio_context
+ *	specified by ctx_id is invalid, if nr is < 0, if the iocb at
+ *	*iocbpp[0] is not properly initialized, if the operation specified
+ *	is invalid for the file descriptor in the iocb.  May fail with
+ *	-EFAULT if any of the data structures point to invalid data.  May
+ *	fail with -EBADF if the file descriptor specified in the first
+ *	iocb is invalid.  May fail with -EAGAIN if insufficient resources
+ *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
+ *	fail with -ENOSYS if not implemented.
+ */
+SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
+		struct iocb __user * __user *, iocbpp)
+{
+	return do_io_submit(ctx_id, nr, iocbpp, 0);
+}
+
 /* lookup_kiocb
  *	Finds a given iocb for cancellation.
  */
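
The refactoring above splits the submission path so a 32-bit entry point can share it: do_io_submit() takes an explicit compat flag, and the native syscall passes 0. A sketch of how a compat caller could sit on top of it; the diffstat lists fs/compat.c among the touched files, but its hunks are not shown here, so copy_iocb() and MAX_AIO_SUBMITS below are assumptions standing in for whatever helpers the compat layer provides:

	asmlinkage long compat_sys_io_submit(aio_context_t ctx_id, int nr,
					     u32 __user *iocbpp)
	{
		struct iocb __user * __user *iocb64;
		long ret;

		if (unlikely(nr < 0))
			return -EINVAL;
		if (nr > MAX_AIO_SUBMITS)		/* assumed cap */
			nr = MAX_AIO_SUBMITS;

		/* widen the 32-bit iocb pointers into a native pointer array */
		iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
		ret = copy_iocb(nr, iocbpp, iocb64);	/* assumed helper */
		if (!ret)
			ret = do_io_submit(ctx_id, nr, iocb64, 1);
		return ret;
	}
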
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 9bd4b3876c99..e4b75d6eda83 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -205,7 +205,7 @@ static struct inode *anon_inode_mkinode(void)
 	 * that it already _is_ on the dirty list.
 	 */
 	inode->i_state = I_DIRTY;
-	inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
+	inode->i_mode = S_IRUSR | S_IWUSR;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
 	inode->i_flags |= S_PRIVATE;
diff --git a/fs/attr.c b/fs/attr.c
index 0815e93bb487..b4fa3b0aa596 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -67,14 +67,14 @@ EXPORT_SYMBOL(inode_change_ok);
  * @offset:	the new size to assign to the inode
  * @Returns:	0 on success, -ve errno on failure
  *
+ * inode_newsize_ok must be called with i_mutex held.
+ *
  * inode_newsize_ok will check filesystem limits and ulimits to check that the
  * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
  * when necessary. Caller must not proceed with inode size change if failure is
  * returned. @inode must be a file (not directory), with appropriate
  * permissions to allow truncate (inode_newsize_ok does NOT check these
  * conditions).
- *
- * inode_newsize_ok must be called with i_mutex held.
  */
 int inode_newsize_ok(const struct inode *inode, loff_t offset)
 {
@@ -104,17 +104,25 @@ out_big:
 }
 EXPORT_SYMBOL(inode_newsize_ok);
 
-int inode_setattr(struct inode * inode, struct iattr * attr)
+/**
+ * generic_setattr - copy simple metadata updates into the generic inode
+ * @inode:	the inode to be updated
+ * @attr:	the new attributes
+ *
+ * generic_setattr must be called with i_mutex held.
+ *
+ * generic_setattr updates the inode's metadata with that specified
+ * in attr. Noticably missing is inode size update, which is more complex
+ * as it requires pagecache updates. See simple_setsize.
+ *
+ * The inode is not marked as dirty after this operation. The rationale is
+ * that for "simple" filesystems, the struct inode is the inode storage.
+ * The caller is free to mark the inode dirty afterwards if needed.
+ */
+void generic_setattr(struct inode *inode, const struct iattr *attr)
 {
 	unsigned int ia_valid = attr->ia_valid;
 
-	if (ia_valid & ATTR_SIZE &&
-	    attr->ia_size != i_size_read(inode)) {
-		int error = vmtruncate(inode, attr->ia_size);
-		if (error)
-			return error;
-	}
-
 	if (ia_valid & ATTR_UID)
 		inode->i_uid = attr->ia_uid;
 	if (ia_valid & ATTR_GID)
@@ -135,6 +143,28 @@ int inode_setattr(struct inode * inode, struct iattr * attr)
 		mode &= ~S_ISGID;
 		inode->i_mode = mode;
 	}
+}
+EXPORT_SYMBOL(generic_setattr);
+
+/*
+ * note this function is deprecated, the new truncate sequence should be
+ * used instead -- see eg. simple_setsize, generic_setattr.
+ */
+int inode_setattr(struct inode *inode, const struct iattr *attr)
+{
+	unsigned int ia_valid = attr->ia_valid;
+
+	if (ia_valid & ATTR_SIZE &&
+	    attr->ia_size != i_size_read(inode)) {
+		int error;
+
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	generic_setattr(inode, attr);
+
 	mark_inode_dirty(inode);
 
 	return 0;
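
The attr.c hunks split the old monolithic inode_setattr() into a size-free generic_setattr() plus an explicit truncate step. A sketch of how a "simple" filesystem might wire these up under the new sequence (foofs_setattr is hypothetical; inode_change_ok, simple_setsize, and generic_setattr are the real helpers this patch works with):

	static int foofs_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int error;

		error = inode_change_ok(inode, attr);
		if (error)
			return error;

		if (attr->ia_valid & ATTR_SIZE) {
			/* size changes go through the new truncate path */
			error = simple_setsize(inode, attr->ia_size);
			if (error)
				return error;
		}

		/* everything else is plain metadata */
		generic_setattr(inode, attr);
		mark_inode_dirty(inode);
		return 0;
	}
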
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 8713c7cfbc79..9a0520b50663 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -28,6 +28,7 @@ static int autofs_root_mkdir(struct inode *,struct dentry *,int);
 static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
 
 const struct file_operations autofs_root_operations = {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= autofs_root_readdir,
 	.ioctl		= autofs_root_ioctl,
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index d832062869f6..ba4a38b9c22f 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -95,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
  */
 static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
 {
-	struct autofs_dev_ioctl tmp, *ads;
+	struct autofs_dev_ioctl tmp;
 
 	if (copy_from_user(&tmp, in, sizeof(tmp)))
 		return ERR_PTR(-EFAULT);
@@ -103,16 +103,7 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
 	if (tmp.size < sizeof(tmp))
 		return ERR_PTR(-EINVAL);
 
-	ads = kmalloc(tmp.size, GFP_KERNEL);
-	if (!ads)
-		return ERR_PTR(-ENOMEM);
-
-	if (copy_from_user(ads, in, tmp.size)) {
-		kfree(ads);
-		return ERR_PTR(-EFAULT);
-	}
-
-	return ads;
+	return memdup_user(in, tmp.size);
 }
 
 static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
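
memdup_user(), used above, folds the allocate-then-copy pattern the removed lines spelled out by hand, returning either the new kernel buffer or an ERR_PTR. Roughly, its behaviour (a simplified sketch, not the exact mm/util.c implementation):

	void *memdup_user(const void __user *src, size_t len)
	{
		void *p;

		p = kmalloc(len, GFP_KERNEL);
		if (!p)
			return ERR_PTR(-ENOMEM);

		if (copy_from_user(p, src, len)) {
			kfree(p);
			return ERR_PTR(-EFAULT);
		}
		return p;
	}
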
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index a05287a23f62..52e59bf4aa5f 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -93,8 +93,7 @@ static int bad_file_release(struct inode *inode, struct file *filp)
 	return -EIO;
 }
 
-static int bad_file_fsync(struct file *file, struct dentry *dentry,
-			  int datasync)
+static int bad_file_fsync(struct file *file, int datasync)
 {
 	return -EIO;
 }
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 8f73841fc974..d967e052b779 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -78,7 +78,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
 const struct file_operations bfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= bfs_readdir,
-	.fsync		= simple_fsync,
+	.fsync		= generic_file_fsync,
 	.llseek		= generic_file_llseek,
 };
 
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2c5f9a0e5d72..63039ed9576f 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -990,10 +990,9 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
 
 	/* clear any space allocated but not loaded */
 	if (phdr->p_filesz < phdr->p_memsz) {
-		ret = clear_user((void *) (seg->addr + phdr->p_filesz),
-				 phdr->p_memsz - phdr->p_filesz);
-		if (ret)
-			return ret;
+		if (clear_user((void *) (seg->addr + phdr->p_filesz),
+			       phdr->p_memsz - phdr->p_filesz))
+			return -EFAULT;
 	}
 
 	if (mm) {
@@ -1027,7 +1026,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 	struct elf32_fdpic_loadseg *seg;
 	struct elf32_phdr *phdr;
 	unsigned long load_addr, delta_vaddr;
-	int loop, dvset, ret;
+	int loop, dvset;
 
 	load_addr = params->load_addr;
 	delta_vaddr = 0;
@@ -1127,9 +1126,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		 * PT_LOAD */
 		if (prot & PROT_WRITE && disp > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
-			ret = clear_user((void __user *) maddr, disp);
-			if (ret)
-				return ret;
+			if (clear_user((void __user *) maddr, disp))
+				return -EFAULT;
 			maddr += disp;
 		}
 
@@ -1164,19 +1162,17 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		if (prot & PROT_WRITE && excess1 > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess1);
-			ret = clear_user((void __user *) maddr + phdr->p_filesz,
-					 excess1);
-			if (ret)
-				return ret;
+			if (clear_user((void __user *) maddr + phdr->p_filesz,
+				       excess1))
+				return -EFAULT;
 		}
 
 #else
 		if (excess > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess);
-			ret = clear_user((void *) maddr + phdr->p_filesz, excess);
-			if (ret)
-				return ret;
+			if (clear_user((void *) maddr + phdr->p_filesz, excess))
+				return -EFAULT;
 		}
 #endif
 
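
The pattern behind these fdpic fixes: clear_user() returns the number of bytes it could not clear, not a negative errno, so propagating its return value upward reports a bogus positive "error". The corrected idiom (uaddr and len stand for any user pointer and size):

	/* non-zero means some bytes faulted; report -EFAULT, not the count */
	if (clear_user(uaddr, len))
		return -EFAULT;
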
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 49566c1687d8..b6ab27ccf214 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -56,15 +56,22 @@
 #endif
 
 /*
- * User data (stack, data section and bss) needs to be aligned
- * for the same reasons as SLAB memory is, and to the same amount.
- * Avoid duplicating architecture specific code by using the same
- * macro as with SLAB allocation:
+ * User data (data section and bss) needs to be aligned.
+ * We pick 0x20 here because it is the max value elf2flt has always
+ * used in producing FLAT files, and because it seems to be large
+ * enough to make all the gcc alignment related tests happy.
+ */
+#define FLAT_DATA_ALIGN	(0x20)
+
+/*
+ * User data (stack) also needs to be aligned.
+ * Here we can be a bit looser than the data sections since this
+ * needs to only meet arch ABI requirements.
  */
 #ifdef ARCH_SLAB_MINALIGN
-#define FLAT_DATA_ALIGN	(ARCH_SLAB_MINALIGN)
+#define FLAT_STACK_ALIGN	(ARCH_SLAB_MINALIGN)
 #else
-#define FLAT_DATA_ALIGN	(sizeof(void *))
+#define FLAT_STACK_ALIGN	(sizeof(void *))
 #endif
 
 #define RELOC_FAILED 0xff00ff01	/* Relocation incorrect somewhere */
@@ -129,7 +136,7 @@ static unsigned long create_flat_tables(
 
 	sp = (unsigned long *)p;
 	sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
-	sp = (unsigned long *) ((unsigned long)sp & -FLAT_DATA_ALIGN);
+	sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN);
 	argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
 	envp = argv + (argc + 1);
 
@@ -589,7 +596,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 	if (IS_ERR_VALUE(result)) {
 		printk("Unable to read data+bss, errno %d\n", (int)-result);
 		do_munmap(current->mm, textpos, text_len);
-		do_munmap(current->mm, realdatastart, data_len + extra);
+		do_munmap(current->mm, realdatastart, len);
 		ret = result;
 		goto err;
 	}
@@ -876,7 +883,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	stack_len = TOP_OF_ARGS - bprm->p;             /* the strings */
 	stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */
 	stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */
-	stack_len += FLAT_DATA_ALIGN - 1;  /* reserve for upcoming alignment */
+	stack_len += FLAT_STACK_ALIGN - 1;  /* reserve for upcoming alignment */
 
 	res = load_flat_file(bprm, &libinfo, 0, &stack_len);
 	if (IS_ERR_VALUE(res))
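
On the alignment math used in create_flat_tables() above: masking with the negated alignment rounds an address down to an alignment boundary, so the stack pointer only ever shrinks to meet FLAT_STACK_ALIGN. A worked example, assuming FLAT_STACK_ALIGN is 8:

	/* -8 is ~7, so e.g. 0x1007 & ~7 == 0x1000: rounded down to 8 bytes */
	sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN);
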
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 26e5f5026620..99d6af811747 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -172,8 +172,9 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 
-	return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode),
-				iov, offset, nr_segs, blkdev_get_blocks, NULL);
+	return blockdev_direct_IO_no_locking_newtrunc(rw, iocb, inode,
+				I_BDEV(inode), iov, offset, nr_segs,
+				blkdev_get_blocks, NULL);
 }
 
 int __sync_blockdev(struct block_device *bdev, int wait)
@@ -309,8 +310,8 @@ static int blkdev_write_begin(struct file *file, struct address_space *mapping,
 			struct page **pagep, void **fsdata)
 {
 	*pagep = NULL;
-	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-				blkdev_get_block);
+	return block_write_begin_newtrunc(file, mapping, pos, len, flags,
+				pagep, fsdata, blkdev_get_block);
 }
 
 static int blkdev_write_end(struct file *file, struct address_space *mapping,
@@ -358,12 +359,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
 	return retval;
 }
 
-/*
- * Filp is never NULL; the only case when ->fsync() is called with
- * NULL first argument is nfsd_sync_dir() and that's not a directory.
- */
-
-int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int blkdev_fsync(struct file *filp, int datasync)
 {
 	struct inode *bd_inode = filp->f_mapping->host;
 	struct block_device *bdev = I_BDEV(bd_inode);
@@ -710,8 +706,13 @@ retry:
  * @bdev is about to be opened exclusively.  Check @bdev can be opened
  * exclusively and mark that an exclusive open is in progress.  Each
  * successful call to this function must be matched with a call to
- * either bd_claim() or bd_abort_claiming().  If this function
- * succeeds, the matching bd_claim() is guaranteed to succeed.
+ * either bd_finish_claiming() or bd_abort_claiming() (which do not
+ * fail).
+ *
+ * This function is used to gain exclusive access to the block device
+ * without actually causing other exclusive open attempts to fail. It
+ * should be used when the open sequence itself requires exclusive
+ * access but may subsequently fail.
  *
  * CONTEXT:
  * Might sleep.
@@ -738,6 +739,7 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
 		return ERR_PTR(-ENXIO);
 
 	whole = bdget_disk(disk, 0);
+	module_put(disk->fops->owner);
 	put_disk(disk);
 	if (!whole)
 		return ERR_PTR(-ENOMEM);
@@ -786,15 +788,46 @@ static void bd_abort_claiming(struct block_device *whole, void *holder)
 	__bd_abort_claiming(whole, holder);	/* releases bdev_lock */
 }
 
+/* increment holders when we have a legitimate claim. requires bdev_lock */
+static void __bd_claim(struct block_device *bdev, struct block_device *whole,
+			void *holder)
+{
+	/* note that for a whole device bd_holders
+	 * will be incremented twice, and bd_holder will
+	 * be set to bd_claim before being set to holder
+	 */
+	whole->bd_holders++;
+	whole->bd_holder = bd_claim;
+	bdev->bd_holders++;
+	bdev->bd_holder = holder;
+}
+
+/**
+ * bd_finish_claiming - finish claiming a block device
+ * @bdev: block device of interest (passed to bd_start_claiming())
+ * @whole: whole block device returned by bd_start_claiming()
+ * @holder: holder trying to claim @bdev
+ *
+ * Finish a claiming block started by bd_start_claiming().
+ *
+ * CONTEXT:
+ * Grabs and releases bdev_lock.
+ */
+static void bd_finish_claiming(struct block_device *bdev,
+				struct block_device *whole, void *holder)
+{
+	spin_lock(&bdev_lock);
+	BUG_ON(!bd_may_claim(bdev, whole, holder));
+	__bd_claim(bdev, whole, holder);
+	__bd_abort_claiming(whole, holder); /* not actually an abort */
+}
+
 /**
  * bd_claim - claim a block device
  * @bdev: block device to claim
  * @holder: holder trying to claim @bdev
  *
- * Try to claim @bdev which must have been opened successfully.  This
- * function may be called with or without preceding
- * blk_start_claiming().  In the former case, this function is always
- * successful and terminates the claiming block.
+ * Try to claim @bdev which must have been opened successfully.
  *
  * CONTEXT:
  * Might sleep.
@@ -810,23 +843,10 @@ int bd_claim(struct block_device *bdev, void *holder)
 	might_sleep();
 
 	spin_lock(&bdev_lock);
-
 	res = bd_prepare_to_claim(bdev, whole, holder);
-	if (res == 0) {
-		/* note that for a whole device bd_holders
-		 * will be incremented twice, and bd_holder will
-		 * be set to bd_claim before being set to holder
-		 */
-		whole->bd_holders++;
-		whole->bd_holder = bd_claim;
-		bdev->bd_holders++;
-		bdev->bd_holder = holder;
-	}
-
-	if (whole->bd_claiming)
-		__bd_abort_claiming(whole, holder);	/* releases bdev_lock */
-	else
-		spin_unlock(&bdev_lock);
+	if (res == 0)
+		__bd_claim(bdev, whole, holder);
+	spin_unlock(&bdev_lock);
 
 	return res;
 }
@@ -1480,7 +1500,7 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 
 	if (whole) {
 		if (res == 0)
-			BUG_ON(bd_claim(bdev, filp) != 0);
+			bd_finish_claiming(bdev, whole, filp);
 		else
 			bd_abort_claiming(whole, filp);
 	}
@@ -1716,7 +1736,7 @@ struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *h
 	if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
 		goto out_blkdev_put;
 
-	BUG_ON(bd_claim(bdev, holder) != 0);
+	bd_finish_claiming(bdev, whole, holder);
 	return bdev;
 
 out_blkdev_put:
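
Taken together, the block_dev.c hunks replace the old "bd_claim() after a successful open cannot fail" convention with an explicit three-step sequence. The shape of a claiming open, with error handling trimmed and open_device() standing in for the driver-specific open step (a sketch, not a hunk from this patch):

	whole = bd_start_claiming(bdev, holder);	/* may sleep or fail */
	if (IS_ERR(whole))
		return PTR_ERR(whole);

	res = open_device(bdev);		/* the step that may fail */
	if (res == 0)
		bd_finish_claiming(bdev, whole, holder);	/* cannot fail */
	else
		bd_abort_claiming(whole, holder);
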
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 8d432cd9d580..2222d161c7b6 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,6 +60,8 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 	size = __btrfs_getxattr(inode, name, value, size);
 	if (size > 0) {
 		acl = posix_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return acl;
 		set_cached_acl(inode, type, acl);
 	}
 	kfree(value);
@@ -160,6 +162,12 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
 	int ret;
 	struct posix_acl *acl = NULL;
 
+	if (!is_owner_or_cap(dentry->d_inode))
+		return -EPERM;
+
+	if (!IS_POSIXACL(dentry->d_inode))
+		return -EOPNOTSUPP;
+
 	if (value) {
 		acl = posix_acl_from_xattr(value, size);
 		if (acl == NULL) {
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 462859a30141..7ec14097fef1 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -377,6 +377,7 @@ again:
 		if (!list_empty(&worker->pending) ||
 		    !list_empty(&worker->prio_pending)) {
 			spin_unlock_irq(&worker->lock);
+			set_current_state(TASK_RUNNING);
 			goto again;
 		}
 
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7a4dee199832..6ad63f17eca0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -137,8 +137,8 @@ struct btrfs_inode {
 	 * of extent items we've reserved metadata for.
 	 */
 	spinlock_t accounting_lock;
+	atomic_t outstanding_extents;
 	int reserved_extents;
-	int outstanding_extents;
 
 	/*
 	 * ordered_data_close is set by truncate when a file that used
@@ -151,6 +151,7 @@ struct btrfs_inode {
 	 * of these.
 	 */
 	unsigned ordered_data_close:1;
+	unsigned orphan_meta_reserved:1;
 	unsigned dummy_inode:1;
 
 	/*
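
Making outstanding_extents an atomic_t lets it be adjusted without taking accounting_lock, while orphan_meta_reserved gets a dedicated bit alongside the existing flags. A sketch of the counter idiom this enables (hypothetical helper, assuming typical atomic usage; not a hunk from this patch):

	static void foo_note_reserved_extent(struct inode *inode)
	{
		/* safe without accounting_lock now that the field is atomic */
		atomic_inc(&BTRFS_I(inode)->outstanding_extents);
	}
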
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6795a713b205..0d1d966b0fe4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -280,7 +280,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 				       struct btrfs_root *root,
 				       struct extent_buffer *buf,
-				       struct extent_buffer *cow)
+				       struct extent_buffer *cow,
+				       int *last_ref)
 {
 	u64 refs;
 	u64 owner;
@@ -366,6 +367,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 			BUG_ON(ret);
 		}
 		clean_tree_block(trans, root, buf);
+		*last_ref = 1;
 	}
 	return 0;
 }
@@ -392,6 +394,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	struct btrfs_disk_key disk_key;
 	struct extent_buffer *cow;
 	int level;
+	int last_ref = 0;
 	int unlock_orig = 0;
 	u64 parent_start;
 
@@ -442,7 +445,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			    (unsigned long)btrfs_header_fsid(cow),
 			    BTRFS_FSID_SIZE);
 
-	update_ref_for_cow(trans, root, buf, cow);
+	update_ref_for_cow(trans, root, buf, cow, &last_ref);
+
+	if (root->ref_cows)
+		btrfs_reloc_cow_block(trans, root, buf, cow);
 
 	if (buf == root->node) {
 		WARN_ON(parent && parent != buf);
@@ -457,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		extent_buffer_get(cow);
 		spin_unlock(&root->node_lock);
 
-		btrfs_free_tree_block(trans, root, buf->start, buf->len,
-				parent_start, root->root_key.objectid, level);
+		btrfs_free_tree_block(trans, root, buf, parent_start,
+				      last_ref);
 		free_extent_buffer(buf);
 		add_root_to_dirty_list(root);
 	} else {
@@ -473,8 +479,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		btrfs_set_node_ptr_generation(parent, parent_slot,
 					      trans->transid);
 		btrfs_mark_buffer_dirty(parent);
-		btrfs_free_tree_block(trans, root, buf->start, buf->len,
-				parent_start, root->root_key.objectid, level);
+		btrfs_free_tree_block(trans, root, buf, parent_start,
+				      last_ref);
 	}
 	if (unlock_orig)
 		btrfs_tree_unlock(buf);
@@ -949,6 +955,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 	return bin_search(eb, key, level, slot);
 }
 
+static void root_add_used(struct btrfs_root *root, u32 size)
+{
+	spin_lock(&root->accounting_lock);
+	btrfs_set_root_used(&root->root_item,
+			    btrfs_root_used(&root->root_item) + size);
+	spin_unlock(&root->accounting_lock);
+}
+
+static void root_sub_used(struct btrfs_root *root, u32 size)
+{
+	spin_lock(&root->accounting_lock);
+	btrfs_set_root_used(&root->root_item,
+			    btrfs_root_used(&root->root_item) - size);
+	spin_unlock(&root->accounting_lock);
+}
+
 /* given a node and slot number, this reads the blocks it points to.  The
  * extent buffer is returned with a reference taken (but unlocked).
  * NULL is returned on error.
@@ -1019,7 +1041,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		btrfs_tree_lock(child);
 		btrfs_set_lock_blocking(child);
 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
-		BUG_ON(ret);
+		if (ret) {
+			btrfs_tree_unlock(child);
+			free_extent_buffer(child);
+			goto enospc;
+		}
 
 		spin_lock(&root->node_lock);
 		root->node = child;
@@ -1034,11 +1060,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		btrfs_tree_unlock(mid);
 		/* once for the path */
 		free_extent_buffer(mid);
-		ret = btrfs_free_tree_block(trans, root, mid->start, mid->len,
-					    0, root->root_key.objectid, level);
+
+		root_sub_used(root, mid->len);
+		btrfs_free_tree_block(trans, root, mid, 0, 1);
 		/* once for the root ptr */
 		free_extent_buffer(mid);
-		return ret;
+		return 0;
 	}
 	if (btrfs_header_nritems(mid) >
 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
@@ -1088,23 +1115,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		if (wret < 0 && wret != -ENOSPC)
 			ret = wret;
 		if (btrfs_header_nritems(right) == 0) {
-			u64 bytenr = right->start;
-			u32 blocksize = right->len;
-
 			clean_tree_block(trans, root, right);
 			btrfs_tree_unlock(right);
-			free_extent_buffer(right);
-			right = NULL;
 			wret = del_ptr(trans, root, path, level + 1, pslot +
 				       1);
 			if (wret)
 				ret = wret;
-			wret = btrfs_free_tree_block(trans, root,
-						     bytenr, blocksize, 0,
-						     root->root_key.objectid,
-						     level);
-			if (wret)
-				ret = wret;
+			root_sub_used(root, right->len);
+			btrfs_free_tree_block(trans, root, right, 0, 1);
+			free_extent_buffer(right);
+			right = NULL;
 		} else {
 			struct btrfs_disk_key right_key;
 			btrfs_node_key(right, &right_key, 0);
@@ -1136,21 +1156,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 			BUG_ON(wret == 1);
 		}
 		if (btrfs_header_nritems(mid) == 0) {
-			/* we've managed to empty the middle node, drop it */
-			u64 bytenr = mid->start;
-			u32 blocksize = mid->len;
-
 			clean_tree_block(trans, root, mid);
 			btrfs_tree_unlock(mid);
-			free_extent_buffer(mid);
-			mid = NULL;
 			wret = del_ptr(trans, root, path, level + 1, pslot);
 			if (wret)
 				ret = wret;
-			wret = btrfs_free_tree_block(trans, root, bytenr, blocksize,
-						     0, root->root_key.objectid, level);
-			if (wret)
-				ret = wret;
+			root_sub_used(root, mid->len);
+			btrfs_free_tree_block(trans, root, mid, 0, 1);
+			free_extent_buffer(mid);
+			mid = NULL;
 		} else {
 			/* update the parent key to reflect our changes */
 			struct btrfs_disk_key mid_key;
@@ -1590,7 +1604,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	btrfs_release_path(NULL, p);
 
 	ret = -EAGAIN;
-	tmp = read_tree_block(root, blocknr, blocksize, gen);
+	tmp = read_tree_block(root, blocknr, blocksize, 0);
 	if (tmp) {
 		/*
 		 * If the read above didn't mark this buffer up to date,
@@ -1740,7 +1754,6 @@ again:
 					  p->nodes[level + 1],
 					  p->slots[level + 1], &b);
 			if (err) {
-				free_extent_buffer(b);
 				ret = err;
 				goto done;
 			}
@@ -2076,6 +2089,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	if (IS_ERR(c))
 		return PTR_ERR(c);
 
+	root_add_used(root, root->nodesize);
+
 	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_nritems(c, 1);
 	btrfs_set_header_level(c, level);
@@ -2134,6 +2149,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
 	int nritems;
 
 	BUG_ON(!path->nodes[level]);
+	btrfs_assert_tree_locked(path->nodes[level]);
 	lower = path->nodes[level];
 	nritems = btrfs_header_nritems(lower);
 	BUG_ON(slot > nritems);
@@ -2202,6 +2218,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	if (IS_ERR(split))
 		return PTR_ERR(split);
 
+	root_add_used(root, root->nodesize);
+
 	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_level(split, btrfs_header_level(c));
 	btrfs_set_header_bytenr(split, split->start);
@@ -2415,6 +2433,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 
 	if (left_nritems)
 		btrfs_mark_buffer_dirty(left);
+	else
+		clean_tree_block(trans, root, left);
+
 	btrfs_mark_buffer_dirty(right);
 
 	btrfs_item_key(right, &disk_key, 0);
@@ -2660,6 +2681,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(left);
 	if (right_nritems)
 		btrfs_mark_buffer_dirty(right);
+	else
+		clean_tree_block(trans, root, right);
 
 	btrfs_item_key(right, &disk_key, 0);
 	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
@@ -2669,8 +2692,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] < push_items) {
 		path->slots[0] += old_left_nritems;
-		if (btrfs_header_nritems(path->nodes[0]) == 0)
-			clean_tree_block(trans, root, path->nodes[0]);
 		btrfs_tree_unlock(path->nodes[0]);
 		free_extent_buffer(path->nodes[0]);
 		path->nodes[0] = left;
@@ -2932,10 +2953,10 @@ again:
 	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
 					root->root_key.objectid,
 					&disk_key, 0, l->start, 0);
-	if (IS_ERR(right)) {
-		BUG_ON(1);
+	if (IS_ERR(right))
 		return PTR_ERR(right);
-	}
+
+	root_add_used(root, root->leafsize);
 
 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_bytenr(right, right->start);
@@ -3054,7 +3075,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
 
 	btrfs_set_path_blocking(path);
 	ret = split_leaf(trans, root, &key, path, ins_len, 1);
-	BUG_ON(ret);
+	if (ret)
+		goto err;
 
 	path->keep_locks = 0;
 	btrfs_unlock_up_safe(path, 1);
@@ -3796,9 +3818,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
 	 */
 	btrfs_unlock_up_safe(path, 0);
 
-	ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len,
-				    0, root->root_key.objectid, 0);
-	return ret;
+	root_sub_used(root, leaf->len);
+
+	btrfs_free_tree_block(trans, root, leaf, 0, 1);
+	return 0;
 }
 /*
  * delete the item at the leaf level in path. If that empties
@@ -3865,6 +3888,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		if (leaf == root->node) {
 			btrfs_set_header_level(leaf, 0);
 		} else {
+			btrfs_set_path_blocking(path);
+			clean_tree_block(trans, root, leaf);
 			ret = btrfs_del_leaf(trans, root, path, leaf);
 			BUG_ON(ret);
 		}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 746a7248678e..29c20092847e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -34,6 +34,7 @@
 
 struct btrfs_trans_handle;
 struct btrfs_transaction;
+struct btrfs_pending_snapshot;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
 extern struct kmem_cache *btrfs_transaction_cachep;
 extern struct kmem_cache *btrfs_bit_radix_cachep;
@@ -663,6 +664,7 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
 #define BTRFS_BLOCK_GROUP_DUP	   (1 << 5)
 #define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
+#define BTRFS_NR_RAID_TYPES	   5
 
 struct btrfs_block_group_item {
 	__le64 used;
@@ -674,42 +676,46 @@ struct btrfs_space_info {
 	u64 flags;
 
 	u64 total_bytes;	/* total bytes in the space */
-	u64 bytes_used;		/* total bytes used on disk */
+	u64 bytes_used;		/* total bytes used,
+				   this does't take mirrors into account */
 	u64 bytes_pinned;	/* total bytes pinned, will be freed when the
 				   transaction finishes */
 	u64 bytes_reserved;	/* total bytes the allocator has reserved for
 				   current allocations */
 	u64 bytes_readonly;	/* total bytes that are read only */
-	u64 bytes_super;	/* total bytes reserved for the super blocks */
-	u64 bytes_root;		/* the number of bytes needed to commit a
-				   transaction */
+
 	u64 bytes_may_use;	/* number of bytes that may be used for
 				   delalloc/allocations */
-	u64 bytes_delalloc;	/* number of bytes currently reserved for
-				   delayed allocation */
+	u64 disk_used;		/* total bytes used on disk */
 
 	int full;		/* indicates that we cannot allocate any more
 				   chunks for this space */
 	int force_alloc;	/* set if we need to force a chunk alloc for
 				   this space */
-	int force_delalloc;	/* make people start doing filemap_flush until
-				   we're under a threshold */
 
 	struct list_head list;
 
-	/* for controlling how we free up space for allocations */
-	wait_queue_head_t allocate_wait;
-	wait_queue_head_t flush_wait;
-	int allocating_chunk;
-	int flushing;
-
 	/* for block groups in our same type */
-	struct list_head block_groups;
+	struct list_head block_groups[BTRFS_NR_RAID_TYPES];
 	spinlock_t lock;
 	struct rw_semaphore groups_sem;
 	atomic_t caching_threads;
 };
 
+struct btrfs_block_rsv {
+	u64 size;
+	u64 reserved;
+	u64 freed[2];
+	struct btrfs_space_info *space_info;
+	struct list_head list;
+	spinlock_t lock;
+	atomic_t usage;
+	unsigned int priority:8;
+	unsigned int durable:1;
+	unsigned int refill_used:1;
+	unsigned int full:1;
+};
+
 /*
  * free clusters are used to claim free space in relatively large chunks,
  * allowing us to do less seeky writes. They are used for all metadata
@@ -760,6 +766,7 @@ struct btrfs_block_group_cache {
 	spinlock_t lock;
 	u64 pinned;
 	u64 reserved;
+	u64 reserved_pinned;
 	u64 bytes_super;
 	u64 flags;
 	u64 sectorsize;
@@ -825,6 +832,22 @@ struct btrfs_fs_info {
 	/* logical->physical extent mapping */
 	struct btrfs_mapping_tree mapping_tree;
 
+	/* block reservation for extent, checksum and root tree */
+	struct btrfs_block_rsv global_block_rsv;
+	/* block reservation for delay allocation */
+	struct btrfs_block_rsv delalloc_block_rsv;
+	/* block reservation for metadata operations */
+	struct btrfs_block_rsv trans_block_rsv;
+	/* block reservation for chunk tree */
+	struct btrfs_block_rsv chunk_block_rsv;
+
+	struct btrfs_block_rsv empty_block_rsv;
+
+	/* list of block reservations that cross multiple transactions */
+	struct list_head durable_block_rsv_list;
+
+	struct mutex durable_block_rsv_mutex;
+
 	u64 generation;
 	u64 last_trans_committed;
 
@@ -927,7 +950,6 @@ struct btrfs_fs_info {
 	struct btrfs_workers endio_meta_write_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers submit_workers;
-	struct btrfs_workers enospc_workers;
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
 	 * the cow mechanism and make them safe to write. It happens
@@ -943,6 +965,7 @@ struct btrfs_fs_info {
 	int do_barriers;
 	int closing;
 	int log_root_recovering;
+	int enospc_unlink;
 
 	u64 total_pinned;
 
@@ -1012,6 +1035,9 @@ struct btrfs_root {
 	struct completion kobj_unregister;
 	struct mutex objectid_mutex;
 
+	spinlock_t accounting_lock;
+	struct btrfs_block_rsv *block_rsv;
+
 	struct mutex log_mutex;
 	wait_queue_head_t log_writer_wait;
 	wait_queue_head_t log_commit_wait[2];
@@ -1043,7 +1069,6 @@ struct btrfs_root {
 	int ref_cows;
 	int track_dirty;
 	int in_radix;
-	int clean_orphans;
 
 	u64 defrag_trans_start;
 	struct btrfs_key defrag_progress;
@@ -1057,8 +1082,11 @@ struct btrfs_root {
 
 	struct list_head root_list;
 
-	spinlock_t list_lock;
+	spinlock_t orphan_lock;
 	struct list_head orphan_list;
+	struct btrfs_block_rsv *orphan_block_rsv;
+	int orphan_item_inserted;
+	int orphan_cleanup_state;
 
 	spinlock_t inode_lock;
 	/* red-black tree that keeps track of in-memory inodes */
@@ -1965,6 +1993,9 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 bytenr,
+			     u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num, int reserved);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1984,10 +2015,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					    u64 parent, u64 root_objectid,
 					    struct btrfs_disk_key *key, int level,
 					    u64 hint, u64 empty_size);
-int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
-			  u64 bytenr, u32 blocksize,
-			  u64 parent, u64 root_objectid, int level);
+			  struct extent_buffer *buf,
+			  u64 parent, int last_ref);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 						struct btrfs_root *root,
 						u64 bytenr, u32 blocksize,
@@ -2041,27 +2072,49 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 				   u64 size);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start);
-int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
-				struct btrfs_block_group_cache *group);
-
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
-
-int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
-int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
-int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
-					  struct inode *inode, int num_items);
-int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
-					struct inode *inode, int num_items);
-int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
-				u64 bytes);
-void btrfs_free_reserved_data_space(struct btrfs_root *root,
-				    struct inode *inode, u64 bytes);
-void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
-				  u64 bytes);
-void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
-			       u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
+void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
+int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 int num_items, int *retries);
+void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root);
+int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
+				  struct inode *inode);
+void btrfs_orphan_release_metadata(struct inode *inode);
+int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
+				struct btrfs_pending_snapshot *pending);
+int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
+void btrfs_free_block_rsv(struct btrfs_root *root,
+			  struct btrfs_block_rsv *rsv);
+void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
+				 struct btrfs_block_rsv *rsv);
+int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv,
+			u64 num_bytes, int *retries);
+int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct btrfs_block_rsv *block_rsv,
+			  u64 min_reserved, int min_factor);
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+			    struct btrfs_block_rsv *dst_rsv,
+			    u64 num_bytes);
+void btrfs_block_rsv_release(struct btrfs_root *root,
+			     struct btrfs_block_rsv *block_rsv,
+			     u64 num_bytes);
+int btrfs_set_block_group_ro(struct btrfs_root *root,
+			     struct btrfs_block_group_cache *cache);
+int btrfs_set_block_group_rw(struct btrfs_root *root,
+			     struct btrfs_block_group_cache *cache);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -2152,7 +2205,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref);
+int btrfs_drop_snapshot(struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv, int update_ref);
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
 		       struct extent_buffer *node,
@@ -2245,6 +2299,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
 			   u64 inode_objectid, u64 ref_objectid, u64 *index);
+struct btrfs_inode_ref *
+btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct btrfs_path *path,
+			const char *name, int name_len,
+			u64 inode_objectid, u64 ref_objectid, int mod);
 int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid);
@@ -2257,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, u64 bytenr, u64 len);
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			  struct bio *bio, u32 *dst);
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+			      struct bio *bio, u64 logical_offset, u32 *dst);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
@@ -2311,6 +2373,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state);
 int btrfs_writepages(struct address_space *mapping,
@@ -2349,10 +2412,20 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 void btrfs_orphan_cleanup(struct btrfs_root *root);
+void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
+				struct btrfs_pending_snapshot *pending,
+				u64 *bytes_to_reserve);
+void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
+				struct btrfs_pending_snapshot *pending);
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t size);
 int btrfs_invalidate_inodes(struct btrfs_root *root);
 void btrfs_add_delayed_iput(struct inode *inode);
 void btrfs_run_delayed_iputs(struct btrfs_root *root);
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+			      u64 start, u64 num_bytes, u64 min_size,
+			      loff_t actual_len, u64 *alloc_hint);
 extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
@@ -2361,7 +2434,7 @@ void btrfs_update_iflags(struct inode *inode);
 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
 
 /* file.c */
-int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
+int btrfs_sync_file(struct file *file, int datasync);
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			    int skip_pinned);
 int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
@@ -2409,4 +2482,12 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root);
 int btrfs_recover_relocation(struct btrfs_root *root);
 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
+void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, struct extent_buffer *buf,
+			   struct extent_buffer *cow);
+void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
+			      struct btrfs_pending_snapshot *pending,
+			      u64 *bytes_to_reserve);
+void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+			       struct btrfs_pending_snapshot *pending);
 #endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 902ce507c4e3..e807b143b857 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -319,107 +319,6 @@ out:
 }
 
 /*
- * helper function to lookup reference count and flags of extent.
- *
- * the head node for delayed ref is used to store the sum of all the
- * reference count modifications queued up in the rbtree. the head
- * node may also store the extent flags to set. This way you can check
- * to see what the reference count and extent flags would be if all of
- * the delayed refs are not processed.
- */
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, u64 bytenr,
-			     u64 num_bytes, u64 *refs, u64 *flags)
-{
-	struct btrfs_delayed_ref_node *ref;
-	struct btrfs_delayed_ref_head *head;
-	struct btrfs_delayed_ref_root *delayed_refs;
-	struct btrfs_path *path;
-	struct btrfs_extent_item *ei;
-	struct extent_buffer *leaf;
-	struct btrfs_key key;
-	u32 item_size;
-	u64 num_refs;
-	u64 extent_flags;
-	int ret;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	key.objectid = bytenr;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
-	key.offset = num_bytes;
-	delayed_refs = &trans->transaction->delayed_refs;
-again:
-	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
-				&key, path, 0, 0);
-	if (ret < 0)
-		goto out;
-
-	if (ret == 0) {
-		leaf = path->nodes[0];
-		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
-		if (item_size >= sizeof(*ei)) {
-			ei = btrfs_item_ptr(leaf, path->slots[0],
-					    struct btrfs_extent_item);
-			num_refs = btrfs_extent_refs(leaf, ei);
-			extent_flags = btrfs_extent_flags(leaf, ei);
-		} else {
-#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
-			struct btrfs_extent_item_v0 *ei0;
-			BUG_ON(item_size != sizeof(*ei0));
-			ei0 = btrfs_item_ptr(leaf, path->slots[0],
-					     struct btrfs_extent_item_v0);
-			num_refs = btrfs_extent_refs_v0(leaf, ei0);
-			/* FIXME: this isn't correct for data */
-			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
-#else
-			BUG();
-#endif
-		}
-		BUG_ON(num_refs == 0);
-	} else {
-		num_refs = 0;
-		extent_flags = 0;
-		ret = 0;
-	}
-
-	spin_lock(&delayed_refs->lock);
-	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
-	if (ref) {
-		head = btrfs_delayed_node_to_head(ref);
-		if (!mutex_trylock(&head->mutex)) {
-			atomic_inc(&ref->refs);
-			spin_unlock(&delayed_refs->lock);
-
-			btrfs_release_path(root->fs_info->extent_root, path);
-
-			mutex_lock(&head->mutex);
-			mutex_unlock(&head->mutex);
-			btrfs_put_delayed_ref(ref);
-			goto again;
-		}
-		if (head->extent_op && head->extent_op->update_flags)
-			extent_flags |= head->extent_op->flags_to_set;
-		else
-			BUG_ON(num_refs == 0);
-
-		num_refs += ref->ref_mod;
-		mutex_unlock(&head->mutex);
-	}
-	WARN_ON(num_refs == 0);
-	if (refs)
-		*refs = num_refs;
-	if (flags)
-		*flags = extent_flags;
-out:
-	spin_unlock(&delayed_refs->lock);
-	btrfs_free_path(path);
-	return ret;
-}
-
-/*
  * helper function to update an extent delayed ref in the
  * rbtree. existing and update must both have the same
  * bytenr and parent
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f6fc67ddad36..50e3cf92fbda 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -167,9 +167,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, u64 bytenr,
-			     u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
 			  u64 bytenr, u64 num_bytes, u64 orig_parent,
 			  u64 parent, u64 orig_ref_root, u64 ref_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index feca04197d02..34f7c375567e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -74,6 +74,11 @@ struct async_submit_bio {
 	int rw;
 	int mirror_num;
 	unsigned long bio_flags;
+	/*
+	 * bio_offset is optional, can be used if the pages in the bio
+	 * can't tell us where in the file the bio should go
+	 */
+	u64 bio_offset;
 	struct btrfs_work work;
 };
 
@@ -534,7 +539,8 @@ static void run_one_async_start(struct btrfs_work *work)
 	async = container_of(work, struct async_submit_bio, work);
 	fs_info = BTRFS_I(async->inode)->root->fs_info;
 	async->submit_bio_start(async->inode, async->rw, async->bio,
-			       async->mirror_num, async->bio_flags);
+			       async->mirror_num, async->bio_flags,
+			       async->bio_offset);
 }
 
 static void run_one_async_done(struct btrfs_work *work)
@@ -556,7 +562,8 @@ static void run_one_async_done(struct btrfs_work *work)
 	wake_up(&fs_info->async_submit_wait);
 
 	async->submit_bio_done(async->inode, async->rw, async->bio,
-			       async->mirror_num, async->bio_flags);
+			       async->mirror_num, async->bio_flags,
+			       async->bio_offset);
 }
 
 static void run_one_async_free(struct btrfs_work *work)
@@ -570,6 +577,7 @@ static void run_one_async_free(struct btrfs_work *work)
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
 			unsigned long bio_flags,
+			u64 bio_offset,
 			extent_submit_bio_hook_t *submit_bio_start,
 			extent_submit_bio_hook_t *submit_bio_done)
 {
@@ -592,6 +600,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 
 	async->work.flags = 0;
 	async->bio_flags = bio_flags;
+	async->bio_offset = bio_offset;
 
 	atomic_inc(&fs_info->nr_async_submits);
 
@@ -627,7 +636,8 @@ static int btree_csum_one_bio(struct bio *bio)
 
 static int __btree_submit_bio_start(struct inode *inode, int rw,
 				    struct bio *bio, int mirror_num,
-				    unsigned long bio_flags)
+				    unsigned long bio_flags,
+				    u64 bio_offset)
 {
 	/*
 	 * when we're called for a write, we're already in the async
@@ -638,7 +648,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
 }
 
 static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
-				 int mirror_num, unsigned long bio_flags)
+				 int mirror_num, unsigned long bio_flags,
+				 u64 bio_offset)
 {
 	/*
 	 * when we're called for a write, we're already in the async
@@ -648,7 +659,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 }
 
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-				 int mirror_num, unsigned long bio_flags)
+				 int mirror_num, unsigned long bio_flags,
+				 u64 bio_offset)
 {
 	int ret;
 
@@ -671,6 +683,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	 */
 	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num, 0,
+				   bio_offset,
 				   __btree_submit_bio_start,
 				   __btree_submit_bio_done);
 }
@@ -894,7 +907,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->ref_cows = 0;
 	root->track_dirty = 0;
 	root->in_radix = 0;
-	root->clean_orphans = 0;
+	root->orphan_item_inserted = 0;
+	root->orphan_cleanup_state = 0;
 
 	root->fs_info = fs_info;
 	root->objectid = objectid;
@@ -903,13 +917,16 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->name = NULL;
 	root->in_sysfs = 0;
 	root->inode_tree = RB_ROOT;
+	root->block_rsv = NULL;
+	root->orphan_block_rsv = NULL;
 
 	INIT_LIST_HEAD(&root->dirty_list);
 	INIT_LIST_HEAD(&root->orphan_list);
 	INIT_LIST_HEAD(&root->root_list);
 	spin_lock_init(&root->node_lock);
-	spin_lock_init(&root->list_lock);
+	spin_lock_init(&root->orphan_lock);
 	spin_lock_init(&root->inode_lock);
+	spin_lock_init(&root->accounting_lock);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
 	init_waitqueue_head(&root->log_writer_wait);
@@ -968,42 +985,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 	return 0;
 }
 
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
-			     struct btrfs_fs_info *fs_info)
-{
-	struct extent_buffer *eb;
-	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
-	u64 start = 0;
-	u64 end = 0;
-	int ret;
-
-	if (!log_root_tree)
-		return 0;
-
-	while (1) {
-		ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
-				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
-		if (ret)
-			break;
-
-		clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
-				  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
-	}
-	eb = fs_info->log_root_tree->node;
-
-	WARN_ON(btrfs_header_level(eb) != 0);
-	WARN_ON(btrfs_header_nritems(eb) != 0);
-
-	ret = btrfs_free_reserved_extent(fs_info->tree_root,
-					 eb->start, eb->len);
-	BUG_ON(ret);
-
-	free_extent_buffer(eb);
-	kfree(fs_info->log_root_tree);
-	fs_info->log_root_tree = NULL;
-	return 0;
-}
-
 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info)
 {
@@ -1191,19 +1172,23 @@ again:
 	if (root)
 		return root;
 
-	ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
-	if (ret == 0)
-		ret = -ENOENT;
-	if (ret < 0)
-		return ERR_PTR(ret);
-
 	root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
 	if (IS_ERR(root))
 		return root;
 
-	WARN_ON(btrfs_root_refs(&root->root_item) == 0);
 	set_anon_super(&root->anon_super, NULL);
 
+	if (btrfs_root_refs(&root->root_item) == 0) {
+		ret = -ENOENT;
+		goto fail;
+	}
+
+	ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
+	if (ret < 0)
+		goto fail;
+	if (ret == 0)
+		root->orphan_item_inserted = 1;
+
 	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
 	if (ret)
 		goto fail;
@@ -1212,10 +1197,9 @@ again:
 	ret = radix_tree_insert(&fs_info->fs_roots_radix,
 				(unsigned long)root->root_key.objectid,
 				root);
-	if (ret == 0) {
+	if (ret == 0)
 		root->in_radix = 1;
-		root->clean_orphans = 1;
-	}
+
 	spin_unlock(&fs_info->fs_roots_radix_lock);
 	radix_tree_preload_end();
 	if (ret) {
@@ -1461,10 +1445,6 @@ static int cleaner_kthread(void *arg)
 	struct btrfs_root *root = arg;
 
 	do {
-		smp_mb();
-		if (root->fs_info->closing)
-			break;
-
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 
 		if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
@@ -1477,11 +1457,9 @@ static int cleaner_kthread(void *arg)
 		if (freezing(current)) {
 			refrigerator();
 		} else {
-			smp_mb();
-			if (root->fs_info->closing)
-				break;
 			set_current_state(TASK_INTERRUPTIBLE);
-			schedule();
+			if (!kthread_should_stop())
+				schedule();
 			__set_current_state(TASK_RUNNING);
 		}
 	} while (!kthread_should_stop());
@@ -1493,36 +1471,40 @@ static int transaction_kthread(void *arg)
 	struct btrfs_root *root = arg;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_transaction *cur;
+	u64 transid;
 	unsigned long now;
 	unsigned long delay;
 	int ret;
 
 	do {
-		smp_mb();
-		if (root->fs_info->closing)
-			break;
-
 		delay = HZ * 30;
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 		mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
-		mutex_lock(&root->fs_info->trans_mutex);
+		spin_lock(&root->fs_info->new_trans_lock);
 		cur = root->fs_info->running_transaction;
 		if (!cur) {
-			mutex_unlock(&root->fs_info->trans_mutex);
+			spin_unlock(&root->fs_info->new_trans_lock);
 			goto sleep;
 		}
 
 		now = get_seconds();
-		if (now < cur->start_time || now - cur->start_time < 30) {
-			mutex_unlock(&root->fs_info->trans_mutex);
+		if (!cur->blocked &&
+		    (now < cur->start_time || now - cur->start_time < 30)) {
+			spin_unlock(&root->fs_info->new_trans_lock);
 			delay = HZ * 5;
 			goto sleep;
 		}
-		mutex_unlock(&root->fs_info->trans_mutex);
-		trans = btrfs_start_transaction(root, 1);
-		ret = btrfs_commit_transaction(trans, root);
+		transid = cur->transid;
+		spin_unlock(&root->fs_info->new_trans_lock);
 
+		trans = btrfs_join_transaction(root, 1);
+		if (transid == trans->transid) {
+			ret = btrfs_commit_transaction(trans, root);
+			BUG_ON(ret);
+		} else {
+			btrfs_end_transaction(trans, root);
+		}
 sleep:
 		wake_up_process(root->fs_info->cleaner_kthread);
 		mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1530,10 +1512,10 @@ sleep:
 		if (freezing(current)) {
 			refrigerator();
 		} else {
-			if (root->fs_info->closing)
-				break;
 			set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(delay);
+			if (!kthread_should_stop() &&
+			    !btrfs_transaction_blocked(root->fs_info))
+				schedule_timeout(delay);
 			__set_current_state(TASK_RUNNING);
 		}
 	} while (!kthread_should_stop());
@@ -1620,6 +1602,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
+	btrfs_init_block_rsv(&fs_info->global_block_rsv);
+	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
+	btrfs_init_block_rsv(&fs_info->trans_block_rsv);
+	btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
+	btrfs_init_block_rsv(&fs_info->empty_block_rsv);
+	INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
+	mutex_init(&fs_info->durable_block_rsv_mutex);
 	atomic_set(&fs_info->nr_async_submits, 0);
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
@@ -1759,9 +1748,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			   min_t(u64, fs_devices->num_devices,
 			   fs_info->thread_pool_size),
 			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->enospc_workers, "enospc",
-			   fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
 
 	/* a higher idle thresh on the submit workers makes it much more
 	 * likely that bios will be send down in a sane order to the
@@ -1809,7 +1795,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->endio_meta_workers, 1);
 	btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
 	btrfs_start_workers(&fs_info->endio_write_workers, 1);
-	btrfs_start_workers(&fs_info->enospc_workers, 1);
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1912,17 +1897,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	csum_root->track_dirty = 1;
 
+	fs_info->generation = generation;
+	fs_info->last_trans_committed = generation;
+	fs_info->data_alloc_profile = (u64)-1;
+	fs_info->metadata_alloc_profile = (u64)-1;
+	fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+
 	ret = btrfs_read_block_groups(extent_root);
 	if (ret) {
 		printk(KERN_ERR "Failed to read block groups: %d\n", ret);
 		goto fail_block_groups;
 	}
 
-	fs_info->generation = generation;
-	fs_info->last_trans_committed = generation;
-	fs_info->data_alloc_profile = (u64)-1;
-	fs_info->metadata_alloc_profile = (u64)-1;
-	fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
 	if (IS_ERR(fs_info->cleaner_kthread))
@@ -1955,8 +1941,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					      btrfs_level_size(tree_root,
 					      btrfs_super_log_root_level(disk_super));
 
-		log_tree_root = kzalloc(sizeof(struct btrfs_root),
-					GFP_NOFS);
+		log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+		if (!log_tree_root) {
+			err = -ENOMEM;
+			goto fail_trans_kthread;
+		}
 
 		__setup_root(nodesize, leafsize, sectorsize, stripesize,
 			     log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
@@ -1977,6 +1966,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		BUG_ON(ret);
 
 		if (!(sb->s_flags & MS_RDONLY)) {
+			ret = btrfs_cleanup_fs_roots(fs_info);
+			BUG_ON(ret);
+
 			ret = btrfs_recover_relocation(tree_root);
 			if (ret < 0) {
 				printk(KERN_WARNING
@@ -1993,6 +1985,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
 	if (!fs_info->fs_root)
 		goto fail_trans_kthread;
+	if (IS_ERR(fs_info->fs_root)) {
+		err = PTR_ERR(fs_info->fs_root);
+		goto fail_trans_kthread;
+	}
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		down_read(&fs_info->cleanup_work_sem);
@@ -2040,7 +2036,6 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
-	btrfs_stop_workers(&fs_info->enospc_workers);
 fail_iput:
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
@@ -2405,11 +2400,11 @@ int btrfs_commit_super(struct btrfs_root *root)
 	down_write(&root->fs_info->cleanup_work_sem);
 	up_write(&root->fs_info->cleanup_work_sem);
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_join_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 	BUG_ON(ret);
 	/* run commit again to drop the original snapshot */
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_join_transaction(root, 1);
 	btrfs_commit_transaction(trans, root);
 	ret = btrfs_write_and_wait_transaction(NULL, root);
 	BUG_ON(ret);
@@ -2426,15 +2421,15 @@ int close_ctree(struct btrfs_root *root)
 	fs_info->closing = 1;
 	smp_mb();
 
-	kthread_stop(root->fs_info->transaction_kthread);
-	kthread_stop(root->fs_info->cleaner_kthread);
-
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_commit_super(root);
 		if (ret)
 			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
 
+	kthread_stop(root->fs_info->transaction_kthread);
+	kthread_stop(root->fs_info->cleaner_kthread);
+
 	fs_info->closing = 2;
 	smp_mb();
 
@@ -2473,7 +2468,6 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
-	btrfs_stop_workers(&fs_info->enospc_workers);
 
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c958ecbc1916..88e825a0bf21 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,7 +87,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata);
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
-			unsigned long bio_flags,
+			unsigned long bio_flags, u64 bio_offset,
 			extent_submit_bio_hook_t *submit_bio_start,
 			extent_submit_bio_hook_t *submit_bio_done);
 
@@ -95,8 +95,6 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
 int btrfs_write_tree_block(struct extent_buffer *buf);
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
-			     struct btrfs_fs_info *fs_info);
 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info);
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c6a4f459ad76..32d094002a57 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -35,10 +35,9 @@
 
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
-			      u64 bytenr, u64 num_bytes, int alloc,
-			      int mark_free);
-static int update_reserved_extents(struct btrfs_block_group_cache *cache,
-				   u64 num_bytes, int reserve);
+			      u64 bytenr, u64 num_bytes, int alloc);
+static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
+				 u64 num_bytes, int reserve, int sinfo);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       u64 bytenr, u64 num_bytes, u64 parent,
@@ -61,12 +60,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags, int force);
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  struct btrfs_path *path,
-			  u64 bytenr, u64 num_bytes,
-			  int is_data, int reserved,
-			  struct extent_buffer **must_clean);
 static int find_next_key(struct btrfs_path *path, int level,
 			 struct btrfs_key *key);
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -91,8 +84,12 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 {
-	if (atomic_dec_and_test(&cache->count))
+	if (atomic_dec_and_test(&cache->count)) {
+		WARN_ON(cache->pinned > 0);
+		WARN_ON(cache->reserved > 0);
+		WARN_ON(cache->reserved_pinned > 0);
 		kfree(cache);
+	}
 }
 
 /*
@@ -319,7 +316,7 @@ static int caching_kthread(void *data)
 
 	exclude_super_stripes(extent_root, block_group);
 	spin_lock(&block_group->space_info->lock);
-	block_group->space_info->bytes_super += block_group->bytes_super;
+	block_group->space_info->bytes_readonly += block_group->bytes_super;
 	spin_unlock(&block_group->space_info->lock);
 
 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -507,6 +504,9 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 	struct list_head *head = &info->space_info;
 	struct btrfs_space_info *found;
 
+	flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
+		 BTRFS_BLOCK_GROUP_METADATA;
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(found, head, list) {
 		if (found->flags == flags) {
@@ -610,6 +610,113 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
610} 610}
611 611
612/* 612/*
613 * helper function to lookup reference count and flags of extent.
614 *
615 * the head node for delayed ref is used to store the sum of all the
616 * reference count modifications queued up in the rbtree. the head
617 * node may also store the extent flags to set. This way you can check
618 * to see what the reference count and extent flags would be if all of
619 * the delayed refs are not processed.
620 */
621int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
622 struct btrfs_root *root, u64 bytenr,
623 u64 num_bytes, u64 *refs, u64 *flags)
624{
625 struct btrfs_delayed_ref_head *head;
626 struct btrfs_delayed_ref_root *delayed_refs;
627 struct btrfs_path *path;
628 struct btrfs_extent_item *ei;
629 struct extent_buffer *leaf;
630 struct btrfs_key key;
631 u32 item_size;
632 u64 num_refs;
633 u64 extent_flags;
634 int ret;
635
636 path = btrfs_alloc_path();
637 if (!path)
638 return -ENOMEM;
639
640 key.objectid = bytenr;
641 key.type = BTRFS_EXTENT_ITEM_KEY;
642 key.offset = num_bytes;
643 if (!trans) {
644 path->skip_locking = 1;
645 path->search_commit_root = 1;
646 }
647again:
648 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
649 &key, path, 0, 0);
650 if (ret < 0)
651 goto out_free;
652
653 if (ret == 0) {
654 leaf = path->nodes[0];
655 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
656 if (item_size >= sizeof(*ei)) {
657 ei = btrfs_item_ptr(leaf, path->slots[0],
658 struct btrfs_extent_item);
659 num_refs = btrfs_extent_refs(leaf, ei);
660 extent_flags = btrfs_extent_flags(leaf, ei);
661 } else {
662#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
663 struct btrfs_extent_item_v0 *ei0;
664 BUG_ON(item_size != sizeof(*ei0));
665 ei0 = btrfs_item_ptr(leaf, path->slots[0],
666 struct btrfs_extent_item_v0);
667 num_refs = btrfs_extent_refs_v0(leaf, ei0);
668 /* FIXME: this isn't correct for data */
669 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
670#else
671 BUG();
672#endif
673 }
674 BUG_ON(num_refs == 0);
675 } else {
676 num_refs = 0;
677 extent_flags = 0;
678 ret = 0;
679 }
680
681 if (!trans)
682 goto out;
683
684 delayed_refs = &trans->transaction->delayed_refs;
685 spin_lock(&delayed_refs->lock);
686 head = btrfs_find_delayed_ref_head(trans, bytenr);
687 if (head) {
688 if (!mutex_trylock(&head->mutex)) {
689 atomic_inc(&head->node.refs);
690 spin_unlock(&delayed_refs->lock);
691
692 btrfs_release_path(root->fs_info->extent_root, path);
693
694 mutex_lock(&head->mutex);
695 mutex_unlock(&head->mutex);
696 btrfs_put_delayed_ref(&head->node);
697 goto again;
698 }
699 if (head->extent_op && head->extent_op->update_flags)
700 extent_flags |= head->extent_op->flags_to_set;
701 else
702 BUG_ON(num_refs == 0);
703
704 num_refs += head->node.ref_mod;
705 mutex_unlock(&head->mutex);
706 }
707 spin_unlock(&delayed_refs->lock);
708out:
709 WARN_ON(num_refs == 0);
710 if (refs)
711 *refs = num_refs;
712 if (flags)
713 *flags = extent_flags;
714out_free:
715 btrfs_free_path(path);
716 return ret;
717}
718
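The comment above describes the trick: the effective reference count is the on-disk count plus the signed sum (ref_mod) kept in the delayed-ref head, and a pending flag update is OR'd in the same way. A minimal userspace sketch of that summation (toy types, not the kernel structures):

#include <stdint.h>
#include <stdio.h>

/* toy stand-ins for the on-disk item and the delayed-ref head */
struct toy_extent_item { uint64_t refs; uint64_t flags; };
struct toy_ref_head    { int64_t ref_mod; uint64_t flags_to_set; int update_flags; };

/* effective count/flags as if all queued delayed refs had been run */
static void lookup_extent_info(const struct toy_extent_item *ei,
                               const struct toy_ref_head *head,
                               uint64_t *refs, uint64_t *flags)
{
        *refs = ei->refs;
        *flags = ei->flags;
        if (head) {
                *refs += head->ref_mod;        /* sum of queued +1/-1 mods */
                if (head->update_flags)
                        *flags |= head->flags_to_set;
        }
}

int main(void)
{
        struct toy_extent_item ei = { .refs = 3, .flags = 0 };
        struct toy_ref_head head = { .ref_mod = -2, .flags_to_set = 0x1,
                                     .update_flags = 1 };
        uint64_t refs, flags;

        lookup_extent_info(&ei, &head, &refs, &flags);
        printf("effective refs=%llu flags=0x%llx\n",
               (unsigned long long)refs, (unsigned long long)flags);
        return 0;
}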
719/*
613 * Back reference rules. Back refs have three main goals: 720 * Back reference rules. Back refs have three main goals:
614 * 721 *
615 * 1) differentiate between all holders of references to an extent so that 722 * 1) differentiate between all holders of references to an extent so that
@@ -1871,7 +1978,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1871 return ret; 1978 return ret;
1872} 1979}
1873 1980
1874
1875/* helper function to actually process a single delayed ref entry */ 1981/* helper function to actually process a single delayed ref entry */
1876static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 1982static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1877 struct btrfs_root *root, 1983 struct btrfs_root *root,
@@ -1891,32 +1997,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1891 BUG_ON(extent_op); 1997 BUG_ON(extent_op);
1892 head = btrfs_delayed_node_to_head(node); 1998 head = btrfs_delayed_node_to_head(node);
1893 if (insert_reserved) { 1999 if (insert_reserved) {
1894 int mark_free = 0; 2000 btrfs_pin_extent(root, node->bytenr,
1895 struct extent_buffer *must_clean = NULL; 2001 node->num_bytes, 1);
1896
1897 ret = pin_down_bytes(trans, root, NULL,
1898 node->bytenr, node->num_bytes,
1899 head->is_data, 1, &must_clean);
1900 if (ret > 0)
1901 mark_free = 1;
1902
1903 if (must_clean) {
1904 clean_tree_block(NULL, root, must_clean);
1905 btrfs_tree_unlock(must_clean);
1906 free_extent_buffer(must_clean);
1907 }
1908 if (head->is_data) { 2002 if (head->is_data) {
1909 ret = btrfs_del_csums(trans, root, 2003 ret = btrfs_del_csums(trans, root,
1910 node->bytenr, 2004 node->bytenr,
1911 node->num_bytes); 2005 node->num_bytes);
1912 BUG_ON(ret); 2006 BUG_ON(ret);
1913 } 2007 }
1914 if (mark_free) {
1915 ret = btrfs_free_reserved_extent(root,
1916 node->bytenr,
1917 node->num_bytes);
1918 BUG_ON(ret);
1919 }
1920 } 2008 }
1921 mutex_unlock(&head->mutex); 2009 mutex_unlock(&head->mutex);
1922 return 0; 2010 return 0;
@@ -2347,6 +2435,8 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2347 ret = 0; 2435 ret = 0;
2348out: 2436out:
2349 btrfs_free_path(path); 2437 btrfs_free_path(path);
2438 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2439 WARN_ON(ret > 0);
2350 return ret; 2440 return ret;
2351} 2441}
2352 2442
@@ -2660,12 +2750,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2660 struct btrfs_space_info **space_info) 2750 struct btrfs_space_info **space_info)
2661{ 2751{
2662 struct btrfs_space_info *found; 2752 struct btrfs_space_info *found;
2753 int i;
2754 int factor;
2755
2756 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2757 BTRFS_BLOCK_GROUP_RAID10))
2758 factor = 2;
2759 else
2760 factor = 1;
2663 2761
2664 found = __find_space_info(info, flags); 2762 found = __find_space_info(info, flags);
2665 if (found) { 2763 if (found) {
2666 spin_lock(&found->lock); 2764 spin_lock(&found->lock);
2667 found->total_bytes += total_bytes; 2765 found->total_bytes += total_bytes;
2668 found->bytes_used += bytes_used; 2766 found->bytes_used += bytes_used;
2767 found->disk_used += bytes_used * factor;
2669 found->full = 0; 2768 found->full = 0;
2670 spin_unlock(&found->lock); 2769 spin_unlock(&found->lock);
2671 *space_info = found; 2770 *space_info = found;
@@ -2675,18 +2774,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2675 if (!found) 2774 if (!found)
2676 return -ENOMEM; 2775 return -ENOMEM;
2677 2776
2678 INIT_LIST_HEAD(&found->block_groups); 2777 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
2778 INIT_LIST_HEAD(&found->block_groups[i]);
2679 init_rwsem(&found->groups_sem); 2779 init_rwsem(&found->groups_sem);
2680 init_waitqueue_head(&found->flush_wait);
2681 init_waitqueue_head(&found->allocate_wait);
2682 spin_lock_init(&found->lock); 2780 spin_lock_init(&found->lock);
2683 found->flags = flags; 2781 found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
2782 BTRFS_BLOCK_GROUP_SYSTEM |
2783 BTRFS_BLOCK_GROUP_METADATA);
2684 found->total_bytes = total_bytes; 2784 found->total_bytes = total_bytes;
2685 found->bytes_used = bytes_used; 2785 found->bytes_used = bytes_used;
2786 found->disk_used = bytes_used * factor;
2686 found->bytes_pinned = 0; 2787 found->bytes_pinned = 0;
2687 found->bytes_reserved = 0; 2788 found->bytes_reserved = 0;
2688 found->bytes_readonly = 0; 2789 found->bytes_readonly = 0;
2689 found->bytes_delalloc = 0; 2790 found->bytes_may_use = 0;
2690 found->full = 0; 2791 found->full = 0;
2691 found->force_alloc = 0; 2792 found->force_alloc = 0;
2692 *space_info = found; 2793 *space_info = found;
@@ -2711,19 +2812,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2711 } 2812 }
2712} 2813}
2713 2814
2714static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
2715{
2716 spin_lock(&cache->space_info->lock);
2717 spin_lock(&cache->lock);
2718 if (!cache->ro) {
2719 cache->space_info->bytes_readonly += cache->key.offset -
2720 btrfs_block_group_used(&cache->item);
2721 cache->ro = 1;
2722 }
2723 spin_unlock(&cache->lock);
2724 spin_unlock(&cache->space_info->lock);
2725}
2726
2727u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 2815u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2728{ 2816{
2729 u64 num_devices = root->fs_info->fs_devices->rw_devices; 2817 u64 num_devices = root->fs_info->fs_devices->rw_devices;
@@ -2752,491 +2840,50 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2752 return flags; 2840 return flags;
2753} 2841}
2754 2842
2755static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) 2843static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
2756{
2757 struct btrfs_fs_info *info = root->fs_info;
2758 u64 alloc_profile;
2759
2760 if (data) {
2761 alloc_profile = info->avail_data_alloc_bits &
2762 info->data_alloc_profile;
2763 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
2764 } else if (root == root->fs_info->chunk_root) {
2765 alloc_profile = info->avail_system_alloc_bits &
2766 info->system_alloc_profile;
2767 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
2768 } else {
2769 alloc_profile = info->avail_metadata_alloc_bits &
2770 info->metadata_alloc_profile;
2771 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
2772 }
2773
2774 return btrfs_reduce_alloc_profile(root, data);
2775}
2776
2777void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2778{
2779 u64 alloc_target;
2780
2781 alloc_target = btrfs_get_alloc_profile(root, 1);
2782 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2783 alloc_target);
2784}
2785
2786static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2787{
2788 u64 num_bytes;
2789 int level;
2790
2791 level = BTRFS_MAX_LEVEL - 2;
2792 /*
2793 * NOTE: these calculations are absolutely the worst possible case.
2794 * This assumes that _every_ item we insert will require a new leaf, and
2795 * that the tree has grown to its maximum level size.
2796 */
2797
2798 /*
2799 * for every item we insert we could insert both an extent item and an
2800 * extent ref item. Then for every item we insert, we will need to cow
2801 * both the original leaf, plus the leaf to the left and right of it.
2802 *
2803 * Unless we are talking about the extent root, then we just want the
2804 * number of items * 2, since we just need the extent item plus its ref.
2805 */
2806 if (root == root->fs_info->extent_root)
2807 num_bytes = num_items * 2;
2808 else
2809 num_bytes = (num_items + (2 * num_items)) * 3;
2810
2811 /*
2812 * num_bytes is total number of leaves we could need times the leaf
2813 * size, and then for every leaf we could end up cow'ing 2 nodes per
2814 * level, down to the leaf level.
2815 */
2816 num_bytes = (num_bytes * root->leafsize) +
2817 (num_bytes * (level * 2)) * root->nodesize;
2818
2819 return num_bytes;
2820}
2821
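As a sanity check on the worst-case arithmetic in this (removed) helper: with 4 KiB leaves and nodes, level = BTRFS_MAX_LEVEL - 2 = 6 and one item on a non-extent-root tree, num_bytes starts at (1 + 2) * 3 = 9 leaves, giving 9 * 4096 + (9 * 12) * 4096 = 479232 bytes, roughly 468 KiB reserved per item. The same formula as a standalone sketch (constants assumed, not taken from the headers):

#include <stdint.h>
#include <stdio.h>

#define TOY_MAX_LEVEL 8   /* BTRFS_MAX_LEVEL in this era of the code */

static uint64_t worst_case_bytes(uint64_t num_items, uint64_t leafsize,
                                 uint64_t nodesize, int is_extent_root)
{
        int level = TOY_MAX_LEVEL - 2;
        uint64_t num_bytes;

        /* extent root: item plus its ref; others: 3 leaves per item, x3 for cow */
        if (is_extent_root)
                num_bytes = num_items * 2;
        else
                num_bytes = (num_items + 2 * num_items) * 3;

        /* the leaves themselves, plus 2 cow'd nodes per level for each leaf */
        return num_bytes * leafsize + (num_bytes * (level * 2)) * nodesize;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)worst_case_bytes(1, 4096, 4096, 0));
        return 0;   /* prints 479232 */
}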
2822/*
2823 * Unreserve metadata space for delalloc. If we have fewer reserved credits than
2824 * we have extents, this function does nothing.
2825 */
2826int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2827 struct inode *inode, int num_items)
2828{
2829 struct btrfs_fs_info *info = root->fs_info;
2830 struct btrfs_space_info *meta_sinfo;
2831 u64 num_bytes;
2832 u64 alloc_target;
2833 bool bug = false;
2834
2835 /* get the space info for where the metadata will live */
2836 alloc_target = btrfs_get_alloc_profile(root, 0);
2837 meta_sinfo = __find_space_info(info, alloc_target);
2838
2839 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2840 num_items);
2841
2842 spin_lock(&meta_sinfo->lock);
2843 spin_lock(&BTRFS_I(inode)->accounting_lock);
2844 if (BTRFS_I(inode)->reserved_extents <=
2845 BTRFS_I(inode)->outstanding_extents) {
2846 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2847 spin_unlock(&meta_sinfo->lock);
2848 return 0;
2849 }
2850 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2851
2852 BTRFS_I(inode)->reserved_extents -= num_items;
2853 BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2854
2855 if (meta_sinfo->bytes_delalloc < num_bytes) {
2856 bug = true;
2857 meta_sinfo->bytes_delalloc = 0;
2858 } else {
2859 meta_sinfo->bytes_delalloc -= num_bytes;
2860 }
2861 spin_unlock(&meta_sinfo->lock);
2862
2863 BUG_ON(bug);
2864
2865 return 0;
2866}
2867
2868static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2869{ 2844{
2870 u64 thresh; 2845 if (flags & BTRFS_BLOCK_GROUP_DATA)
2871 2846 flags |= root->fs_info->avail_data_alloc_bits &
2872 thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 2847 root->fs_info->data_alloc_profile;
2873 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 2848 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2874 meta_sinfo->bytes_super + meta_sinfo->bytes_root + 2849 flags |= root->fs_info->avail_system_alloc_bits &
2875 meta_sinfo->bytes_may_use; 2850 root->fs_info->system_alloc_profile;
2876 2851 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
2877 thresh = meta_sinfo->total_bytes - thresh; 2852 flags |= root->fs_info->avail_metadata_alloc_bits &
2878 thresh *= 80; 2853 root->fs_info->metadata_alloc_profile;
2879 do_div(thresh, 100); 2854 return btrfs_reduce_alloc_profile(root, flags);
2880 if (thresh <= meta_sinfo->bytes_delalloc)
2881 meta_sinfo->force_delalloc = 1;
2882 else
2883 meta_sinfo->force_delalloc = 0;
2884} 2855}
2885 2856
2886struct async_flush { 2857static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
2887 struct btrfs_root *root;
2888 struct btrfs_space_info *info;
2889 struct btrfs_work work;
2890};
2891
2892static noinline void flush_delalloc_async(struct btrfs_work *work)
2893{ 2858{
2894 struct async_flush *async; 2859 u64 flags;
2895 struct btrfs_root *root;
2896 struct btrfs_space_info *info;
2897
2898 async = container_of(work, struct async_flush, work);
2899 root = async->root;
2900 info = async->info;
2901
2902 btrfs_start_delalloc_inodes(root, 0);
2903 wake_up(&info->flush_wait);
2904 btrfs_wait_ordered_extents(root, 0, 0);
2905
2906 spin_lock(&info->lock);
2907 info->flushing = 0;
2908 spin_unlock(&info->lock);
2909 wake_up(&info->flush_wait);
2910
2911 kfree(async);
2912}
2913
2914static void wait_on_flush(struct btrfs_space_info *info)
2915{
2916 DEFINE_WAIT(wait);
2917 u64 used;
2918
2919 while (1) {
2920 prepare_to_wait(&info->flush_wait, &wait,
2921 TASK_UNINTERRUPTIBLE);
2922 spin_lock(&info->lock);
2923 if (!info->flushing) {
2924 spin_unlock(&info->lock);
2925 break;
2926 }
2927
2928 used = info->bytes_used + info->bytes_reserved +
2929 info->bytes_pinned + info->bytes_readonly +
2930 info->bytes_super + info->bytes_root +
2931 info->bytes_may_use + info->bytes_delalloc;
2932 if (used < info->total_bytes) {
2933 spin_unlock(&info->lock);
2934 break;
2935 }
2936 spin_unlock(&info->lock);
2937 schedule();
2938 }
2939 finish_wait(&info->flush_wait, &wait);
2940}
2941
2942static void flush_delalloc(struct btrfs_root *root,
2943 struct btrfs_space_info *info)
2944{
2945 struct async_flush *async;
2946 bool wait = false;
2947
2948 spin_lock(&info->lock);
2949 2860
2950 if (!info->flushing) 2861 if (data)
2951 info->flushing = 1; 2862 flags = BTRFS_BLOCK_GROUP_DATA;
2863 else if (root == root->fs_info->chunk_root)
2864 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2952 else 2865 else
2953 wait = true; 2866 flags = BTRFS_BLOCK_GROUP_METADATA;
2954
2955 spin_unlock(&info->lock);
2956
2957 if (wait) {
2958 wait_on_flush(info);
2959 return;
2960 }
2961
2962 async = kzalloc(sizeof(*async), GFP_NOFS);
2963 if (!async)
2964 goto flush;
2965
2966 async->root = root;
2967 async->info = info;
2968 async->work.func = flush_delalloc_async;
2969
2970 btrfs_queue_worker(&root->fs_info->enospc_workers,
2971 &async->work);
2972 wait_on_flush(info);
2973 return;
2974
2975flush:
2976 btrfs_start_delalloc_inodes(root, 0);
2977 btrfs_wait_ordered_extents(root, 0, 0);
2978
2979 spin_lock(&info->lock);
2980 info->flushing = 0;
2981 spin_unlock(&info->lock);
2982 wake_up(&info->flush_wait);
2983}
2984
2985static int maybe_allocate_chunk(struct btrfs_root *root,
2986 struct btrfs_space_info *info)
2987{
2988 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
2989 struct btrfs_trans_handle *trans;
2990 bool wait = false;
2991 int ret = 0;
2992 u64 min_metadata;
2993 u64 free_space;
2994
2995 free_space = btrfs_super_total_bytes(disk_super);
2996 /*
2997 * we allow the metadata to grow to a max of either 10gb or 5% of the
2998 * space in the volume.
2999 */
3000 min_metadata = min((u64)10 * 1024 * 1024 * 1024,
3001 div64_u64(free_space * 5, 100));
3002 if (info->total_bytes >= min_metadata) {
3003 spin_unlock(&info->lock);
3004 return 0;
3005 }
3006
3007 if (info->full) {
3008 spin_unlock(&info->lock);
3009 return 0;
3010 }
3011
3012 if (!info->allocating_chunk) {
3013 info->force_alloc = 1;
3014 info->allocating_chunk = 1;
3015 } else {
3016 wait = true;
3017 }
3018
3019 spin_unlock(&info->lock);
3020
3021 if (wait) {
3022 wait_event(info->allocate_wait,
3023 !info->allocating_chunk);
3024 return 1;
3025 }
3026
3027 trans = btrfs_start_transaction(root, 1);
3028 if (!trans) {
3029 ret = -ENOMEM;
3030 goto out;
3031 }
3032
3033 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3034 4096 + 2 * 1024 * 1024,
3035 info->flags, 0);
3036 btrfs_end_transaction(trans, root);
3037 if (ret)
3038 goto out;
3039out:
3040 spin_lock(&info->lock);
3041 info->allocating_chunk = 0;
3042 spin_unlock(&info->lock);
3043 wake_up(&info->allocate_wait);
3044
3045 if (ret)
3046 return 0;
3047 return 1;
3048}
3049
3050/*
3051 * Reserve metadata space for delalloc.
3052 */
3053int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
3054 struct inode *inode, int num_items)
3055{
3056 struct btrfs_fs_info *info = root->fs_info;
3057 struct btrfs_space_info *meta_sinfo;
3058 u64 num_bytes;
3059 u64 used;
3060 u64 alloc_target;
3061 int flushed = 0;
3062 int force_delalloc;
3063
3064 /* get the space info for where the metadata will live */
3065 alloc_target = btrfs_get_alloc_profile(root, 0);
3066 meta_sinfo = __find_space_info(info, alloc_target);
3067
3068 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
3069 num_items);
3070again:
3071 spin_lock(&meta_sinfo->lock);
3072
3073 force_delalloc = meta_sinfo->force_delalloc;
3074
3075 if (unlikely(!meta_sinfo->bytes_root))
3076 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3077
3078 if (!flushed)
3079 meta_sinfo->bytes_delalloc += num_bytes;
3080
3081 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3082 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3083 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3084 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3085
3086 if (used > meta_sinfo->total_bytes) {
3087 flushed++;
3088
3089 if (flushed == 1) {
3090 if (maybe_allocate_chunk(root, meta_sinfo))
3091 goto again;
3092 flushed++;
3093 } else {
3094 spin_unlock(&meta_sinfo->lock);
3095 }
3096
3097 if (flushed == 2) {
3098 filemap_flush(inode->i_mapping);
3099 goto again;
3100 } else if (flushed == 3) {
3101 flush_delalloc(root, meta_sinfo);
3102 goto again;
3103 }
3104 spin_lock(&meta_sinfo->lock);
3105 meta_sinfo->bytes_delalloc -= num_bytes;
3106 spin_unlock(&meta_sinfo->lock);
3107 printk(KERN_ERR "enospc, has %d, reserved %d\n",
3108 BTRFS_I(inode)->outstanding_extents,
3109 BTRFS_I(inode)->reserved_extents);
3110 dump_space_info(meta_sinfo, 0, 0);
3111 return -ENOSPC;
3112 }
3113 2867
3114 BTRFS_I(inode)->reserved_extents += num_items; 2868 return get_alloc_profile(root, flags);
3115 check_force_delalloc(meta_sinfo);
3116 spin_unlock(&meta_sinfo->lock);
3117
3118 if (!flushed && force_delalloc)
3119 filemap_flush(inode->i_mapping);
3120
3121 return 0;
3122} 2869}
3123 2870
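The removed reservation path above escalated through progressively heavier remedies, first allocating a chunk, then flushing the inode's own pages, then flushing all delalloc, before giving up with -ENOSPC. A generic sketch of that escalation pattern (the remedy hooks are stand-ins, not kernel calls):

#include <stdio.h>

/* stand-in remedies, ordered from cheapest to most expensive */
static int try_allocate_chunk(void) { return 0; }  /* 1: grow the pool  */
static int flush_this_inode(void)   { return 0; }  /* 2: filemap_flush  */
static int flush_all_delalloc(void) { return 1; }  /* 3: flush_delalloc */

static int reserve(long need, long *available)
{
        int attempt = 0;

        for (;;) {
                if (*available >= need) {
                        *available -= need;
                        return 0;
                }
                switch (attempt++) {
                case 0: if (try_allocate_chunk()) *available += 1024; break;
                case 1: if (flush_this_inode())   *available += 256;  break;
                case 2: if (flush_all_delalloc()) *available += 512;  break;
                default:
                        return -1;              /* -ENOSPC */
                }
        }
}

int main(void)
{
        long avail = 100;
        printf("ret=%d avail=%ld\n", reserve(400, &avail), avail);
        return 0;
}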
3124/* 2871void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
3125 * unreserve num_items number of items worth of metadata space. This needs to
3126 * be paired with btrfs_reserve_metadata_space.
3127 *
3128 * NOTE: if you have the option, run this _AFTER_ you do a
3129 * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
3130 * operations which will result in more used metadata, so we want to make sure we
3131 * can do that without issue.
3132 */
3133int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
3134{
3135 struct btrfs_fs_info *info = root->fs_info;
3136 struct btrfs_space_info *meta_sinfo;
3137 u64 num_bytes;
3138 u64 alloc_target;
3139 bool bug = false;
3140
3141 /* get the space info for where the metadata will live */
3142 alloc_target = btrfs_get_alloc_profile(root, 0);
3143 meta_sinfo = __find_space_info(info, alloc_target);
3144
3145 num_bytes = calculate_bytes_needed(root, num_items);
3146
3147 spin_lock(&meta_sinfo->lock);
3148 if (meta_sinfo->bytes_may_use < num_bytes) {
3149 bug = true;
3150 meta_sinfo->bytes_may_use = 0;
3151 } else {
3152 meta_sinfo->bytes_may_use -= num_bytes;
3153 }
3154 spin_unlock(&meta_sinfo->lock);
3155
3156 BUG_ON(bug);
3157
3158 return 0;
3159}
3160
3161/*
3162 * Reserve some metadata space for use. We'll calculate the worst-case number
3163 * of bytes that would be needed to modify num_items number of items. If we
3164 * have space, fantastic, if not, you get -ENOSPC. Please call
3165 * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
3166 * items you reserved, since whatever metadata you needed should have already
3167 * been allocated.
3168 *
3169 * This will commit the transaction to make more space if we don't have enough
3170 * metadata space. The only time we don't do this is if we're reserving space
3171 * inside of a transaction, then we will just return -ENOSPC and it is the
3172 * caller's responsibility to handle it properly.
3173 */
3174int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
3175{ 2872{
3176 struct btrfs_fs_info *info = root->fs_info; 2873 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
3177 struct btrfs_space_info *meta_sinfo; 2874 BTRFS_BLOCK_GROUP_DATA);
3178 u64 num_bytes;
3179 u64 used;
3180 u64 alloc_target;
3181 int retries = 0;
3182
3183 /* get the space info for where the metadata will live */
3184 alloc_target = btrfs_get_alloc_profile(root, 0);
3185 meta_sinfo = __find_space_info(info, alloc_target);
3186
3187 num_bytes = calculate_bytes_needed(root, num_items);
3188again:
3189 spin_lock(&meta_sinfo->lock);
3190
3191 if (unlikely(!meta_sinfo->bytes_root))
3192 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3193
3194 if (!retries)
3195 meta_sinfo->bytes_may_use += num_bytes;
3196
3197 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3198 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3199 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3200 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3201
3202 if (used > meta_sinfo->total_bytes) {
3203 retries++;
3204 if (retries == 1) {
3205 if (maybe_allocate_chunk(root, meta_sinfo))
3206 goto again;
3207 retries++;
3208 } else {
3209 spin_unlock(&meta_sinfo->lock);
3210 }
3211
3212 if (retries == 2) {
3213 flush_delalloc(root, meta_sinfo);
3214 goto again;
3215 }
3216 spin_lock(&meta_sinfo->lock);
3217 meta_sinfo->bytes_may_use -= num_bytes;
3218 spin_unlock(&meta_sinfo->lock);
3219
3220 dump_space_info(meta_sinfo, 0, 0);
3221 return -ENOSPC;
3222 }
3223
3224 check_force_delalloc(meta_sinfo);
3225 spin_unlock(&meta_sinfo->lock);
3226
3227 return 0;
3228} 2875}
3229 2876
3230/* 2877/*
3231 * This will check the space that the inode allocates from to make sure we have 2878 * This will check the space that the inode allocates from to make sure we have
3232 * enough space for bytes. 2879 * enough space for bytes.
3233 */ 2880 */
3234int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2881int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3235 u64 bytes)
3236{ 2882{
3237 struct btrfs_space_info *data_sinfo; 2883 struct btrfs_space_info *data_sinfo;
2884 struct btrfs_root *root = BTRFS_I(inode)->root;
3238 u64 used; 2885 u64 used;
3239 int ret = 0, committed = 0, flushed = 0; 2886 int ret = 0, committed = 0;
3240 2887
3241 /* make sure bytes are sectorsize aligned */ 2888 /* make sure bytes are sectorsize aligned */
3242 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 2889 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
@@ -3248,21 +2895,13 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
3248again: 2895again:
3249 /* make sure we have enough space to handle the data first */ 2896 /* make sure we have enough space to handle the data first */
3250 spin_lock(&data_sinfo->lock); 2897 spin_lock(&data_sinfo->lock);
3251 used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc + 2898 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3252 data_sinfo->bytes_reserved + data_sinfo->bytes_pinned + 2899 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3253 data_sinfo->bytes_readonly + data_sinfo->bytes_may_use + 2900 data_sinfo->bytes_may_use;
3254 data_sinfo->bytes_super;
3255 2901
3256 if (used + bytes > data_sinfo->total_bytes) { 2902 if (used + bytes > data_sinfo->total_bytes) {
3257 struct btrfs_trans_handle *trans; 2903 struct btrfs_trans_handle *trans;
3258 2904
3259 if (!flushed) {
3260 spin_unlock(&data_sinfo->lock);
3261 flush_delalloc(root, data_sinfo);
3262 flushed = 1;
3263 goto again;
3264 }
3265
3266 /* 2905 /*
3267 * if we don't have enough free bytes in this space then we need 2906 * if we don't have enough free bytes in this space then we need
3268 * to alloc a new chunk. 2907 * to alloc a new chunk.
@@ -3274,15 +2913,15 @@ again:
3274 spin_unlock(&data_sinfo->lock); 2913 spin_unlock(&data_sinfo->lock);
3275alloc: 2914alloc:
3276 alloc_target = btrfs_get_alloc_profile(root, 1); 2915 alloc_target = btrfs_get_alloc_profile(root, 1);
3277 trans = btrfs_start_transaction(root, 1); 2916 trans = btrfs_join_transaction(root, 1);
3278 if (!trans) 2917 if (IS_ERR(trans))
3279 return -ENOMEM; 2918 return PTR_ERR(trans);
3280 2919
3281 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2920 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3282 bytes + 2 * 1024 * 1024, 2921 bytes + 2 * 1024 * 1024,
3283 alloc_target, 0); 2922 alloc_target, 0);
3284 btrfs_end_transaction(trans, root); 2923 btrfs_end_transaction(trans, root);
3285 if (ret) 2924 if (ret < 0)
3286 return ret; 2925 return ret;
3287 2926
3288 if (!data_sinfo) { 2927 if (!data_sinfo) {
@@ -3297,25 +2936,26 @@ alloc:
3297 if (!committed && !root->fs_info->open_ioctl_trans) { 2936 if (!committed && !root->fs_info->open_ioctl_trans) {
3298 committed = 1; 2937 committed = 1;
3299 trans = btrfs_join_transaction(root, 1); 2938 trans = btrfs_join_transaction(root, 1);
3300 if (!trans) 2939 if (IS_ERR(trans))
3301 return -ENOMEM; 2940 return PTR_ERR(trans);
3302 ret = btrfs_commit_transaction(trans, root); 2941 ret = btrfs_commit_transaction(trans, root);
3303 if (ret) 2942 if (ret)
3304 return ret; 2943 return ret;
3305 goto again; 2944 goto again;
3306 } 2945 }
3307 2946
3308 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" 2947#if 0 /* I hope we never need this code again, just in case */
3309 ", %llu bytes_used, %llu bytes_reserved, " 2948 printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
3310 "%llu bytes_pinned, %llu bytes_readonly, %llu may use " 2949 "%llu bytes_reserved, " "%llu bytes_pinned, "
3311 "%llu total\n", (unsigned long long)bytes, 2950 "%llu bytes_readonly, %llu may use %llu total\n",
3312 (unsigned long long)data_sinfo->bytes_delalloc, 2951 (unsigned long long)bytes,
3313 (unsigned long long)data_sinfo->bytes_used, 2952 (unsigned long long)data_sinfo->bytes_used,
3314 (unsigned long long)data_sinfo->bytes_reserved, 2953 (unsigned long long)data_sinfo->bytes_reserved,
3315 (unsigned long long)data_sinfo->bytes_pinned, 2954 (unsigned long long)data_sinfo->bytes_pinned,
3316 (unsigned long long)data_sinfo->bytes_readonly, 2955 (unsigned long long)data_sinfo->bytes_readonly,
3317 (unsigned long long)data_sinfo->bytes_may_use, 2956 (unsigned long long)data_sinfo->bytes_may_use,
3318 (unsigned long long)data_sinfo->total_bytes); 2957 (unsigned long long)data_sinfo->total_bytes);
2958#endif
3319 return -ENOSPC; 2959 return -ENOSPC;
3320 } 2960 }
3321 data_sinfo->bytes_may_use += bytes; 2961 data_sinfo->bytes_may_use += bytes;
@@ -3326,12 +2966,13 @@ alloc:
3326} 2966}
3327 2967
3328/* 2968/*
3329 * if there was an error for whatever reason after calling 2969 * called when we are clearing a delalloc extent from the
3330 * btrfs_check_data_free_space, call this so we can clean up the counters. 2970 * inode's io_tree or there was an error for whatever reason
2971 * after calling btrfs_check_data_free_space
3331 */ 2972 */
3332void btrfs_free_reserved_data_space(struct btrfs_root *root, 2973void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3333 struct inode *inode, u64 bytes)
3334{ 2974{
2975 struct btrfs_root *root = BTRFS_I(inode)->root;
3335 struct btrfs_space_info *data_sinfo; 2976 struct btrfs_space_info *data_sinfo;
3336 2977
3337 /* make sure bytes are sectorsize aligned */ 2978 /* make sure bytes are sectorsize aligned */
@@ -3344,48 +2985,6 @@ void btrfs_free_reserved_data_space(struct btrfs_root *root,
3344 spin_unlock(&data_sinfo->lock); 2985 spin_unlock(&data_sinfo->lock);
3345} 2986}
3346 2987
3347/* called when we are adding a delalloc extent to the inode's io_tree */
3348void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
3349 u64 bytes)
3350{
3351 struct btrfs_space_info *data_sinfo;
3352
3353 /* get the space info for where this inode will be storing its data */
3354 data_sinfo = BTRFS_I(inode)->space_info;
3355
3356 /* make sure we have enough space to handle the data first */
3357 spin_lock(&data_sinfo->lock);
3358 data_sinfo->bytes_delalloc += bytes;
3359
3360 /*
3361 * we are adding a delalloc extent without calling
3362 * btrfs_check_data_free_space first. This happens on a weird
3363 * writepage condition, but shouldn't hurt our accounting
3364 */
3365 if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
3366 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
3367 BTRFS_I(inode)->reserved_bytes = 0;
3368 } else {
3369 data_sinfo->bytes_may_use -= bytes;
3370 BTRFS_I(inode)->reserved_bytes -= bytes;
3371 }
3372
3373 spin_unlock(&data_sinfo->lock);
3374}
3375
3376/* called when we are clearing a delalloc extent from the inode's io_tree */
3377void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
3378 u64 bytes)
3379{
3380 struct btrfs_space_info *info;
3381
3382 info = BTRFS_I(inode)->space_info;
3383
3384 spin_lock(&info->lock);
3385 info->bytes_delalloc -= bytes;
3386 spin_unlock(&info->lock);
3387}
3388
3389static void force_metadata_allocation(struct btrfs_fs_info *info) 2988static void force_metadata_allocation(struct btrfs_fs_info *info)
3390{ 2989{
3391 struct list_head *head = &info->space_info; 2990 struct list_head *head = &info->space_info;
@@ -3399,13 +2998,28 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
3399 rcu_read_unlock(); 2998 rcu_read_unlock();
3400} 2999}
3401 3000
3001static int should_alloc_chunk(struct btrfs_space_info *sinfo,
3002 u64 alloc_bytes)
3003{
3004 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3005
3006 if (sinfo->bytes_used + sinfo->bytes_reserved +
3007 alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3008 return 0;
3009
3010 if (sinfo->bytes_used + sinfo->bytes_reserved +
3011 alloc_bytes < div_factor(num_bytes, 8))
3012 return 0;
3013
3014 return 1;
3015}
3016
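should_alloc_chunk() asks for a new chunk only when the space is both within 256 MiB of full and more than 80% committed (div_factor(n, 8) is n * 8 / 10 in this code base). A standalone restatement of the heuristic:

#include <stdint.h>
#include <stdio.h>

static uint64_t div_factor(uint64_t num, int factor)
{
        return num * factor / 10;    /* mirrors btrfs's helper */
}

static int should_alloc_chunk(uint64_t total, uint64_t readonly,
                              uint64_t used, uint64_t reserved,
                              uint64_t alloc_bytes)
{
        uint64_t usable = total - readonly;

        if (used + reserved + alloc_bytes + 256ULL * 1024 * 1024 < usable)
                return 0;    /* still plenty of headroom */
        if (used + reserved + alloc_bytes < div_factor(usable, 8))
                return 0;    /* under 80% committed */
        return 1;
}

int main(void)
{
        uint64_t gib = 1024ULL * 1024 * 1024;
        printf("%d\n", should_alloc_chunk(gib, 0, 900 * gib / 1024, 0, gib / 64));
        return 0;   /* prints 1: near full and over 80% committed */
}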
3402static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3017static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3403 struct btrfs_root *extent_root, u64 alloc_bytes, 3018 struct btrfs_root *extent_root, u64 alloc_bytes,
3404 u64 flags, int force) 3019 u64 flags, int force)
3405{ 3020{
3406 struct btrfs_space_info *space_info; 3021 struct btrfs_space_info *space_info;
3407 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3022 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3408 u64 thresh;
3409 int ret = 0; 3023 int ret = 0;
3410 3024
3411 mutex_lock(&fs_info->chunk_mutex); 3025 mutex_lock(&fs_info->chunk_mutex);
@@ -3428,11 +3042,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3428 goto out; 3042 goto out;
3429 } 3043 }
3430 3044
3431 thresh = space_info->total_bytes - space_info->bytes_readonly; 3045 if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
3432 thresh = div_factor(thresh, 8);
3433 if (!force &&
3434 (space_info->bytes_used + space_info->bytes_pinned +
3435 space_info->bytes_reserved + alloc_bytes) < thresh) {
3436 spin_unlock(&space_info->lock); 3046 spin_unlock(&space_info->lock);
3437 goto out; 3047 goto out;
3438 } 3048 }
@@ -3454,6 +3064,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3454 spin_lock(&space_info->lock); 3064 spin_lock(&space_info->lock);
3455 if (ret) 3065 if (ret)
3456 space_info->full = 1; 3066 space_info->full = 1;
3067 else
3068 ret = 1;
3457 space_info->force_alloc = 0; 3069 space_info->force_alloc = 0;
3458 spin_unlock(&space_info->lock); 3070 spin_unlock(&space_info->lock);
3459out: 3071out:
@@ -3461,13 +3073,713 @@ out:
3461 return ret; 3073 return ret;
3462} 3074}
3463 3075
3076static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
3077 struct btrfs_root *root,
3078 struct btrfs_space_info *sinfo, u64 num_bytes)
3079{
3080 int ret;
3081 int end_trans = 0;
3082
3083 if (sinfo->full)
3084 return 0;
3085
3086 spin_lock(&sinfo->lock);
3087 ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
3088 spin_unlock(&sinfo->lock);
3089 if (!ret)
3090 return 0;
3091
3092 if (!trans) {
3093 trans = btrfs_join_transaction(root, 1);
3094 BUG_ON(IS_ERR(trans));
3095 end_trans = 1;
3096 }
3097
3098 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3099 num_bytes + 2 * 1024 * 1024,
3100 get_alloc_profile(root, sinfo->flags), 0);
3101
3102 if (end_trans)
3103 btrfs_end_transaction(trans, root);
3104
3105 return ret == 1 ? 1 : 0;
3106}
3107
3108/*
3109 * shrink metadata reservation for delalloc
3110 */
3111static int shrink_delalloc(struct btrfs_trans_handle *trans,
3112 struct btrfs_root *root, u64 to_reclaim)
3113{
3114 struct btrfs_block_rsv *block_rsv;
3115 u64 reserved;
3116 u64 max_reclaim;
3117 u64 reclaimed = 0;
3118 int pause = 1;
3119 int ret;
3120
3121 block_rsv = &root->fs_info->delalloc_block_rsv;
3122 spin_lock(&block_rsv->lock);
3123 reserved = block_rsv->reserved;
3124 spin_unlock(&block_rsv->lock);
3125
3126 if (reserved == 0)
3127 return 0;
3128
3129 max_reclaim = min(reserved, to_reclaim);
3130
3131 while (1) {
3132 ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
3133 if (!ret) {
3134 __set_current_state(TASK_INTERRUPTIBLE);
3135 schedule_timeout(pause);
3136 pause <<= 1;
3137 if (pause > HZ / 10)
3138 pause = HZ / 10;
3139 } else {
3140 pause = 1;
3141 }
3142
3143 spin_lock(&block_rsv->lock);
3144 if (reserved > block_rsv->reserved)
3145 reclaimed = reserved - block_rsv->reserved;
3146 reserved = block_rsv->reserved;
3147 spin_unlock(&block_rsv->lock);
3148
3149 if (reserved == 0 || reclaimed >= max_reclaim)
3150 break;
3151
3152 if (trans && trans->transaction->blocked)
3153 return -EAGAIN;
3154 }
3155 return reclaimed >= to_reclaim;
3156}
3157
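shrink_delalloc() polls the reservation while kicking delalloc writeback, doubling its sleep up to HZ/10 whenever no inode could be started and resetting it on progress. The backoff on its own looks like this (HZ and the flush hook are assumptions of the sketch):

#include <stdio.h>

#define HZ 1000   /* assumed tick rate for the sketch */

/* returns nonzero if an inode was queued for writeback */
static int start_one_delalloc_inode(int i) { return i % 3 != 0; }

int main(void)
{
        int pause = 1;

        for (int i = 0; i < 10; i++) {
                if (!start_one_delalloc_inode(i)) {
                        /* nothing to flush: sleep, then back off exponentially */
                        pause <<= 1;
                        if (pause > HZ / 10)
                                pause = HZ / 10;
                } else {
                        pause = 1;  /* made progress: reset to the minimum */
                }
                printf("iter %d pause %d\n", i, pause);
        }
        return 0;
}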
3158static int should_retry_reserve(struct btrfs_trans_handle *trans,
3159 struct btrfs_root *root,
3160 struct btrfs_block_rsv *block_rsv,
3161 u64 num_bytes, int *retries)
3162{
3163 struct btrfs_space_info *space_info = block_rsv->space_info;
3164 int ret;
3165
3166 if ((*retries) > 2)
3167 return -ENOSPC;
3168
3169 ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
3170 if (ret)
3171 return 1;
3172
3173 if (trans && trans->transaction->in_commit)
3174 return -ENOSPC;
3175
3176 ret = shrink_delalloc(trans, root, num_bytes);
3177 if (ret)
3178 return ret;
3179
3180 spin_lock(&space_info->lock);
3181 if (space_info->bytes_pinned < num_bytes)
3182 ret = 1;
3183 spin_unlock(&space_info->lock);
3184 if (ret)
3185 return -ENOSPC;
3186
3187 (*retries)++;
3188
3189 if (trans)
3190 return -EAGAIN;
3191
3192 trans = btrfs_join_transaction(root, 1);
3193 BUG_ON(IS_ERR(trans));
3194 ret = btrfs_commit_transaction(trans, root);
3195 BUG_ON(ret);
3196
3197 return 1;
3198}
3199
3200static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
3201 u64 num_bytes)
3202{
3203 struct btrfs_space_info *space_info = block_rsv->space_info;
3204 u64 unused;
3205 int ret = -ENOSPC;
3206
3207 spin_lock(&space_info->lock);
3208 unused = space_info->bytes_used + space_info->bytes_reserved +
3209 space_info->bytes_pinned + space_info->bytes_readonly;
3210
3211 if (unused < space_info->total_bytes)
3212 unused = space_info->total_bytes - unused;
3213 else
3214 unused = 0;
3215
3216 if (unused >= num_bytes) {
3217 if (block_rsv->priority >= 10) {
3218 space_info->bytes_reserved += num_bytes;
3219 ret = 0;
3220 } else {
3221 if ((unused + block_rsv->reserved) *
3222 block_rsv->priority >=
3223 (num_bytes + block_rsv->reserved) * 10) {
3224 space_info->bytes_reserved += num_bytes;
3225 ret = 0;
3226 }
3227 }
3228 }
3229 spin_unlock(&space_info->lock);
3230
3231 return ret;
3232}
3233
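reserve_metadata_bytes() lets a reservation with priority >= 10 take any free space, while lower priorities may only claim a proportional share: (unused + reserved) * priority must be at least (num_bytes + reserved) * 10. With nothing reserved yet, the default priority of 6 can therefore claim at most 60% of the free space. A sketch of just that rule:

#include <stdint.h>
#include <stdio.h>

/* lower-priority reservations may only consume a proportional share of
 * the unused space; priority >= 10 always wins */
static int may_reserve(uint64_t unused, uint64_t reserved,
                       uint64_t num_bytes, int priority)
{
        if (unused < num_bytes)
                return 0;
        if (priority >= 10)
                return 1;
        return (unused + reserved) * priority >=
               (num_bytes + reserved) * 10;
}

int main(void)
{
        /* with nothing reserved yet, priority 6 may claim up to 60% of free */
        printf("%d %d\n",
               may_reserve(100, 0, 60, 6),   /* 1: 100*6 >= 60*10 */
               may_reserve(100, 0, 61, 6));  /* 0: 100*6 <  61*10 */
        return 0;
}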
3234static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3235 struct btrfs_root *root)
3236{
3237 struct btrfs_block_rsv *block_rsv;
3238 if (root->ref_cows)
3239 block_rsv = trans->block_rsv;
3240 else
3241 block_rsv = root->block_rsv;
3242
3243 if (!block_rsv)
3244 block_rsv = &root->fs_info->empty_block_rsv;
3245
3246 return block_rsv;
3247}
3248
3249static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3250 u64 num_bytes)
3251{
3252 int ret = -ENOSPC;
3253 spin_lock(&block_rsv->lock);
3254 if (block_rsv->reserved >= num_bytes) {
3255 block_rsv->reserved -= num_bytes;
3256 if (block_rsv->reserved < block_rsv->size)
3257 block_rsv->full = 0;
3258 ret = 0;
3259 }
3260 spin_unlock(&block_rsv->lock);
3261 return ret;
3262}
3263
3264static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3265 u64 num_bytes, int update_size)
3266{
3267 spin_lock(&block_rsv->lock);
3268 block_rsv->reserved += num_bytes;
3269 if (update_size)
3270 block_rsv->size += num_bytes;
3271 else if (block_rsv->reserved >= block_rsv->size)
3272 block_rsv->full = 1;
3273 spin_unlock(&block_rsv->lock);
3274}
3275
3276void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3277 struct btrfs_block_rsv *dest, u64 num_bytes)
3278{
3279 struct btrfs_space_info *space_info = block_rsv->space_info;
3280
3281 spin_lock(&block_rsv->lock);
3282 if (num_bytes == (u64)-1)
3283 num_bytes = block_rsv->size;
3284 block_rsv->size -= num_bytes;
3285 if (block_rsv->reserved >= block_rsv->size) {
3286 num_bytes = block_rsv->reserved - block_rsv->size;
3287 block_rsv->reserved = block_rsv->size;
3288 block_rsv->full = 1;
3289 } else {
3290 num_bytes = 0;
3291 }
3292 spin_unlock(&block_rsv->lock);
3293
3294 if (num_bytes > 0) {
3295 if (dest) {
3296 block_rsv_add_bytes(dest, num_bytes, 0);
3297 } else {
3298 spin_lock(&space_info->lock);
3299 space_info->bytes_reserved -= num_bytes;
3300 spin_unlock(&space_info->lock);
3301 }
3302 }
3303}
3304
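block_rsv_release_bytes() shrinks the reservation's target size first; only if the bytes actually held now exceed the new size does the excess move, either into the destination rsv (the global one, in btrfs_block_rsv_release() below) or back to the space_info. A toy model of that spill:

#include <stdint.h>
#include <stdio.h>

struct toy_rsv { uint64_t size, reserved; };

/* shrink 'rsv' by num_bytes; anything actually held beyond the new
 * size spills into 'dest' instead of being lost */
static void release_bytes(struct toy_rsv *rsv, struct toy_rsv *dest,
                          uint64_t num_bytes)
{
        uint64_t excess = 0;

        rsv->size -= num_bytes;
        if (rsv->reserved > rsv->size) {
                excess = rsv->reserved - rsv->size;
                rsv->reserved = rsv->size;
        }
        if (excess && dest)
                dest->reserved += excess;
}

int main(void)
{
        struct toy_rsv rsv = { .size = 100, .reserved = 90 };
        struct toy_rsv global = { .size = 50, .reserved = 10 };

        release_bytes(&rsv, &global, 40);
        printf("rsv %llu/%llu global reserved %llu\n",
               (unsigned long long)rsv.reserved,
               (unsigned long long)rsv.size,
               (unsigned long long)global.reserved);
        return 0;   /* rsv 60/60, global reserved 40 */
}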
3305static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3306 struct btrfs_block_rsv *dst, u64 num_bytes)
3307{
3308 int ret;
3309
3310 ret = block_rsv_use_bytes(src, num_bytes);
3311 if (ret)
3312 return ret;
3313
3314 block_rsv_add_bytes(dst, num_bytes, 1);
3315 return 0;
3316}
3317
3318void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3319{
3320 memset(rsv, 0, sizeof(*rsv));
3321 spin_lock_init(&rsv->lock);
3322 atomic_set(&rsv->usage, 1);
3323 rsv->priority = 6;
3324 INIT_LIST_HEAD(&rsv->list);
3325}
3326
3327struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3328{
3329 struct btrfs_block_rsv *block_rsv;
3330 struct btrfs_fs_info *fs_info = root->fs_info;
3331 u64 alloc_target;
3332
3333 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3334 if (!block_rsv)
3335 return NULL;
3336
3337 btrfs_init_block_rsv(block_rsv);
3338
3339 alloc_target = btrfs_get_alloc_profile(root, 0);
3340 block_rsv->space_info = __find_space_info(fs_info,
3341 BTRFS_BLOCK_GROUP_METADATA);
3342
3343 return block_rsv;
3344}
3345
3346void btrfs_free_block_rsv(struct btrfs_root *root,
3347 struct btrfs_block_rsv *rsv)
3348{
3349 if (rsv && atomic_dec_and_test(&rsv->usage)) {
3350 btrfs_block_rsv_release(root, rsv, (u64)-1);
3351 if (!rsv->durable)
3352 kfree(rsv);
3353 }
3354}
3355
3356/*
3357 * make the block_rsv struct be able to capture freed space.
3358 * the captured space will be re-added to the block_rsv struct
3359 * after transaction commit
3360 */
3361void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3362 struct btrfs_block_rsv *block_rsv)
3363{
3364 block_rsv->durable = 1;
3365 mutex_lock(&fs_info->durable_block_rsv_mutex);
3366 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3367 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3368}
3369
3370int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3371 struct btrfs_root *root,
3372 struct btrfs_block_rsv *block_rsv,
3373 u64 num_bytes, int *retries)
3374{
3375 int ret;
3376
3377 if (num_bytes == 0)
3378 return 0;
3379again:
3380 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3381 if (!ret) {
3382 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3383 return 0;
3384 }
3385
3386 ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
3387 if (ret > 0)
3388 goto again;
3389
3390 return ret;
3391}
3392
3393int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3394 struct btrfs_root *root,
3395 struct btrfs_block_rsv *block_rsv,
3396 u64 min_reserved, int min_factor)
3397{
3398 u64 num_bytes = 0;
3399 int commit_trans = 0;
3400 int ret = -ENOSPC;
3401
3402 if (!block_rsv)
3403 return 0;
3404
3405 spin_lock(&block_rsv->lock);
3406 if (min_factor > 0)
3407 num_bytes = div_factor(block_rsv->size, min_factor);
3408 if (min_reserved > num_bytes)
3409 num_bytes = min_reserved;
3410
3411 if (block_rsv->reserved >= num_bytes) {
3412 ret = 0;
3413 } else {
3414 num_bytes -= block_rsv->reserved;
3415 if (block_rsv->durable &&
3416 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3417 commit_trans = 1;
3418 }
3419 spin_unlock(&block_rsv->lock);
3420 if (!ret)
3421 return 0;
3422
3423 if (block_rsv->refill_used) {
3424 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3425 if (!ret) {
3426 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3427 return 0;
3428 }
3429 }
3430
3431 if (commit_trans) {
3432 if (trans)
3433 return -EAGAIN;
3434
3435 trans = btrfs_join_transaction(root, 1);
3436 BUG_ON(IS_ERR(trans));
3437 ret = btrfs_commit_transaction(trans, root);
3438 return 0;
3439 }
3440
3441 WARN_ON(1);
3442 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
3443 block_rsv->size, block_rsv->reserved,
3444 block_rsv->freed[0], block_rsv->freed[1]);
3445
3446 return -ENOSPC;
3447}
3448
3449int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3450 struct btrfs_block_rsv *dst_rsv,
3451 u64 num_bytes)
3452{
3453 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3454}
3455
3456void btrfs_block_rsv_release(struct btrfs_root *root,
3457 struct btrfs_block_rsv *block_rsv,
3458 u64 num_bytes)
3459{
3460 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3461 if (global_rsv->full || global_rsv == block_rsv ||
3462 block_rsv->space_info != global_rsv->space_info)
3463 global_rsv = NULL;
3464 block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3465}
3466
3467/*
3468 * helper to calculate the size of the global block reservation.
3469 * the desired value is the sum of the space used by the extent tree,
3470 * the checksum tree and the root tree
3471 */
3472static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3473{
3474 struct btrfs_space_info *sinfo;
3475 u64 num_bytes;
3476 u64 meta_used;
3477 u64 data_used;
3478 int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
3479#if 0
3480 /*
3481 * per tree used space accounting can be inaccurate, so we
3482 * can't rely on it.
3483 */
3484 spin_lock(&fs_info->extent_root->accounting_lock);
3485 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
3486 spin_unlock(&fs_info->extent_root->accounting_lock);
3487
3488 spin_lock(&fs_info->csum_root->accounting_lock);
3489 num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
3490 spin_unlock(&fs_info->csum_root->accounting_lock);
3491
3492 spin_lock(&fs_info->tree_root->accounting_lock);
3493 num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
3494 spin_unlock(&fs_info->tree_root->accounting_lock);
3495#endif
3496 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3497 spin_lock(&sinfo->lock);
3498 data_used = sinfo->bytes_used;
3499 spin_unlock(&sinfo->lock);
3500
3501 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3502 spin_lock(&sinfo->lock);
3503 meta_used = sinfo->bytes_used;
3504 spin_unlock(&sinfo->lock);
3505
3506 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
3507 csum_size * 2;
3508 num_bytes += div64_u64(data_used + meta_used, 50);
3509
3510 if (num_bytes * 3 > meta_used)
3511 num_bytes = div64_u64(meta_used, 3);
3512
3513 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
3514}
3515
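Reading the formula out of calc_global_metadata_size(): two copies of a checksum for every data block, plus 2% of all used space, capped at one third of the metadata in use and rounded up to leafsize << 10. A standalone sketch with assumed parameters (4 KiB blocks, 4-byte crc32c checksums, 4 KiB leaves):

#include <stdint.h>
#include <stdio.h>

#define TOY_ALIGN(x, a) (((x) + (a) - 1) / (a) * (a))

static uint64_t global_rsv_size(uint64_t data_used, uint64_t meta_used)
{
        int blocksize_bits = 12, csum_size = 4;
        uint64_t leafsize = 4096;
        uint64_t num_bytes;

        /* two copies of every data block's checksum ... */
        num_bytes = (data_used >> blocksize_bits) * csum_size * 2;
        /* ... plus 2% of everything that is already used */
        num_bytes += (data_used + meta_used) / 50;

        /* never ask for more than a third of the metadata in use */
        if (num_bytes * 3 > meta_used)
                num_bytes = meta_used / 3;

        return TOY_ALIGN(num_bytes, leafsize << 10);
}

int main(void)
{
        uint64_t gib = 1024ULL * 1024 * 1024;
        printf("%llu MiB\n",
               (unsigned long long)(global_rsv_size(100 * gib, 4 * gib) >> 20));
        return 0;
}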
3516static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3517{
3518 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3519 struct btrfs_space_info *sinfo = block_rsv->space_info;
3520 u64 num_bytes;
3521
3522 num_bytes = calc_global_metadata_size(fs_info);
3523
3524 spin_lock(&block_rsv->lock);
3525 spin_lock(&sinfo->lock);
3526
3527 block_rsv->size = num_bytes;
3528
3529 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3530 sinfo->bytes_reserved + sinfo->bytes_readonly;
3531
3532 if (sinfo->total_bytes > num_bytes) {
3533 num_bytes = sinfo->total_bytes - num_bytes;
3534 block_rsv->reserved += num_bytes;
3535 sinfo->bytes_reserved += num_bytes;
3536 }
3537
3538 if (block_rsv->reserved >= block_rsv->size) {
3539 num_bytes = block_rsv->reserved - block_rsv->size;
3540 sinfo->bytes_reserved -= num_bytes;
3541 block_rsv->reserved = block_rsv->size;
3542 block_rsv->full = 1;
3543 }
3544#if 0
3545 printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
3546 block_rsv->size, block_rsv->reserved);
3547#endif
3548 spin_unlock(&sinfo->lock);
3549 spin_unlock(&block_rsv->lock);
3550}
3551
3552static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3553{
3554 struct btrfs_space_info *space_info;
3555
3556 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3557 fs_info->chunk_block_rsv.space_info = space_info;
3558 fs_info->chunk_block_rsv.priority = 10;
3559
3560 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3561 fs_info->global_block_rsv.space_info = space_info;
3562 fs_info->global_block_rsv.priority = 10;
3563 fs_info->global_block_rsv.refill_used = 1;
3564 fs_info->delalloc_block_rsv.space_info = space_info;
3565 fs_info->trans_block_rsv.space_info = space_info;
3566 fs_info->empty_block_rsv.space_info = space_info;
3567 fs_info->empty_block_rsv.priority = 10;
3568
3569 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3570 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
3571 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
3572 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3573 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3574
3575 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3576
3577 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3578
3579 update_global_block_rsv(fs_info);
3580}
3581
3582static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3583{
3584 block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
3585 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
3586 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
3587 WARN_ON(fs_info->trans_block_rsv.size > 0);
3588 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3589 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3590 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3591}
3592
3593static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
3594{
3595 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
3596 3 * num_items;
3597}
3598
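calc_trans_metadata_size() charges a full tree path, one leaf plus one node per remaining level, times a factor of three, per item. With 4 KiB leaves and nodes and BTRFS_MAX_LEVEL = 8, that is (4096 + 4096 * 7) * 3 = 96 KiB per item, so the four items reserved for orphan handling below come to 384 KiB. As a sketch (the level constant is assumed):

#include <stdint.h>
#include <stdio.h>

#define TOY_MAX_LEVEL 8

static uint64_t trans_metadata_size(uint64_t leafsize, uint64_t nodesize,
                                    int num_items)
{
        /* a full path (leaf + one node per upper level), x3, per item */
        return (leafsize + nodesize * (TOY_MAX_LEVEL - 1)) * 3 * num_items;
}

int main(void)
{
        printf("%llu KiB\n",
               (unsigned long long)(trans_metadata_size(4096, 4096, 4) >> 10));
        return 0;   /* 384 KiB for the 4 items of the orphan reservation */
}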
3599int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3600 struct btrfs_root *root,
3601 int num_items, int *retries)
3602{
3603 u64 num_bytes;
3604 int ret;
3605
3606 if (num_items == 0 || root->fs_info->chunk_root == root)
3607 return 0;
3608
3609 num_bytes = calc_trans_metadata_size(root, num_items);
3610 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3611 num_bytes, retries);
3612 if (!ret) {
3613 trans->bytes_reserved += num_bytes;
3614 trans->block_rsv = &root->fs_info->trans_block_rsv;
3615 }
3616 return ret;
3617}
3618
3619void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3620 struct btrfs_root *root)
3621{
3622 if (!trans->bytes_reserved)
3623 return;
3624
3625 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
3626 btrfs_block_rsv_release(root, trans->block_rsv,
3627 trans->bytes_reserved);
3628 trans->bytes_reserved = 0;
3629}
3630
3631int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
3632 struct inode *inode)
3633{
3634 struct btrfs_root *root = BTRFS_I(inode)->root;
3635 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3636 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
3637
3638 /*
3639 * one for deleting the orphan item, one for updating the inode and
3640 * two for calling btrfs_truncate_inode_items.
3641 *
3642 * btrfs_truncate_inode_items is a delete operation; it frees
3643 * more space than it uses in most cases. So two units of
3644 * metadata space should be enough for calling it many times.
3645 * If all of the metadata space is used, we can commit the
3646 * transaction and use the space it freed.
3647 */
3648 u64 num_bytes = calc_trans_metadata_size(root, 4);
3649 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3650}
3651
3652void btrfs_orphan_release_metadata(struct inode *inode)
3653{
3654 struct btrfs_root *root = BTRFS_I(inode)->root;
3655 u64 num_bytes = calc_trans_metadata_size(root, 4);
3656 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
3657}
3658
3659int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3660 struct btrfs_pending_snapshot *pending)
3661{
3662 struct btrfs_root *root = pending->root;
3663 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3664 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
3665 /*
3666 * two for root back/forward refs, two for directory entries
3667 * and one for the root of the snapshot.
3668 */
3669 u64 num_bytes = calc_trans_metadata_size(root, 5);
3670 dst_rsv->space_info = src_rsv->space_info;
3671 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3672}
3673
3674static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
3675{
3676 return num_bytes >>= 3;
3677}
3678
3679int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3680{
3681 struct btrfs_root *root = BTRFS_I(inode)->root;
3682 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3683 u64 to_reserve;
3684 int nr_extents;
3685 int retries = 0;
3686 int ret;
3687
3688 if (btrfs_transaction_in_commit(root->fs_info))
3689 schedule_timeout(1);
3690
3691 num_bytes = ALIGN(num_bytes, root->sectorsize);
3692again:
3693 spin_lock(&BTRFS_I(inode)->accounting_lock);
3694 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
3695 if (nr_extents > BTRFS_I(inode)->reserved_extents) {
3696 nr_extents -= BTRFS_I(inode)->reserved_extents;
3697 to_reserve = calc_trans_metadata_size(root, nr_extents);
3698 } else {
3699 nr_extents = 0;
3700 to_reserve = 0;
3701 }
3702
3703 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3704 ret = reserve_metadata_bytes(block_rsv, to_reserve);
3705 if (ret) {
3706 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3707 ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
3708 &retries);
3709 if (ret > 0)
3710 goto again;
3711 return ret;
3712 }
3713
3714 BTRFS_I(inode)->reserved_extents += nr_extents;
3715 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3716 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3717
3718 block_rsv_add_bytes(block_rsv, to_reserve, 1);
3719
3720 if (block_rsv->size > 512 * 1024 * 1024)
3721 shrink_delalloc(NULL, root, to_reserve);
3722
3723 return 0;
3724}
3725
3726void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
3727{
3728 struct btrfs_root *root = BTRFS_I(inode)->root;
3729 u64 to_free;
3730 int nr_extents;
3731
3732 num_bytes = ALIGN(num_bytes, root->sectorsize);
3733 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
3734
3735 spin_lock(&BTRFS_I(inode)->accounting_lock);
3736 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
3737 if (nr_extents < BTRFS_I(inode)->reserved_extents) {
3738 nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
3739 BTRFS_I(inode)->reserved_extents -= nr_extents;
3740 } else {
3741 nr_extents = 0;
3742 }
3743 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3744
3745 to_free = calc_csum_metadata_size(inode, num_bytes);
3746 if (nr_extents > 0)
3747 to_free += calc_trans_metadata_size(root, nr_extents);
3748
3749 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
3750 to_free);
3751}
3752
3753int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
3754{
3755 int ret;
3756
3757 ret = btrfs_check_data_free_space(inode, num_bytes);
3758 if (ret)
3759 return ret;
3760
3761 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
3762 if (ret) {
3763 btrfs_free_reserved_data_space(inode, num_bytes);
3764 return ret;
3765 }
3766
3767 return 0;
3768}
3769
3770void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
3771{
3772 btrfs_delalloc_release_metadata(inode, num_bytes);
3773 btrfs_free_reserved_data_space(inode, num_bytes);
3774}
3775
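btrfs_delalloc_reserve_space() and its release counterpart make the data and metadata reservations an all-or-nothing pair: data first, then metadata, undoing the data half when the metadata half fails. The metadata side is sized at roughly 1/8 of the data bytes for checksums (calc_csum_metadata_size() above is num_bytes >> 3) plus per-extent overhead. The same acquire-both-or-neither shape in a generic sketch:

#include <stdio.h>

static long data_pool = 1000, meta_pool = 50;

static int take(long *pool, long n) { if (*pool < n) return -1; *pool -= n; return 0; }
static void put(long *pool, long n) { *pool += n; }

/* acquire both or neither: undo the first if the second fails */
static int reserve_space(long bytes)
{
        if (take(&data_pool, bytes))
                return -1;
        if (take(&meta_pool, bytes / 8)) {    /* ~1/8 extra for csums */
                put(&data_pool, bytes);
                return -1;
        }
        return 0;
}

static void release_space(long bytes)
{
        put(&meta_pool, bytes / 8);
        put(&data_pool, bytes);
}

int main(void)
{
        printf("first: %d\n", reserve_space(300));   /* ok: 37 meta left over */
        printf("second: %d\n", reserve_space(300));  /* meta exhausted: rolled back */
        release_space(300);
        printf("data=%ld meta=%ld\n", data_pool, meta_pool);
        return 0;
}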
3464static int update_block_group(struct btrfs_trans_handle *trans, 3776static int update_block_group(struct btrfs_trans_handle *trans,
3465 struct btrfs_root *root, 3777 struct btrfs_root *root,
3466 u64 bytenr, u64 num_bytes, int alloc, 3778 u64 bytenr, u64 num_bytes, int alloc)
3467 int mark_free)
3468{ 3779{
3469 struct btrfs_block_group_cache *cache; 3780 struct btrfs_block_group_cache *cache;
3470 struct btrfs_fs_info *info = root->fs_info; 3781 struct btrfs_fs_info *info = root->fs_info;
3782 int factor;
3471 u64 total = num_bytes; 3783 u64 total = num_bytes;
3472 u64 old_val; 3784 u64 old_val;
3473 u64 byte_in_group; 3785 u64 byte_in_group;
@@ -3486,6 +3798,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3486 cache = btrfs_lookup_block_group(info, bytenr); 3798 cache = btrfs_lookup_block_group(info, bytenr);
3487 if (!cache) 3799 if (!cache)
3488 return -1; 3800 return -1;
3801 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
3802 BTRFS_BLOCK_GROUP_RAID1 |
3803 BTRFS_BLOCK_GROUP_RAID10))
3804 factor = 2;
3805 else
3806 factor = 1;
3489 byte_in_group = bytenr - cache->key.objectid; 3807 byte_in_group = bytenr - cache->key.objectid;
3490 WARN_ON(byte_in_group > cache->key.offset); 3808 WARN_ON(byte_in_group > cache->key.offset);
3491 3809
@@ -3498,31 +3816,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3498 old_val += num_bytes; 3816 old_val += num_bytes;
3499 btrfs_set_block_group_used(&cache->item, old_val); 3817 btrfs_set_block_group_used(&cache->item, old_val);
3500 cache->reserved -= num_bytes; 3818 cache->reserved -= num_bytes;
3501 cache->space_info->bytes_used += num_bytes;
3502 cache->space_info->bytes_reserved -= num_bytes; 3819 cache->space_info->bytes_reserved -= num_bytes;
3503 if (cache->ro) 3820 cache->space_info->bytes_used += num_bytes;
3504 cache->space_info->bytes_readonly -= num_bytes; 3821 cache->space_info->disk_used += num_bytes * factor;
3505 spin_unlock(&cache->lock); 3822 spin_unlock(&cache->lock);
3506 spin_unlock(&cache->space_info->lock); 3823 spin_unlock(&cache->space_info->lock);
3507 } else { 3824 } else {
3508 old_val -= num_bytes; 3825 old_val -= num_bytes;
3509 cache->space_info->bytes_used -= num_bytes;
3510 if (cache->ro)
3511 cache->space_info->bytes_readonly += num_bytes;
3512 btrfs_set_block_group_used(&cache->item, old_val); 3826 btrfs_set_block_group_used(&cache->item, old_val);
3827 cache->pinned += num_bytes;
3828 cache->space_info->bytes_pinned += num_bytes;
3829 cache->space_info->bytes_used -= num_bytes;
3830 cache->space_info->disk_used -= num_bytes * factor;
3513 spin_unlock(&cache->lock); 3831 spin_unlock(&cache->lock);
3514 spin_unlock(&cache->space_info->lock); 3832 spin_unlock(&cache->space_info->lock);
3515 if (mark_free) {
3516 int ret;
3517
3518 ret = btrfs_discard_extent(root, bytenr,
3519 num_bytes);
3520 WARN_ON(ret);
3521 3833
3522 ret = btrfs_add_free_space(cache, bytenr, 3834 set_extent_dirty(info->pinned_extents,
3523 num_bytes); 3835 bytenr, bytenr + num_bytes - 1,
3524 WARN_ON(ret); 3836 GFP_NOFS | __GFP_NOFAIL);
3525 }
3526 } 3837 }
3527 btrfs_put_block_group(cache); 3838 btrfs_put_block_group(cache);
3528 total -= num_bytes; 3839 total -= num_bytes;
@@ -3546,18 +3857,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3546 return bytenr; 3857 return bytenr;
3547} 3858}
3548 3859
3549/* 3860static int pin_down_extent(struct btrfs_root *root,
3550 * this function must be called within transaction 3861 struct btrfs_block_group_cache *cache,
3551 */ 3862 u64 bytenr, u64 num_bytes, int reserved)
3552int btrfs_pin_extent(struct btrfs_root *root,
3553 u64 bytenr, u64 num_bytes, int reserved)
3554{ 3863{
3555 struct btrfs_fs_info *fs_info = root->fs_info;
3556 struct btrfs_block_group_cache *cache;
3557
3558 cache = btrfs_lookup_block_group(fs_info, bytenr);
3559 BUG_ON(!cache);
3560
3561 spin_lock(&cache->space_info->lock); 3864 spin_lock(&cache->space_info->lock);
3562 spin_lock(&cache->lock); 3865 spin_lock(&cache->lock);
3563 cache->pinned += num_bytes; 3866 cache->pinned += num_bytes;
@@ -3569,28 +3872,68 @@ int btrfs_pin_extent(struct btrfs_root *root,
3569 spin_unlock(&cache->lock); 3872 spin_unlock(&cache->lock);
3570 spin_unlock(&cache->space_info->lock); 3873 spin_unlock(&cache->space_info->lock);
3571 3874
3572 btrfs_put_block_group(cache); 3875 set_extent_dirty(root->fs_info->pinned_extents, bytenr,
3876 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
3877 return 0;
3878}
3573 3879
3574 set_extent_dirty(fs_info->pinned_extents, 3880/*
3575 bytenr, bytenr + num_bytes - 1, GFP_NOFS); 3881 * this function must be called within transaction
3882 */
3883int btrfs_pin_extent(struct btrfs_root *root,
3884 u64 bytenr, u64 num_bytes, int reserved)
3885{
3886 struct btrfs_block_group_cache *cache;
3887
3888 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
3889 BUG_ON(!cache);
3890
3891 pin_down_extent(root, cache, bytenr, num_bytes, reserved);
3892
3893 btrfs_put_block_group(cache);
3576 return 0; 3894 return 0;
3577} 3895}
3578 3896
3579static int update_reserved_extents(struct btrfs_block_group_cache *cache, 3897/*
3580 u64 num_bytes, int reserve) 3898 * update size of reserved extents. this function may return -EAGAIN
3899 * when the block group is read-only, which can only happen if 'reserve' is true or 'sinfo' is false.
3900 */
3901static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
3902 u64 num_bytes, int reserve, int sinfo)
3581{ 3903{
3582 spin_lock(&cache->space_info->lock); 3904 int ret = 0;
3583 spin_lock(&cache->lock); 3905 if (sinfo) {
3584 if (reserve) { 3906 struct btrfs_space_info *space_info = cache->space_info;
3585 cache->reserved += num_bytes; 3907 spin_lock(&space_info->lock);
3586 cache->space_info->bytes_reserved += num_bytes; 3908 spin_lock(&cache->lock);
3909 if (reserve) {
3910 if (cache->ro) {
3911 ret = -EAGAIN;
3912 } else {
3913 cache->reserved += num_bytes;
3914 space_info->bytes_reserved += num_bytes;
3915 }
3916 } else {
3917 if (cache->ro)
3918 space_info->bytes_readonly += num_bytes;
3919 cache->reserved -= num_bytes;
3920 space_info->bytes_reserved -= num_bytes;
3921 }
3922 spin_unlock(&cache->lock);
3923 spin_unlock(&space_info->lock);
3587 } else { 3924 } else {
3588 cache->reserved -= num_bytes; 3925 spin_lock(&cache->lock);
3589 cache->space_info->bytes_reserved -= num_bytes; 3926 if (cache->ro) {
3927 ret = -EAGAIN;
3928 } else {
3929 if (reserve)
3930 cache->reserved += num_bytes;
3931 else
3932 cache->reserved -= num_bytes;
3933 }
3934 spin_unlock(&cache->lock);
3590 } 3935 }
3591 spin_unlock(&cache->lock); 3936 return ret;
3592 spin_unlock(&cache->space_info->lock);
3593 return 0;
3594} 3937}
3595 3938
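
The reserve/release rules in update_reserved_bytes() compress to a small
state machine: reserving from a read-only group fails with -EAGAIN, and
releasing into one routes the bytes to bytes_readonly. A user-space
miniature of the sinfo path, with simplified fields rather than the
kernel's struct:

	#include <assert.h>
	#include <errno.h>
	#include <stdint.h>
	#include <stdio.h>

	struct group {
		int ro;			/* block group has been set read-only */
		uint64_t reserved;
		uint64_t bytes_readonly;
	};

	static int update_reserved(struct group *g, uint64_t n, int reserve)
	{
		if (reserve) {
			if (g->ro)
				return -EAGAIN;	/* caller falls back elsewhere */
			g->reserved += n;
		} else {
			if (g->ro)
				g->bytes_readonly += n;	/* space stays unusable */
			assert(g->reserved >= n);
			g->reserved -= n;
		}
		return 0;
	}

	int main(void)
	{
		struct group g = { .ro = 0 };

		update_reserved(&g, 4096, 1);	/* reserve while writable: ok */
		g.ro = 1;			/* group flips to read-only */
		printf("reserve on ro group -> %d\n",
		       update_reserved(&g, 4096, 1));
		return 0;
	}
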
3596int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 3939int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@ -3621,6 +3964,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3621 fs_info->pinned_extents = &fs_info->freed_extents[0]; 3964 fs_info->pinned_extents = &fs_info->freed_extents[0];
3622 3965
3623 up_write(&fs_info->extent_commit_sem); 3966 up_write(&fs_info->extent_commit_sem);
3967
3968 update_global_block_rsv(fs_info);
3624 return 0; 3969 return 0;
3625} 3970}
3626 3971
@@ -3647,14 +3992,21 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3647 btrfs_add_free_space(cache, start, len); 3992 btrfs_add_free_space(cache, start, len);
3648 } 3993 }
3649 3994
3995 start += len;
3996
3650 spin_lock(&cache->space_info->lock); 3997 spin_lock(&cache->space_info->lock);
3651 spin_lock(&cache->lock); 3998 spin_lock(&cache->lock);
3652 cache->pinned -= len; 3999 cache->pinned -= len;
3653 cache->space_info->bytes_pinned -= len; 4000 cache->space_info->bytes_pinned -= len;
4001 if (cache->ro) {
4002 cache->space_info->bytes_readonly += len;
4003 } else if (cache->reserved_pinned > 0) {
4004 len = min(len, cache->reserved_pinned);
4005 cache->reserved_pinned -= len;
4006 cache->space_info->bytes_reserved += len;
4007 }
3654 spin_unlock(&cache->lock); 4008 spin_unlock(&cache->lock);
3655 spin_unlock(&cache->space_info->lock); 4009 spin_unlock(&cache->space_info->lock);
3656
3657 start += len;
3658 } 4010 }
3659 4011
3660 if (cache) 4012 if (cache)
@@ -3667,8 +4019,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3667{ 4019{
3668 struct btrfs_fs_info *fs_info = root->fs_info; 4020 struct btrfs_fs_info *fs_info = root->fs_info;
3669 struct extent_io_tree *unpin; 4021 struct extent_io_tree *unpin;
4022 struct btrfs_block_rsv *block_rsv;
4023 struct btrfs_block_rsv *next_rsv;
3670 u64 start; 4024 u64 start;
3671 u64 end; 4025 u64 end;
4026 int idx;
3672 int ret; 4027 int ret;
3673 4028
3674 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4029 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -3689,59 +4044,30 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3689 cond_resched(); 4044 cond_resched();
3690 } 4045 }
3691 4046
3692 return ret; 4047 mutex_lock(&fs_info->durable_block_rsv_mutex);
3693} 4048 list_for_each_entry_safe(block_rsv, next_rsv,
3694 4049 &fs_info->durable_block_rsv_list, list) {
3695static int pin_down_bytes(struct btrfs_trans_handle *trans,
3696 struct btrfs_root *root,
3697 struct btrfs_path *path,
3698 u64 bytenr, u64 num_bytes,
3699 int is_data, int reserved,
3700 struct extent_buffer **must_clean)
3701{
3702 int err = 0;
3703 struct extent_buffer *buf;
3704 4050
3705 if (is_data) 4051 idx = trans->transid & 0x1;
3706 goto pinit; 4052 if (block_rsv->freed[idx] > 0) {
3707 4053 block_rsv_add_bytes(block_rsv,
3708 /* 4054 block_rsv->freed[idx], 0);
3709 * discard is sloooow, and so triggering discards on 4055 block_rsv->freed[idx] = 0;
3710 * individual btree blocks isn't a good plan. Just 4056 }
3711 * pin everything in discard mode. 4057 if (atomic_read(&block_rsv->usage) == 0) {
3712 */ 4058 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
3713 if (btrfs_test_opt(root, DISCARD))
3714 goto pinit;
3715
3716 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
3717 if (!buf)
3718 goto pinit;
3719 4059
3720 /* we can reuse a block if it hasn't been written 4060 if (block_rsv->freed[0] == 0 &&
3721 * and it is from this transaction. We can't 4061 block_rsv->freed[1] == 0) {
3722 * reuse anything from the tree log root because 4062 list_del_init(&block_rsv->list);
3723 * it has tiny sub-transactions. 4063 kfree(block_rsv);
3724 */ 4064 }
3725 if (btrfs_buffer_uptodate(buf, 0) && 4065 } else {
3726 btrfs_try_tree_lock(buf)) { 4066 btrfs_block_rsv_release(root, block_rsv, 0);
3727 u64 header_owner = btrfs_header_owner(buf);
3728 u64 header_transid = btrfs_header_generation(buf);
3729 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
3730 header_transid == trans->transid &&
3731 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3732 *must_clean = buf;
3733 return 1;
3734 } 4067 }
3735 btrfs_tree_unlock(buf);
3736 } 4068 }
3737 free_extent_buffer(buf); 4069 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3738pinit:
3739 if (path)
3740 btrfs_set_path_blocking(path);
3741 /* unlocks the pinned mutex */
3742 btrfs_pin_extent(root, bytenr, num_bytes, reserved);
3743 4070
3744 BUG_ON(err < 0);
3745 return 0; 4071 return 0;
3746} 4072}
3747 4073
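
The freed[idx] handling above relies on a two-slot double buffer keyed by
transaction parity: bytes freed during transaction N accumulate in
freed[N & 0x1] and only rejoin the reservation when that transaction
commits. The same pattern in isolation (names are illustrative):

	#include <stdint.h>
	#include <stdio.h>

	struct rsv {
		uint64_t reserved;
		uint64_t freed[2];	/* indexed by transid & 0x1 */
	};

	static void free_block(struct rsv *r, uint64_t transid, uint64_t len)
	{
		r->freed[transid & 0x1] += len;	/* defer until commit */
	}

	static void commit(struct rsv *r, uint64_t transid)
	{
		int idx = transid & 0x1;

		r->reserved += r->freed[idx];	/* space is reusable now */
		r->freed[idx] = 0;
	}

	int main(void)
	{
		struct rsv r = {0};

		free_block(&r, 7, 4096);	/* freed in transaction 7 */
		commit(&r, 7);
		printf("reserved=%llu\n", (unsigned long long)r.reserved);
		return 0;
	}
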
@@ -3902,9 +4228,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3902 BUG_ON(ret); 4228 BUG_ON(ret);
3903 } 4229 }
3904 } else { 4230 } else {
3905 int mark_free = 0;
3906 struct extent_buffer *must_clean = NULL;
3907
3908 if (found_extent) { 4231 if (found_extent) {
3909 BUG_ON(is_data && refs_to_drop != 4232 BUG_ON(is_data && refs_to_drop !=
3910 extent_data_ref_count(root, path, iref)); 4233 extent_data_ref_count(root, path, iref));
@@ -3917,31 +4240,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3917 } 4240 }
3918 } 4241 }
3919 4242
3920 ret = pin_down_bytes(trans, root, path, bytenr,
3921 num_bytes, is_data, 0, &must_clean);
3922 if (ret > 0)
3923 mark_free = 1;
3924 BUG_ON(ret < 0);
3925 /*
3926 * it is going to be very rare for someone to be waiting
3927 * on the block we're freeing. del_items might need to
3928 * schedule, so rather than get fancy, just force it
3929 * to blocking here
3930 */
3931 if (must_clean)
3932 btrfs_set_lock_blocking(must_clean);
3933
3934 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 4243 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3935 num_to_del); 4244 num_to_del);
3936 BUG_ON(ret); 4245 BUG_ON(ret);
3937 btrfs_release_path(extent_root, path); 4246 btrfs_release_path(extent_root, path);
3938 4247
3939 if (must_clean) {
3940 clean_tree_block(NULL, root, must_clean);
3941 btrfs_tree_unlock(must_clean);
3942 free_extent_buffer(must_clean);
3943 }
3944
3945 if (is_data) { 4248 if (is_data) {
3946 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 4249 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
3947 BUG_ON(ret); 4250 BUG_ON(ret);
@@ -3951,8 +4254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3951 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); 4254 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
3952 } 4255 }
3953 4256
3954 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 4257 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
3955 mark_free);
3956 BUG_ON(ret); 4258 BUG_ON(ret);
3957 } 4259 }
3958 btrfs_free_path(path); 4260 btrfs_free_path(path);
@@ -3960,7 +4262,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3960} 4262}
3961 4263
3962/* 4264/*
3963 * when we free an extent, it is possible (and likely) that we free the last 4265 * when we free a block, it is possible (and likely) that we free the last
3964 * delayed ref for that extent as well. This searches the delayed ref tree for 4266 * delayed ref for that extent as well. This searches the delayed ref tree for
3965 * a given extent, and if there are no other delayed refs to be processed, it 4267 * a given extent, and if there are no other delayed refs to be processed, it
3966 * removes it from the tree. 4268 * removes it from the tree.
@@ -3972,7 +4274,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3972 struct btrfs_delayed_ref_root *delayed_refs; 4274 struct btrfs_delayed_ref_root *delayed_refs;
3973 struct btrfs_delayed_ref_node *ref; 4275 struct btrfs_delayed_ref_node *ref;
3974 struct rb_node *node; 4276 struct rb_node *node;
3975 int ret; 4277 int ret = 0;
3976 4278
3977 delayed_refs = &trans->transaction->delayed_refs; 4279 delayed_refs = &trans->transaction->delayed_refs;
3978 spin_lock(&delayed_refs->lock); 4280 spin_lock(&delayed_refs->lock);
@@ -4024,17 +4326,100 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4024 list_del_init(&head->cluster); 4326 list_del_init(&head->cluster);
4025 spin_unlock(&delayed_refs->lock); 4327 spin_unlock(&delayed_refs->lock);
4026 4328
4027 ret = run_one_delayed_ref(trans, root->fs_info->tree_root, 4329 BUG_ON(head->extent_op);
4028 &head->node, head->extent_op, 4330 if (head->must_insert_reserved)
4029 head->must_insert_reserved); 4331 ret = 1;
4030 BUG_ON(ret); 4332
4333 mutex_unlock(&head->mutex);
4031 btrfs_put_delayed_ref(&head->node); 4334 btrfs_put_delayed_ref(&head->node);
4032 return 0; 4335 return ret;
4033out: 4336out:
4034 spin_unlock(&delayed_refs->lock); 4337 spin_unlock(&delayed_refs->lock);
4035 return 0; 4338 return 0;
4036} 4339}
4037 4340
4341void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4342 struct btrfs_root *root,
4343 struct extent_buffer *buf,
4344 u64 parent, int last_ref)
4345{
4346 struct btrfs_block_rsv *block_rsv;
4347 struct btrfs_block_group_cache *cache = NULL;
4348 int ret;
4349
4350 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4351 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
4352 parent, root->root_key.objectid,
4353 btrfs_header_level(buf),
4354 BTRFS_DROP_DELAYED_REF, NULL);
4355 BUG_ON(ret);
4356 }
4357
4358 if (!last_ref)
4359 return;
4360
4361 block_rsv = get_block_rsv(trans, root);
4362 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4363 if (block_rsv->space_info != cache->space_info)
4364 goto out;
4365
4366 if (btrfs_header_generation(buf) == trans->transid) {
4367 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4368 ret = check_ref_cleanup(trans, root, buf->start);
4369 if (!ret)
4370 goto pin;
4371 }
4372
4373 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4374 pin_down_extent(root, cache, buf->start, buf->len, 1);
4375 goto pin;
4376 }
4377
4378 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4379
4380 btrfs_add_free_space(cache, buf->start, buf->len);
4381 ret = update_reserved_bytes(cache, buf->len, 0, 0);
4382 if (ret == -EAGAIN) {
4383 /* block group became read-only */
4384 update_reserved_bytes(cache, buf->len, 0, 1);
4385 goto out;
4386 }
4387
4388 ret = 1;
4389 spin_lock(&block_rsv->lock);
4390 if (block_rsv->reserved < block_rsv->size) {
4391 block_rsv->reserved += buf->len;
4392 ret = 0;
4393 }
4394 spin_unlock(&block_rsv->lock);
4395
4396 if (ret) {
4397 spin_lock(&cache->space_info->lock);
4398 cache->space_info->bytes_reserved -= buf->len;
4399 spin_unlock(&cache->space_info->lock);
4400 }
4401 goto out;
4402 }
4403pin:
4404 if (block_rsv->durable && !cache->ro) {
4405 ret = 0;
4406 spin_lock(&cache->lock);
4407 if (!cache->ro) {
4408 cache->reserved_pinned += buf->len;
4409 ret = 1;
4410 }
4411 spin_unlock(&cache->lock);
4412
4413 if (ret) {
4414 spin_lock(&block_rsv->lock);
4415 block_rsv->freed[trans->transid & 0x1] += buf->len;
4416 spin_unlock(&block_rsv->lock);
4417 }
4418 }
4419out:
4420 btrfs_put_block_group(cache);
4421}
4422
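
btrfs_free_tree_block() above picks between immediate reuse and pinning
until commit. Reduced to a pure decision function — a user-space toy
where the enum and parameter names are illustrative, and the real code
additionally consults the block reservation:

	#include <stdio.h>

	enum disposition { REUSE_NOW, PIN_UNTIL_COMMIT };

	static enum disposition classify(int same_transid, int written,
					 int has_other_refs)
	{
		if (!same_transid)	/* older block: must survive until commit */
			return PIN_UNTIL_COMMIT;
		if (has_other_refs)	/* delayed refs still outstanding */
			return PIN_UNTIL_COMMIT;
		if (written)		/* already on disk this transaction */
			return PIN_UNTIL_COMMIT;
		return REUSE_NOW;	/* never written: space reusable at once */
	}

	int main(void)
	{
		printf("fresh unwritten block -> %s\n",
		       classify(1, 0, 0) == REUSE_NOW ? "reuse" : "pin");
		return 0;
	}
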
4038int btrfs_free_extent(struct btrfs_trans_handle *trans, 4423int btrfs_free_extent(struct btrfs_trans_handle *trans,
4039 struct btrfs_root *root, 4424 struct btrfs_root *root,
4040 u64 bytenr, u64 num_bytes, u64 parent, 4425 u64 bytenr, u64 num_bytes, u64 parent,
@@ -4056,8 +4441,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4056 parent, root_objectid, (int)owner, 4441 parent, root_objectid, (int)owner,
4057 BTRFS_DROP_DELAYED_REF, NULL); 4442 BTRFS_DROP_DELAYED_REF, NULL);
4058 BUG_ON(ret); 4443 BUG_ON(ret);
4059 ret = check_ref_cleanup(trans, root, bytenr);
4060 BUG_ON(ret);
4061 } else { 4444 } else {
4062 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 4445 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4063 parent, root_objectid, owner, 4446 parent, root_objectid, owner,
@@ -4067,21 +4450,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4067 return ret; 4450 return ret;
4068} 4451}
4069 4452
4070int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4071 struct btrfs_root *root,
4072 u64 bytenr, u32 blocksize,
4073 u64 parent, u64 root_objectid, int level)
4074{
4075 u64 used;
4076 spin_lock(&root->node_lock);
4077 used = btrfs_root_used(&root->root_item) - blocksize;
4078 btrfs_set_root_used(&root->root_item, used);
4079 spin_unlock(&root->node_lock);
4080
4081 return btrfs_free_extent(trans, root, bytenr, blocksize,
4082 parent, root_objectid, level, 0);
4083}
4084
4085static u64 stripe_align(struct btrfs_root *root, u64 val) 4453static u64 stripe_align(struct btrfs_root *root, u64 val)
4086{ 4454{
4087 u64 mask = ((u64)root->stripesize - 1); 4455 u64 mask = ((u64)root->stripesize - 1);
@@ -4134,6 +4502,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4134 return 0; 4502 return 0;
4135} 4503}
4136 4504
4505static int get_block_group_index(struct btrfs_block_group_cache *cache)
4506{
4507 int index;
4508 if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
4509 index = 0;
4510 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
4511 index = 1;
4512 else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
4513 index = 2;
4514 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
4515 index = 3;
4516 else
4517 index = 4;
4518 return index;
4519}
4520
4137enum btrfs_loop_type { 4521enum btrfs_loop_type {
4138 LOOP_FIND_IDEAL = 0, 4522 LOOP_FIND_IDEAL = 0,
4139 LOOP_CACHING_NOWAIT = 1, 4523 LOOP_CACHING_NOWAIT = 1,
@@ -4155,7 +4539,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4155 u64 num_bytes, u64 empty_size, 4539 u64 num_bytes, u64 empty_size,
4156 u64 search_start, u64 search_end, 4540 u64 search_start, u64 search_end,
4157 u64 hint_byte, struct btrfs_key *ins, 4541 u64 hint_byte, struct btrfs_key *ins,
4158 u64 exclude_start, u64 exclude_nr,
4159 int data) 4542 int data)
4160{ 4543{
4161 int ret = 0; 4544 int ret = 0;
@@ -4168,6 +4551,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4168 struct btrfs_space_info *space_info; 4551 struct btrfs_space_info *space_info;
4169 int last_ptr_loop = 0; 4552 int last_ptr_loop = 0;
4170 int loop = 0; 4553 int loop = 0;
4554 int index = 0;
4171 bool found_uncached_bg = false; 4555 bool found_uncached_bg = false;
4172 bool failed_cluster_refill = false; 4556 bool failed_cluster_refill = false;
4173 bool failed_alloc = false; 4557 bool failed_alloc = false;
@@ -4237,6 +4621,7 @@ ideal_cache:
4237 btrfs_put_block_group(block_group); 4621 btrfs_put_block_group(block_group);
4238 up_read(&space_info->groups_sem); 4622 up_read(&space_info->groups_sem);
4239 } else { 4623 } else {
4624 index = get_block_group_index(block_group);
4240 goto have_block_group; 4625 goto have_block_group;
4241 } 4626 }
4242 } else if (block_group) { 4627 } else if (block_group) {
@@ -4245,7 +4630,8 @@ ideal_cache:
4245 } 4630 }
4246search: 4631search:
4247 down_read(&space_info->groups_sem); 4632 down_read(&space_info->groups_sem);
4248 list_for_each_entry(block_group, &space_info->block_groups, list) { 4633 list_for_each_entry(block_group, &space_info->block_groups[index],
4634 list) {
4249 u64 offset; 4635 u64 offset;
4250 int cached; 4636 int cached;
4251 4637
@@ -4436,23 +4822,22 @@ checks:
4436 goto loop; 4822 goto loop;
4437 } 4823 }
4438 4824
4439 if (exclude_nr > 0 && 4825 ins->objectid = search_start;
4440 (search_start + num_bytes > exclude_start && 4826 ins->offset = num_bytes;
4441 search_start < exclude_start + exclude_nr)) { 4827
4442 search_start = exclude_start + exclude_nr; 4828 if (offset < search_start)
4829 btrfs_add_free_space(block_group, offset,
4830 search_start - offset);
4831 BUG_ON(offset > search_start);
4443 4832
4833 ret = update_reserved_bytes(block_group, num_bytes, 1,
4834 (data & BTRFS_BLOCK_GROUP_DATA));
4835 if (ret == -EAGAIN) {
4444 btrfs_add_free_space(block_group, offset, num_bytes); 4836 btrfs_add_free_space(block_group, offset, num_bytes);
4445 /*
4446 * if search_start is still in this block group
4447 * then we just re-search this block group
4448 */
4449 if (search_start >= block_group->key.objectid &&
4450 search_start < (block_group->key.objectid +
4451 block_group->key.offset))
4452 goto have_block_group;
4453 goto loop; 4837 goto loop;
4454 } 4838 }
4455 4839
4840 /* we are all good, let's return */
4456 ins->objectid = search_start; 4841 ins->objectid = search_start;
4457 ins->offset = num_bytes; 4842 ins->offset = num_bytes;
4458 4843
@@ -4460,18 +4845,18 @@ checks:
4460 btrfs_add_free_space(block_group, offset, 4845 btrfs_add_free_space(block_group, offset,
4461 search_start - offset); 4846 search_start - offset);
4462 BUG_ON(offset > search_start); 4847 BUG_ON(offset > search_start);
4463
4464 update_reserved_extents(block_group, num_bytes, 1);
4465
4466 /* we are all good, lets return */
4467 break; 4848 break;
4468loop: 4849loop:
4469 failed_cluster_refill = false; 4850 failed_cluster_refill = false;
4470 failed_alloc = false; 4851 failed_alloc = false;
4852 BUG_ON(index != get_block_group_index(block_group));
4471 btrfs_put_block_group(block_group); 4853 btrfs_put_block_group(block_group);
4472 } 4854 }
4473 up_read(&space_info->groups_sem); 4855 up_read(&space_info->groups_sem);
4474 4856
4857 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
4858 goto search;
4859
4475 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for 4860 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
4476 * them to make caching progress. Also 4861 * them to make caching progress. Also
4477 * determine the best possible bg to cache 4862 * determine the best possible bg to cache
@@ -4485,6 +4870,7 @@ loop:
4485 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 4870 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
4486 (found_uncached_bg || empty_size || empty_cluster || 4871 (found_uncached_bg || empty_size || empty_cluster ||
4487 allowed_chunk_alloc)) { 4872 allowed_chunk_alloc)) {
4873 index = 0;
4488 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 4874 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4489 found_uncached_bg = false; 4875 found_uncached_bg = false;
4490 loop++; 4876 loop++;
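
With block_groups[] now split per raid type, the allocator retries the
whole search at the next index when the current list yields nothing;
get_block_group_index() fixes the order RAID10, RAID1, DUP, RAID0,
single. A standalone sketch of that fallback loop, with made-up helpers
standing in for the real search:

	#include <stdio.h>

	#define NR_RAID_TYPES 5

	/* pretend only the "single" list (index 4) has a usable group */
	static int try_alloc_from(int index)
	{
		return index == 4;
	}

	static int find_free_extent_index(void)
	{
		int index = 0;

		while (index < NR_RAID_TYPES) {
			if (try_alloc_from(index))
				return index;	/* ins->objectid set in the kernel */
			index++;		/* fall back to the next raid type */
		}
		return -1;
	}

	int main(void)
	{
		printf("allocated from list %d\n", find_free_extent_index());
		return 0;
	}
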
@@ -4567,31 +4953,30 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4567 int dump_block_groups) 4953 int dump_block_groups)
4568{ 4954{
4569 struct btrfs_block_group_cache *cache; 4955 struct btrfs_block_group_cache *cache;
4956 int index = 0;
4570 4957
4571 spin_lock(&info->lock); 4958 spin_lock(&info->lock);
4572 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4959 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4573 (unsigned long long)(info->total_bytes - info->bytes_used - 4960 (unsigned long long)(info->total_bytes - info->bytes_used -
4574 info->bytes_pinned - info->bytes_reserved - 4961 info->bytes_pinned - info->bytes_reserved -
4575 info->bytes_super), 4962 info->bytes_readonly),
4576 (info->full) ? "" : "not "); 4963 (info->full) ? "" : "not ");
4577 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4964 printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
4578 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" 4965 "reserved=%llu, may_use=%llu, readonly=%llu\n",
4579 "\n",
4580 (unsigned long long)info->total_bytes, 4966 (unsigned long long)info->total_bytes,
4967 (unsigned long long)info->bytes_used,
4581 (unsigned long long)info->bytes_pinned, 4968 (unsigned long long)info->bytes_pinned,
4582 (unsigned long long)info->bytes_delalloc, 4969 (unsigned long long)info->bytes_reserved,
4583 (unsigned long long)info->bytes_may_use, 4970 (unsigned long long)info->bytes_may_use,
4584 (unsigned long long)info->bytes_used, 4971 (unsigned long long)info->bytes_readonly);
4585 (unsigned long long)info->bytes_root,
4586 (unsigned long long)info->bytes_super,
4587 (unsigned long long)info->bytes_reserved);
4588 spin_unlock(&info->lock); 4972 spin_unlock(&info->lock);
4589 4973
4590 if (!dump_block_groups) 4974 if (!dump_block_groups)
4591 return; 4975 return;
4592 4976
4593 down_read(&info->groups_sem); 4977 down_read(&info->groups_sem);
4594 list_for_each_entry(cache, &info->block_groups, list) { 4978again:
4979 list_for_each_entry(cache, &info->block_groups[index], list) {
4595 spin_lock(&cache->lock); 4980 spin_lock(&cache->lock);
4596 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 4981 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
4597 "%llu pinned %llu reserved\n", 4982 "%llu pinned %llu reserved\n",
@@ -4603,6 +4988,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4603 btrfs_dump_free_space(cache, bytes); 4988 btrfs_dump_free_space(cache, bytes);
4604 spin_unlock(&cache->lock); 4989 spin_unlock(&cache->lock);
4605 } 4990 }
4991 if (++index < BTRFS_NR_RAID_TYPES)
4992 goto again;
4606 up_read(&info->groups_sem); 4993 up_read(&info->groups_sem);
4607} 4994}
4608 4995
@@ -4628,9 +5015,8 @@ again:
4628 5015
4629 WARN_ON(num_bytes < root->sectorsize); 5016 WARN_ON(num_bytes < root->sectorsize);
4630 ret = find_free_extent(trans, root, num_bytes, empty_size, 5017 ret = find_free_extent(trans, root, num_bytes, empty_size,
4631 search_start, search_end, hint_byte, ins, 5018 search_start, search_end, hint_byte,
4632 trans->alloc_exclude_start, 5019 ins, data);
4633 trans->alloc_exclude_nr, data);
4634 5020
4635 if (ret == -ENOSPC && num_bytes > min_alloc_size) { 5021 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
4636 num_bytes = num_bytes >> 1; 5022 num_bytes = num_bytes >> 1;
@@ -4668,7 +5054,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
4668 ret = btrfs_discard_extent(root, start, len); 5054 ret = btrfs_discard_extent(root, start, len);
4669 5055
4670 btrfs_add_free_space(cache, start, len); 5056 btrfs_add_free_space(cache, start, len);
4671 update_reserved_extents(cache, len, 0); 5057 update_reserved_bytes(cache, len, 0, 1);
4672 btrfs_put_block_group(cache); 5058 btrfs_put_block_group(cache);
4673 5059
4674 return ret; 5060 return ret;
@@ -4731,8 +5117,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4731 btrfs_mark_buffer_dirty(path->nodes[0]); 5117 btrfs_mark_buffer_dirty(path->nodes[0]);
4732 btrfs_free_path(path); 5118 btrfs_free_path(path);
4733 5119
4734 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5120 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4735 1, 0);
4736 if (ret) { 5121 if (ret) {
4737 printk(KERN_ERR "btrfs update block group failed for %llu " 5122 printk(KERN_ERR "btrfs update block group failed for %llu "
4738 "%llu\n", (unsigned long long)ins->objectid, 5123 "%llu\n", (unsigned long long)ins->objectid,
@@ -4792,8 +5177,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4792 btrfs_mark_buffer_dirty(leaf); 5177 btrfs_mark_buffer_dirty(leaf);
4793 btrfs_free_path(path); 5178 btrfs_free_path(path);
4794 5179
4795 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5180 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4796 1, 0);
4797 if (ret) { 5181 if (ret) {
4798 printk(KERN_ERR "btrfs update block group failed for %llu " 5182 printk(KERN_ERR "btrfs update block group failed for %llu "
4799 "%llu\n", (unsigned long long)ins->objectid, 5183 "%llu\n", (unsigned long long)ins->objectid,
@@ -4869,73 +5253,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4869 put_caching_control(caching_ctl); 5253 put_caching_control(caching_ctl);
4870 } 5254 }
4871 5255
4872 update_reserved_extents(block_group, ins->offset, 1); 5256 ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
5257 BUG_ON(ret);
4873 btrfs_put_block_group(block_group); 5258 btrfs_put_block_group(block_group);
4874 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5259 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
4875 0, owner, offset, ins, 1); 5260 0, owner, offset, ins, 1);
4876 return ret; 5261 return ret;
4877} 5262}
4878 5263
4879/*
4880 * finds a free extent and does all the dirty work required for allocation
4881 * returns the key for the extent through ins, and a tree buffer for
4882 * the first block of the extent through buf.
4883 *
4884 * returns 0 if everything worked, non-zero otherwise.
4885 */
4886static int alloc_tree_block(struct btrfs_trans_handle *trans,
4887 struct btrfs_root *root,
4888 u64 num_bytes, u64 parent, u64 root_objectid,
4889 struct btrfs_disk_key *key, int level,
4890 u64 empty_size, u64 hint_byte, u64 search_end,
4891 struct btrfs_key *ins)
4892{
4893 int ret;
4894 u64 flags = 0;
4895
4896 ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4897 empty_size, hint_byte, search_end,
4898 ins, 0);
4899 if (ret)
4900 return ret;
4901
4902 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4903 if (parent == 0)
4904 parent = ins->objectid;
4905 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4906 } else
4907 BUG_ON(parent > 0);
4908
4909 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4910 struct btrfs_delayed_extent_op *extent_op;
4911 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
4912 BUG_ON(!extent_op);
4913 if (key)
4914 memcpy(&extent_op->key, key, sizeof(extent_op->key));
4915 else
4916 memset(&extent_op->key, 0, sizeof(extent_op->key));
4917 extent_op->flags_to_set = flags;
4918 extent_op->update_key = 1;
4919 extent_op->update_flags = 1;
4920 extent_op->is_data = 0;
4921
4922 ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
4923 ins->offset, parent, root_objectid,
4924 level, BTRFS_ADD_DELAYED_EXTENT,
4925 extent_op);
4926 BUG_ON(ret);
4927 }
4928
4929 if (root_objectid == root->root_key.objectid) {
4930 u64 used;
4931 spin_lock(&root->node_lock);
4932 used = btrfs_root_used(&root->root_item) + num_bytes;
4933 btrfs_set_root_used(&root->root_item, used);
4934 spin_unlock(&root->node_lock);
4935 }
4936 return ret;
4937}
4938
4939struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 5264struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4940 struct btrfs_root *root, 5265 struct btrfs_root *root,
4941 u64 bytenr, u32 blocksize, 5266 u64 bytenr, u32 blocksize,
@@ -4974,8 +5299,45 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4974 return buf; 5299 return buf;
4975} 5300}
4976 5301
5302static struct btrfs_block_rsv *
5303use_block_rsv(struct btrfs_trans_handle *trans,
5304 struct btrfs_root *root, u32 blocksize)
5305{
5306 struct btrfs_block_rsv *block_rsv;
5307 int ret;
5308
5309 block_rsv = get_block_rsv(trans, root);
5310
5311 if (block_rsv->size == 0) {
5312 ret = reserve_metadata_bytes(block_rsv, blocksize);
5313 if (ret)
5314 return ERR_PTR(ret);
5315 return block_rsv;
5316 }
5317
5318 ret = block_rsv_use_bytes(block_rsv, blocksize);
5319 if (!ret)
5320 return block_rsv;
5321
5322 WARN_ON(1);
5323 printk(KERN_INFO "block_rsv size %llu reserved %llu freed %llu %llu\n",
5324 (unsigned long long)block_rsv->size, (unsigned long long)block_rsv->reserved,
5325 (unsigned long long)block_rsv->freed[0], (unsigned long long)block_rsv->freed[1]);
5326
5327 return ERR_PTR(-ENOSPC);
5328}
5329
5330static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
5331{
5332 block_rsv_add_bytes(block_rsv, blocksize, 0);
5333 block_rsv_release_bytes(block_rsv, NULL, 0);
5334}
5335
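
use_block_rsv() above has two regimes: an unsized reservation reserves
metadata bytes on demand, while a sized one must already hold the
blocksize. The shape of it in user-space miniature, where reserve_fresh()
stands in for reserve_metadata_bytes() and assumes global space exists:

	#include <errno.h>
	#include <stdint.h>
	#include <stdio.h>

	struct rsv {
		uint64_t size;		/* 0 means "reserve on demand" */
		uint64_t reserved;
	};

	static int reserve_fresh(struct rsv *r, uint64_t n)
	{
		r->reserved += n;	/* assume space is available */
		return 0;
	}

	static int use_bytes(struct rsv *r, uint64_t n)
	{
		if (r->size == 0)		/* unsized rsv: reserve now */
			return reserve_fresh(r, n);
		if (r->reserved >= n) {		/* sized rsv: consume pre-reserved */
			r->reserved -= n;
			return 0;
		}
		return -ENOSPC;
	}

	int main(void)
	{
		struct rsv r = { .size = 8192, .reserved = 4096 };

		printf("first: %d, second: %d\n",
		       use_bytes(&r, 4096), use_bytes(&r, 4096));
		return 0;
	}
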
4977/* 5336/*
4978 * helper function to allocate a block for a given tree 5337 * finds a free extent and does all the dirty work required for allocation
5338 * returns the key for the extent through ins, and a tree buffer for
5339 * the first block of the extent through buf.
5340 *
4979 * returns the tree buffer or NULL. 5341 * returns the tree buffer or an ERR_PTR on failure.
4980 */ 5342 */
4981struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 5343struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -4985,18 +5347,53 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4985 u64 hint, u64 empty_size) 5347 u64 hint, u64 empty_size)
4986{ 5348{
4987 struct btrfs_key ins; 5349 struct btrfs_key ins;
4988 int ret; 5350 struct btrfs_block_rsv *block_rsv;
4989 struct extent_buffer *buf; 5351 struct extent_buffer *buf;
5352 u64 flags = 0;
5353 int ret;
5354
4990 5355
4991 ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, 5356 block_rsv = use_block_rsv(trans, root, blocksize);
4992 key, level, empty_size, hint, (u64)-1, &ins); 5357 if (IS_ERR(block_rsv))
5358 return ERR_CAST(block_rsv);
5359
5360 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
5361 empty_size, hint, (u64)-1, &ins, 0);
4993 if (ret) { 5362 if (ret) {
4994 BUG_ON(ret > 0); 5363 unuse_block_rsv(block_rsv, blocksize);
4995 return ERR_PTR(ret); 5364 return ERR_PTR(ret);
4996 } 5365 }
4997 5366
4998 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 5367 buf = btrfs_init_new_buffer(trans, root, ins.objectid,
4999 blocksize, level); 5368 blocksize, level);
5369 BUG_ON(IS_ERR(buf));
5370
5371 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5372 if (parent == 0)
5373 parent = ins.objectid;
5374 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5375 } else
5376 BUG_ON(parent > 0);
5377
5378 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5379 struct btrfs_delayed_extent_op *extent_op;
5380 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
5381 BUG_ON(!extent_op);
5382 if (key)
5383 memcpy(&extent_op->key, key, sizeof(extent_op->key));
5384 else
5385 memset(&extent_op->key, 0, sizeof(extent_op->key));
5386 extent_op->flags_to_set = flags;
5387 extent_op->update_key = 1;
5388 extent_op->update_flags = 1;
5389 extent_op->is_data = 0;
5390
5391 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
5392 ins.offset, parent, root_objectid,
5393 level, BTRFS_ADD_DELAYED_EXTENT,
5394 extent_op);
5395 BUG_ON(ret);
5396 }
5000 return buf; 5397 return buf;
5001} 5398}
5002 5399
@@ -5321,7 +5718,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5321 struct btrfs_path *path, 5718 struct btrfs_path *path,
5322 struct walk_control *wc) 5719 struct walk_control *wc)
5323{ 5720{
5324 int ret = 0; 5721 int ret;
5325 int level = wc->level; 5722 int level = wc->level;
5326 struct extent_buffer *eb = path->nodes[level]; 5723 struct extent_buffer *eb = path->nodes[level];
5327 u64 parent = 0; 5724 u64 parent = 0;
@@ -5399,13 +5796,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5399 btrfs_header_owner(path->nodes[level + 1])); 5796 btrfs_header_owner(path->nodes[level + 1]));
5400 } 5797 }
5401 5798
5402 ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, 5799 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
5403 root->root_key.objectid, level, 0);
5404 BUG_ON(ret);
5405out: 5800out:
5406 wc->refs[level] = 0; 5801 wc->refs[level] = 0;
5407 wc->flags[level] = 0; 5802 wc->flags[level] = 0;
5408 return ret; 5803 return 0;
5409} 5804}
5410 5805
5411static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 5806static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -5483,7 +5878,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5483 * also make sure backrefs for the shared block and all lower level 5878 * also make sure backrefs for the shared block and all lower level
5484 * blocks are properly updated. 5879 * blocks are properly updated.
5485 */ 5880 */
5486int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) 5881int btrfs_drop_snapshot(struct btrfs_root *root,
5882 struct btrfs_block_rsv *block_rsv, int update_ref)
5487{ 5883{
5488 struct btrfs_path *path; 5884 struct btrfs_path *path;
5489 struct btrfs_trans_handle *trans; 5885 struct btrfs_trans_handle *trans;
@@ -5501,7 +5897,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5501 wc = kzalloc(sizeof(*wc), GFP_NOFS); 5897 wc = kzalloc(sizeof(*wc), GFP_NOFS);
5502 BUG_ON(!wc); 5898 BUG_ON(!wc);
5503 5899
5504 trans = btrfs_start_transaction(tree_root, 1); 5900 trans = btrfs_start_transaction(tree_root, 0);
5901 if (block_rsv)
5902 trans->block_rsv = block_rsv;
5505 5903
5506 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 5904 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5507 level = btrfs_header_level(root->node); 5905 level = btrfs_header_level(root->node);
@@ -5589,22 +5987,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5589 } 5987 }
5590 5988
5591 BUG_ON(wc->level == 0); 5989 BUG_ON(wc->level == 0);
5592 if (trans->transaction->in_commit || 5990 if (btrfs_should_end_transaction(trans, tree_root)) {
5593 trans->transaction->delayed_refs.flushing) {
5594 ret = btrfs_update_root(trans, tree_root, 5991 ret = btrfs_update_root(trans, tree_root,
5595 &root->root_key, 5992 &root->root_key,
5596 root_item); 5993 root_item);
5597 BUG_ON(ret); 5994 BUG_ON(ret);
5598 5995
5599 btrfs_end_transaction(trans, tree_root); 5996 btrfs_end_transaction_throttle(trans, tree_root);
5600 trans = btrfs_start_transaction(tree_root, 1); 5997 trans = btrfs_start_transaction(tree_root, 0);
5601 } else { 5998 if (block_rsv)
5602 unsigned long update; 5999 trans->block_rsv = block_rsv;
5603 update = trans->delayed_ref_updates;
5604 trans->delayed_ref_updates = 0;
5605 if (update)
5606 btrfs_run_delayed_refs(trans, tree_root,
5607 update);
5608 } 6000 }
5609 } 6001 }
5610 btrfs_release_path(root, path); 6002 btrfs_release_path(root, path);
@@ -5632,7 +6024,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5632 kfree(root); 6024 kfree(root);
5633 } 6025 }
5634out: 6026out:
5635 btrfs_end_transaction(trans, tree_root); 6027 btrfs_end_transaction_throttle(trans, tree_root);
5636 kfree(wc); 6028 kfree(wc);
5637 btrfs_free_path(path); 6029 btrfs_free_path(path);
5638 return err; 6030 return err;
@@ -7228,48 +7620,80 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7228 return flags; 7620 return flags;
7229} 7621}
7230 7622
7231static int __alloc_chunk_for_shrink(struct btrfs_root *root, 7623static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7232 struct btrfs_block_group_cache *shrink_block_group,
7233 int force)
7234{ 7624{
7235 struct btrfs_trans_handle *trans; 7625 struct btrfs_space_info *sinfo = cache->space_info;
7236 u64 new_alloc_flags; 7626 u64 num_bytes;
7237 u64 calc; 7627 int ret = -ENOSPC;
7238 7628
7239 spin_lock(&shrink_block_group->lock); 7629 if (cache->ro)
7240 if (btrfs_block_group_used(&shrink_block_group->item) + 7630 return 0;
7241 shrink_block_group->reserved > 0) {
7242 spin_unlock(&shrink_block_group->lock);
7243 7631
7244 trans = btrfs_start_transaction(root, 1); 7632 spin_lock(&sinfo->lock);
7245 spin_lock(&shrink_block_group->lock); 7633 spin_lock(&cache->lock);
7634 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7635 cache->bytes_super - btrfs_block_group_used(&cache->item);
7636
7637 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7638 sinfo->bytes_may_use + sinfo->bytes_readonly +
7639 cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
7640 sinfo->bytes_readonly += num_bytes;
7641 sinfo->bytes_reserved += cache->reserved_pinned;
7642 cache->reserved_pinned = 0;
7643 cache->ro = 1;
7644 ret = 0;
7645 }
7646 spin_unlock(&cache->lock);
7647 spin_unlock(&sinfo->lock);
7648 return ret;
7649}
7246 7650
7247 new_alloc_flags = update_block_group_flags(root, 7651int btrfs_set_block_group_ro(struct btrfs_root *root,
7248 shrink_block_group->flags); 7652 struct btrfs_block_group_cache *cache)
7249 if (new_alloc_flags != shrink_block_group->flags) {
7250 calc =
7251 btrfs_block_group_used(&shrink_block_group->item);
7252 } else {
7253 calc = shrink_block_group->key.offset;
7254 }
7255 spin_unlock(&shrink_block_group->lock);
7256 7653
7257 do_chunk_alloc(trans, root->fs_info->extent_root, 7654{
7258 calc + 2 * 1024 * 1024, new_alloc_flags, force); 7655 struct btrfs_trans_handle *trans;
7656 u64 alloc_flags;
7657 int ret;
7259 7658
7260 btrfs_end_transaction(trans, root); 7659 BUG_ON(cache->ro);
7261 } else 7660
7262 spin_unlock(&shrink_block_group->lock); 7661 trans = btrfs_join_transaction(root, 1);
7263 return 0; 7662 BUG_ON(IS_ERR(trans));
7264}
7265 7663
7664 alloc_flags = update_block_group_flags(root, cache->flags);
7665 if (alloc_flags != cache->flags)
7666 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7266 7667
7267int btrfs_prepare_block_group_relocation(struct btrfs_root *root, 7668 ret = set_block_group_ro(cache);
7268 struct btrfs_block_group_cache *group) 7669 if (!ret)
7670 goto out;
7671 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7672 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7673 if (ret < 0)
7674 goto out;
7675 ret = set_block_group_ro(cache);
7676out:
7677 btrfs_end_transaction(trans, root);
7678 return ret;
7679}
7269 7680
7681int btrfs_set_block_group_rw(struct btrfs_root *root,
7682 struct btrfs_block_group_cache *cache)
7270{ 7683{
7271 __alloc_chunk_for_shrink(root, group, 1); 7684 struct btrfs_space_info *sinfo = cache->space_info;
7272 set_block_group_readonly(group); 7685 u64 num_bytes;
7686
7687 BUG_ON(!cache->ro);
7688
7689 spin_lock(&sinfo->lock);
7690 spin_lock(&cache->lock);
7691 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7692 cache->bytes_super - btrfs_block_group_used(&cache->item);
7693 sinfo->bytes_readonly -= num_bytes;
7694 cache->ro = 0;
7695 spin_unlock(&cache->lock);
7696 spin_unlock(&sinfo->lock);
7273 return 0; 7697 return 0;
7274} 7698}
7275 7699
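
set_block_group_ro() above only flips the group when the rest of the
space info can absorb its free bytes as read-only. The condition,
condensed into a standalone check with illustrative fields:

	#include <stdint.h>
	#include <stdio.h>

	struct sinfo {
		uint64_t total, used, reserved, pinned, may_use, readonly;
	};

	/* num_bytes: free space inside the group that would become read-only */
	static int can_set_ro(const struct sinfo *s, uint64_t num_bytes)
	{
		return s->used + s->reserved + s->pinned +
		       s->may_use + s->readonly + num_bytes < s->total;
	}

	int main(void)
	{
		struct sinfo s = { .total = 1 << 20, .used = 1 << 19 };

		printf("can_set_ro: %d\n", can_set_ro(&s, 1 << 18));
		return 0;
	}
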
@@ -7436,17 +7860,33 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7436 */ 7860 */
7437 synchronize_rcu(); 7861 synchronize_rcu();
7438 7862
7863 release_global_block_rsv(info);
7864
7439 while (!list_empty(&info->space_info)) { 7865 while (!list_empty(&info->space_info)) {
7440 space_info = list_entry(info->space_info.next, 7866 space_info = list_entry(info->space_info.next,
7441 struct btrfs_space_info, 7867 struct btrfs_space_info,
7442 list); 7868 list);
7443 7869 if (space_info->bytes_pinned > 0 ||
7870 space_info->bytes_reserved > 0) {
7871 WARN_ON(1);
7872 dump_space_info(space_info, 0, 0);
7873 }
7444 list_del(&space_info->list); 7874 list_del(&space_info->list);
7445 kfree(space_info); 7875 kfree(space_info);
7446 } 7876 }
7447 return 0; 7877 return 0;
7448} 7878}
7449 7879
7880static void __link_block_group(struct btrfs_space_info *space_info,
7881 struct btrfs_block_group_cache *cache)
7882{
7883 int index = get_block_group_index(cache);
7884
7885 down_write(&space_info->groups_sem);
7886 list_add_tail(&cache->list, &space_info->block_groups[index]);
7887 up_write(&space_info->groups_sem);
7888}
7889
7450int btrfs_read_block_groups(struct btrfs_root *root) 7890int btrfs_read_block_groups(struct btrfs_root *root)
7451{ 7891{
7452 struct btrfs_path *path; 7892 struct btrfs_path *path;
@@ -7468,10 +7908,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7468 7908
7469 while (1) { 7909 while (1) {
7470 ret = find_first_block_group(root, path, &key); 7910 ret = find_first_block_group(root, path, &key);
7471 if (ret > 0) { 7911 if (ret > 0)
7472 ret = 0; 7912 break;
7473 goto error;
7474 }
7475 if (ret != 0) 7913 if (ret != 0)
7476 goto error; 7914 goto error;
7477 7915
@@ -7480,7 +7918,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7480 cache = kzalloc(sizeof(*cache), GFP_NOFS); 7918 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7481 if (!cache) { 7919 if (!cache) {
7482 ret = -ENOMEM; 7920 ret = -ENOMEM;
7483 break; 7921 goto error;
7484 } 7922 }
7485 7923
7486 atomic_set(&cache->count, 1); 7924 atomic_set(&cache->count, 1);
@@ -7537,20 +7975,36 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7537 BUG_ON(ret); 7975 BUG_ON(ret);
7538 cache->space_info = space_info; 7976 cache->space_info = space_info;
7539 spin_lock(&cache->space_info->lock); 7977 spin_lock(&cache->space_info->lock);
7540 cache->space_info->bytes_super += cache->bytes_super; 7978 cache->space_info->bytes_readonly += cache->bytes_super;
7541 spin_unlock(&cache->space_info->lock); 7979 spin_unlock(&cache->space_info->lock);
7542 7980
7543 down_write(&space_info->groups_sem); 7981 __link_block_group(space_info, cache);
7544 list_add_tail(&cache->list, &space_info->block_groups);
7545 up_write(&space_info->groups_sem);
7546 7982
7547 ret = btrfs_add_block_group_cache(root->fs_info, cache); 7983 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7548 BUG_ON(ret); 7984 BUG_ON(ret);
7549 7985
7550 set_avail_alloc_bits(root->fs_info, cache->flags); 7986 set_avail_alloc_bits(root->fs_info, cache->flags);
7551 if (btrfs_chunk_readonly(root, cache->key.objectid)) 7987 if (btrfs_chunk_readonly(root, cache->key.objectid))
7552 set_block_group_readonly(cache); 7988 set_block_group_ro(cache);
7553 } 7989 }
7990
7991 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7992 if (!(get_alloc_profile(root, space_info->flags) &
7993 (BTRFS_BLOCK_GROUP_RAID10 |
7994 BTRFS_BLOCK_GROUP_RAID1 |
7995 BTRFS_BLOCK_GROUP_DUP)))
7996 continue;
7997 /*
7998 * avoid allocating from un-mirrored block groups if there are
7999 * mirrored block groups.
8000 */
8001 list_for_each_entry(cache, &space_info->block_groups[3], list)
8002 set_block_group_ro(cache);
8003 list_for_each_entry(cache, &space_info->block_groups[4], list)
8004 set_block_group_ro(cache);
8005 }
8006
8007 init_global_block_rsv(info);
7554 ret = 0; 8008 ret = 0;
7555error: 8009error:
7556 btrfs_free_path(path); 8010 btrfs_free_path(path);
@@ -7611,12 +8065,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7611 BUG_ON(ret); 8065 BUG_ON(ret);
7612 8066
7613 spin_lock(&cache->space_info->lock); 8067 spin_lock(&cache->space_info->lock);
7614 cache->space_info->bytes_super += cache->bytes_super; 8068 cache->space_info->bytes_readonly += cache->bytes_super;
7615 spin_unlock(&cache->space_info->lock); 8069 spin_unlock(&cache->space_info->lock);
7616 8070
7617 down_write(&cache->space_info->groups_sem); 8071 __link_block_group(cache->space_info, cache);
7618 list_add_tail(&cache->list, &cache->space_info->block_groups);
7619 up_write(&cache->space_info->groups_sem);
7620 8072
7621 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8073 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7622 BUG_ON(ret); 8074 BUG_ON(ret);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d2d03684fab2..a4080c21ec55 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -135,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
135 return state; 135 return state;
136} 136}
137 137
138static void free_extent_state(struct extent_state *state) 138void free_extent_state(struct extent_state *state)
139{ 139{
140 if (!state) 140 if (!state)
141 return; 141 return;
@@ -335,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree,
335} 335}
336 336
337static int set_state_cb(struct extent_io_tree *tree, 337static int set_state_cb(struct extent_io_tree *tree,
338 struct extent_state *state, 338 struct extent_state *state, int *bits)
339 unsigned long bits)
340{ 339{
341 if (tree->ops && tree->ops->set_bit_hook) { 340 if (tree->ops && tree->ops->set_bit_hook) {
342 return tree->ops->set_bit_hook(tree->mapping->host, 341 return tree->ops->set_bit_hook(tree->mapping->host,
343 state->start, state->end, 342 state, bits);
344 state->state, bits);
345 } 343 }
346 344
347 return 0; 345 return 0;
348} 346}
349 347
350static void clear_state_cb(struct extent_io_tree *tree, 348static void clear_state_cb(struct extent_io_tree *tree,
351 struct extent_state *state, 349 struct extent_state *state, int *bits)
352 unsigned long bits)
353{ 350{
354 if (tree->ops && tree->ops->clear_bit_hook) 351 if (tree->ops && tree->ops->clear_bit_hook)
355 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 352 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
@@ -367,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree,
367 */ 364 */
368static int insert_state(struct extent_io_tree *tree, 365static int insert_state(struct extent_io_tree *tree,
369 struct extent_state *state, u64 start, u64 end, 366 struct extent_state *state, u64 start, u64 end,
370 int bits) 367 int *bits)
371{ 368{
372 struct rb_node *node; 369 struct rb_node *node;
370 int bits_to_set = *bits & ~EXTENT_CTLBITS;
373 int ret; 371 int ret;
374 372
375 if (end < start) { 373 if (end < start) {
@@ -384,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree,
384 if (ret) 382 if (ret)
385 return ret; 383 return ret;
386 384
387 if (bits & EXTENT_DIRTY) 385 if (bits_to_set & EXTENT_DIRTY)
388 tree->dirty_bytes += end - start + 1; 386 tree->dirty_bytes += end - start + 1;
389 state->state |= bits; 387 state->state |= bits_to_set;
390 node = tree_insert(&tree->state, end, &state->rb_node); 388 node = tree_insert(&tree->state, end, &state->rb_node);
391 if (node) { 389 if (node) {
392 struct extent_state *found; 390 struct extent_state *found;
@@ -456,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
456 * struct is freed and removed from the tree 454 * struct is freed and removed from the tree
457 */ 455 */
458static int clear_state_bit(struct extent_io_tree *tree, 456static int clear_state_bit(struct extent_io_tree *tree,
459 struct extent_state *state, int bits, int wake, 457 struct extent_state *state,
460 int delete) 458 int *bits, int wake)
461{ 459{
462 int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; 460 int bits_to_clear = *bits & ~EXTENT_CTLBITS;
463 int ret = state->state & bits_to_clear; 461 int ret = state->state & bits_to_clear;
464 462
465 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 463 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
466 u64 range = state->end - state->start + 1; 464 u64 range = state->end - state->start + 1;
467 WARN_ON(range > tree->dirty_bytes); 465 WARN_ON(range > tree->dirty_bytes);
468 tree->dirty_bytes -= range; 466 tree->dirty_bytes -= range;
@@ -471,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
471 state->state &= ~bits_to_clear; 469 state->state &= ~bits_to_clear;
472 if (wake) 470 if (wake)
473 wake_up(&state->wq); 471 wake_up(&state->wq);
474 if (delete || state->state == 0) { 472 if (state->state == 0) {
475 if (state->tree) { 473 if (state->tree) {
476 clear_state_cb(tree, state, state->state);
477 rb_erase(&state->rb_node, &tree->state); 474 rb_erase(&state->rb_node, &tree->state);
478 state->tree = NULL; 475 state->tree = NULL;
479 free_extent_state(state); 476 free_extent_state(state);
@@ -514,6 +511,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
514 int set = 0; 511 int set = 0;
515 int clear = 0; 512 int clear = 0;
516 513
514 if (delete)
515 bits |= ~EXTENT_CTLBITS;
516 bits |= EXTENT_FIRST_DELALLOC;
517
517 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 518 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
518 clear = 1; 519 clear = 1;
519again: 520again:
@@ -580,8 +581,7 @@ hit_next:
580 if (err) 581 if (err)
581 goto out; 582 goto out;
582 if (state->end <= end) { 583 if (state->end <= end) {
583 set |= clear_state_bit(tree, state, bits, wake, 584 set |= clear_state_bit(tree, state, &bits, wake);
584 delete);
585 if (last_end == (u64)-1) 585 if (last_end == (u64)-1)
586 goto out; 586 goto out;
587 start = last_end + 1; 587 start = last_end + 1;
@@ -602,7 +602,7 @@ hit_next:
602 if (wake) 602 if (wake)
603 wake_up(&state->wq); 603 wake_up(&state->wq);
604 604
605 set |= clear_state_bit(tree, prealloc, bits, wake, delete); 605 set |= clear_state_bit(tree, prealloc, &bits, wake);
606 606
607 prealloc = NULL; 607 prealloc = NULL;
608 goto out; 608 goto out;
@@ -613,7 +613,7 @@ hit_next:
613 else 613 else
614 next_node = NULL; 614 next_node = NULL;
615 615
616 set |= clear_state_bit(tree, state, bits, wake, delete); 616 set |= clear_state_bit(tree, state, &bits, wake);
617 if (last_end == (u64)-1) 617 if (last_end == (u64)-1)
618 goto out; 618 goto out;
619 start = last_end + 1; 619 start = last_end + 1;
@@ -706,19 +706,19 @@ out:
706 706
707static int set_state_bits(struct extent_io_tree *tree, 707static int set_state_bits(struct extent_io_tree *tree,
708 struct extent_state *state, 708 struct extent_state *state,
709 int bits) 709 int *bits)
710{ 710{
711 int ret; 711 int ret;
712 int bits_to_set = *bits & ~EXTENT_CTLBITS;
712 713
713 ret = set_state_cb(tree, state, bits); 714 ret = set_state_cb(tree, state, bits);
714 if (ret) 715 if (ret)
715 return ret; 716 return ret;
716 717 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
717 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
718 u64 range = state->end - state->start + 1; 718 u64 range = state->end - state->start + 1;
719 tree->dirty_bytes += range; 719 tree->dirty_bytes += range;
720 } 720 }
721 state->state |= bits; 721 state->state |= bits_to_set;
722 722
723 return 0; 723 return 0;
724} 724}
@@ -745,10 +745,9 @@ static void cache_state(struct extent_state *state,
745 * [start, end] is inclusive This takes the tree lock. 745 * [start, end] is inclusive This takes the tree lock.
746 */ 746 */
747 747
748static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 748int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
749 int bits, int exclusive_bits, u64 *failed_start, 749 int bits, int exclusive_bits, u64 *failed_start,
750 struct extent_state **cached_state, 750 struct extent_state **cached_state, gfp_t mask)
751 gfp_t mask)
752{ 751{
753 struct extent_state *state; 752 struct extent_state *state;
754 struct extent_state *prealloc = NULL; 753 struct extent_state *prealloc = NULL;
@@ -757,6 +756,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
757 u64 last_start; 756 u64 last_start;
758 u64 last_end; 757 u64 last_end;
759 758
759 bits |= EXTENT_FIRST_DELALLOC;
760again: 760again:
761 if (!prealloc && (mask & __GFP_WAIT)) { 761 if (!prealloc && (mask & __GFP_WAIT)) {
762 prealloc = alloc_extent_state(mask); 762 prealloc = alloc_extent_state(mask);
@@ -778,7 +778,7 @@ again:
778 */ 778 */
779 node = tree_search(tree, start); 779 node = tree_search(tree, start);
780 if (!node) { 780 if (!node) {
781 err = insert_state(tree, prealloc, start, end, bits); 781 err = insert_state(tree, prealloc, start, end, &bits);
782 prealloc = NULL; 782 prealloc = NULL;
783 BUG_ON(err == -EEXIST); 783 BUG_ON(err == -EEXIST);
784 goto out; 784 goto out;
@@ -802,7 +802,7 @@ hit_next:
802 goto out; 802 goto out;
803 } 803 }
804 804
805 err = set_state_bits(tree, state, bits); 805 err = set_state_bits(tree, state, &bits);
806 if (err) 806 if (err)
807 goto out; 807 goto out;
808 808
@@ -852,7 +852,7 @@ hit_next:
852 if (err) 852 if (err)
853 goto out; 853 goto out;
854 if (state->end <= end) { 854 if (state->end <= end) {
855 err = set_state_bits(tree, state, bits); 855 err = set_state_bits(tree, state, &bits);
856 if (err) 856 if (err)
857 goto out; 857 goto out;
858 cache_state(state, cached_state); 858 cache_state(state, cached_state);
@@ -877,7 +877,7 @@ hit_next:
877 else 877 else
878 this_end = last_start - 1; 878 this_end = last_start - 1;
879 err = insert_state(tree, prealloc, start, this_end, 879 err = insert_state(tree, prealloc, start, this_end,
880 bits); 880 &bits);
881 BUG_ON(err == -EEXIST); 881 BUG_ON(err == -EEXIST);
882 if (err) { 882 if (err) {
883 prealloc = NULL; 883 prealloc = NULL;
@@ -903,7 +903,7 @@ hit_next:
903 err = split_state(tree, state, prealloc, end + 1); 903 err = split_state(tree, state, prealloc, end + 1);
904 BUG_ON(err == -EEXIST); 904 BUG_ON(err == -EEXIST);
905 905
906 err = set_state_bits(tree, prealloc, bits); 906 err = set_state_bits(tree, prealloc, &bits);
907 if (err) { 907 if (err) {
908 prealloc = NULL; 908 prealloc = NULL;
909 goto out; 909 goto out;
@@ -966,8 +966,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
966{ 966{
967 return clear_extent_bit(tree, start, end, 967 return clear_extent_bit(tree, start, end,
968 EXTENT_DIRTY | EXTENT_DELALLOC | 968 EXTENT_DIRTY | EXTENT_DELALLOC |
969 EXTENT_DO_ACCOUNTING, 0, 0, 969 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
970 NULL, mask);
971} 970}
972 971
973int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 972int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1435,9 +1434,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1435 if (op & EXTENT_CLEAR_DELALLOC) 1434 if (op & EXTENT_CLEAR_DELALLOC)
1436 clear_bits |= EXTENT_DELALLOC; 1435 clear_bits |= EXTENT_DELALLOC;
1437 1436
1438 if (op & EXTENT_CLEAR_ACCOUNTING)
1439 clear_bits |= EXTENT_DO_ACCOUNTING;
1440
1441 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1437 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1442 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1438 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1443 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1439 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
@@ -1916,7 +1912,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1916 1912
1917 if (tree->ops && tree->ops->submit_bio_hook) 1913 if (tree->ops && tree->ops->submit_bio_hook)
1918 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1914 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1919 mirror_num, bio_flags); 1915 mirror_num, bio_flags, start);
1920 else 1916 else
1921 submit_bio(rw, bio); 1917 submit_bio(rw, bio);
1922 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1918 if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -2020,6 +2016,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2020 sector_t sector; 2016 sector_t sector;
2021 struct extent_map *em; 2017 struct extent_map *em;
2022 struct block_device *bdev; 2018 struct block_device *bdev;
2019 struct btrfs_ordered_extent *ordered;
2023 int ret; 2020 int ret;
2024 int nr = 0; 2021 int nr = 0;
2025 size_t page_offset = 0; 2022 size_t page_offset = 0;
@@ -2031,7 +2028,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2031 set_page_extent_mapped(page); 2028 set_page_extent_mapped(page);
2032 2029
2033 end = page_end; 2030 end = page_end;
2034 lock_extent(tree, start, end, GFP_NOFS); 2031 while (1) {
2032 lock_extent(tree, start, end, GFP_NOFS);
2033 ordered = btrfs_lookup_ordered_extent(inode, start);
2034 if (!ordered)
2035 break;
2036 unlock_extent(tree, start, end, GFP_NOFS);
2037 btrfs_start_ordered_extent(inode, ordered, 1);
2038 btrfs_put_ordered_extent(ordered);
2039 }
2035 2040
2036 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 2041 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2037 char *userpage; 2042 char *userpage;
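
The new lock/lookup/unlock/wait loop in __extent_read_full_page() is a
drain pattern: take the range lock, drop it if an ordered extent still
covers the range, wait that extent out, and retry until the range is
clean. The skeleton with stub helpers standing in for the btrfs calls:

	#include <stdio.h>

	static int pending = 2;		/* fake ordered extents remaining */

	static int lookup_ordered(void)	{ return pending; }
	static void wait_ordered(void)	{ pending--; }
	static void lock_range(void)	{ }
	static void unlock_range(void)	{ }

	static void lock_after_drain(void)
	{
		while (1) {
			lock_range();
			if (!lookup_ordered())
				break;		/* range is clean, keep the lock */
			unlock_range();
			wait_ordered();		/* may block; retry from scratch */
		}
	}

	int main(void)
	{
		lock_after_drain();
		printf("range locked with no ordered extents pending\n");
		return 0;
	}
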
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bbab4813646f..5691c7b590da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -16,7 +16,9 @@
 #define EXTENT_BOUNDARY (1 << 9)
 #define EXTENT_NODATASUM (1 << 10)
 #define EXTENT_DO_ACCOUNTING (1 << 11)
+#define EXTENT_FIRST_DELALLOC (1 << 12)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)

 /* flags for bio submission */
 #define EXTENT_BIO_COMPRESSED 1
@@ -47,7 +49,7 @@ struct extent_state;

 typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
 				       struct bio *bio, int mirror_num,
-				       unsigned long bio_flags);
+				       unsigned long bio_flags, u64 bio_offset);
 struct extent_io_ops {
 	int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
 			     u64 start, u64 end, int *page_started,
@@ -69,10 +71,10 @@ struct extent_io_ops {
 			      struct extent_state *state);
 	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
 				      struct extent_state *state, int uptodate);
-	int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
-			    unsigned long old, unsigned long bits);
+	int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
+			    int *bits);
 	int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
-			      unsigned long bits);
+			      int *bits);
 	int (*merge_extent_hook)(struct inode *inode,
 				 struct extent_state *new,
 				 struct extent_state *other);
@@ -176,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
 		     u64 *start, u64 search_end,
 		     u64 max_bytes, unsigned long bits);

+void free_extent_state(struct extent_state *state);
 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		   int bits, int filled, struct extent_state *cached_state);
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -185,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask);
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		    int bits, gfp_t mask);
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		   int bits, int exclusive_bits, u64 *failed_start,
+		   struct extent_state **cached_state, gfp_t mask);
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 			gfp_t mask);
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
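A note on the two new flags above: EXTENT_FIRST_DELALLOC is a control bit carried through the new int *bits hook parameters rather than state stored in the tree, and EXTENT_CTLBITS names the bits that must be masked off before they land in state->state. A toy model of the consume-once behaviour, with made-up flag values:

#include <assert.h>

#define DELALLOC        (1 << 0)
#define FIRST_DELALLOC  (1 << 1)   /* control bit: set only on the first range */
#define CTLBITS         (FIRST_DELALLOC)

static int outstanding;

/* Model of a set_bit hook taking "int *bits": the control bit is
 * stripped by the first consumer so later merges and splits do not
 * double-count the extent. */
static void set_bit_hook(int *bits)
{
	if (!(*bits & DELALLOC))
		return;
	if (*bits & FIRST_DELALLOC)
		*bits &= ~FIRST_DELALLOC;   /* consumed exactly once */
	else
		outstanding++;
}

int main(void)
{
	int bits = DELALLOC | FIRST_DELALLOC;

	set_bit_hook(&bits);    /* first range: accounted at reservation time */
	set_bit_hook(&bits);    /* later ranges bump the extent count */
	assert(outstanding == 1 && !(bits & CTLBITS));
	return 0;
}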
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54a255065aa3..a562a250ae77 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 }


-int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
-			  struct bio *bio, u32 *dst)
+static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
+				   struct inode *inode, struct bio *bio,
+				   u64 logical_offset, u32 *dst, int dio)
 {
 	u32 sum;
 	struct bio_vec *bvec = bio->bi_io_vec;
 	int bio_index = 0;
-	u64 offset;
+	u64 offset = 0;
 	u64 item_start_offset = 0;
 	u64 item_last_offset = 0;
 	u64 disk_bytenr;
@@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 	WARN_ON(bio->bi_vcnt <= 0);

 	disk_bytenr = (u64)bio->bi_sector << 9;
+	if (dio)
+		offset = logical_offset;
 	while (bio_index < bio->bi_vcnt) {
-		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+		if (!dio)
+			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
 		ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
 		if (ret == 0)
 			goto found;
@@ -238,6 +242,7 @@ found:
 		else
 			set_state_private(io_tree, offset, sum);
 		disk_bytenr += bvec->bv_len;
+		offset += bvec->bv_len;
 		bio_index++;
 		bvec++;
 	}
@@ -245,6 +250,18 @@ found:
 	return 0;
 }

+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+			  struct bio *bio, u32 *dst)
+{
+	return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
+}
+
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+			      struct bio *bio, u64 offset, u32 *dst)
+{
+	return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
+}
+
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 			     struct list_head *list)
 {
@@ -657,6 +674,9 @@ again:
 		goto found;
 	}
 	ret = PTR_ERR(item);
+	if (ret != -EFBIG && ret != -ENOENT)
+		goto fail_unlock;
+
 	if (ret == -EFBIG) {
 		u32 item_size;
 		/* we found one, but it isn't big enough yet */
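The worker/wrapper split above exists because buffered and direct-I/O bios learn their file offsets differently: a buffered bio's pages belong to the page cache, so page_offset() is authoritative, while a direct-I/O bio carries a caller-supplied logical offset that must be advanced by bv_len per segment. A userspace sketch of just that offset bookkeeping (struct vec and walk_bio are illustrative, not kernel types):

#include <stdio.h>

struct vec { long page_off; int len; };

/* Model of __btrfs_lookup_bio_sums's offset handling: buffered bios
 * derive the file offset from each page, direct-I/O bios start from a
 * caller-supplied logical offset and advance it manually. */
static void walk_bio(const struct vec *v, int n, long logical, int dio)
{
	long offset = dio ? logical : 0;
	int i;

	for (i = 0; i < n; i++) {
		if (!dio)
			offset = v[i].page_off;    /* the page cache knows it */
		printf("csum lookup at offset %ld\n", offset);
		offset += v[i].len;                /* dio: advance by hand */
	}
}

int main(void)
{
	struct vec bio[2] = { { 4096, 4096 }, { 8192, 4096 } };

	walk_bio(bio, 2, 0, 0);        /* buffered: offsets from pages */
	walk_bio(bio, 2, 1 << 20, 1);  /* direct I/O: offsets from caller */
	return 0;
}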
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 29ff749ff4ca..e354c33df082 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -46,32 +46,42 @@
 static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 					 int write_bytes,
 					 struct page **prepared_pages,
-					 const char __user *buf)
+					 struct iov_iter *i)
 {
-	long page_fault = 0;
-	int i;
+	size_t copied;
+	int pg = 0;
 	int offset = pos & (PAGE_CACHE_SIZE - 1);

-	for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
+	while (write_bytes > 0) {
 		size_t count = min_t(size_t,
 				     PAGE_CACHE_SIZE - offset, write_bytes);
-		struct page *page = prepared_pages[i];
-		fault_in_pages_readable(buf, count);
+		struct page *page = prepared_pages[pg];
+again:
+		if (unlikely(iov_iter_fault_in_readable(i, count)))
+			return -EFAULT;

 		/* Copy data from userspace to the current page */
-		kmap(page);
-		page_fault = __copy_from_user(page_address(page) + offset,
-					      buf, count);
+		copied = iov_iter_copy_from_user(page, i, offset, count);
+
 		/* Flush processor's dcache for this page */
 		flush_dcache_page(page);
-		kunmap(page);
-		buf += count;
-		write_bytes -= count;
+		iov_iter_advance(i, copied);
+		write_bytes -= copied;

-		if (page_fault)
-			break;
+		if (unlikely(copied == 0)) {
+			count = min_t(size_t, PAGE_CACHE_SIZE - offset,
+				      iov_iter_single_seg_count(i));
+			goto again;
+		}
+
+		if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
+			offset += copied;
+		} else {
+			pg++;
+			offset = 0;
+		}
 	}
-	return page_fault ? -EFAULT : 0;
+	return 0;
 }

 /*
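The rewritten copy loop has to tolerate short copies: iov_iter_copy_from_user() may transfer fewer bytes than requested when it crosses an iovec boundary or faults. The sketch below models the advance/stay-on-page bookkeeping with memcpy standing in for the iov_iter helpers; the copied == 0 fault-retry branch (which refetches with a single-segment count) is elided:

#include <assert.h>
#include <string.h>

#define PG 16   /* toy page size */

/* Model of the rewritten copy loop: a short copy either keeps filling
 * the current page from the new offset, or, once the page is full,
 * moves on to the next one. */
static int copy_to_pages(char pages[][PG], const char *src, int len)
{
	int pg = 0, offset = 0;

	while (len > 0) {
		int count = PG - offset < len ? PG - offset : len;
		/* pretend the "iterator" only hands us 5 bytes at a time */
		int copied = count < 5 ? count : 5;

		memcpy(pages[pg] + offset, src, copied);
		src += copied;
		len -= copied;

		if (copied < PG - offset) {
			offset += copied;   /* stay on this page */
		} else {
			pg++;               /* page filled, move on */
			offset = 0;
		}
	}
	return 0;
}

int main(void)
{
	char pages[2][PG];
	const char msg[] = "abcdefghijklmnopqrstuvwxyz";

	copy_to_pages(pages, msg, 26);
	assert(memcmp(pages[0], msg, PG) == 0);
	assert(memcmp(pages[1], msg + PG, 26 - PG) == 0);
	return 0;
}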
@@ -126,8 +136,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	end_of_last_block = start_pos + num_bytes - 1;
 	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
 					NULL);
-	if (err)
-		return err;
+	BUG_ON(err);

 	for (i = 0; i < num_pages; i++) {
 		struct page *p = pages[i];
@@ -142,7 +151,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		 * at this time.
 		 */
 	}
-	return err;
+	return 0;
 }

 /*
@@ -823,45 +832,46 @@ again:
 	return 0;
 }

-static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
-				size_t count, loff_t *ppos)
+static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
+				    const struct iovec *iov,
+				    unsigned long nr_segs, loff_t pos)
 {
-	loff_t pos;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct page *pinned[2];
+	struct page **pages = NULL;
+	struct iov_iter i;
+	loff_t *ppos = &iocb->ki_pos;
 	loff_t start_pos;
 	ssize_t num_written = 0;
 	ssize_t err = 0;
+	size_t count;
+	size_t ocount;
 	int ret = 0;
-	struct inode *inode = fdentry(file)->d_inode;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct page **pages = NULL;
 	int nrptrs;
-	struct page *pinned[2];
 	unsigned long first_index;
 	unsigned long last_index;
 	int will_write;
+	int buffered = 0;

 	will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
 		      (file->f_flags & O_DIRECT));

-	nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
-		     PAGE_CACHE_SIZE / (sizeof(struct page *)));
 	pinned[0] = NULL;
 	pinned[1] = NULL;

-	pos = *ppos;
 	start_pos = pos;

 	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

-	/* do the reserve before the mutex lock in case we have to do some
-	 * flushing.  We wouldn't deadlock, but this is more polite.
-	 */
-	err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
-	if (err)
-		goto out_nolock;
-
 	mutex_lock(&inode->i_mutex);

+	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
+	if (err)
+		goto out;
+	count = ocount;
+
 	current->backing_dev_info = inode->i_mapping->backing_dev_info;
 	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
 	if (err)
@@ -875,15 +885,53 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		goto out;

 	file_update_time(file);
+	BTRFS_I(inode)->sequence++;
+
+	if (unlikely(file->f_flags & O_DIRECT)) {
+		num_written = generic_file_direct_write(iocb, iov, &nr_segs,
+							pos, ppos, count,
+							ocount);
+		/*
+		 * the generic O_DIRECT will update in-memory i_size after the
+		 * DIOs are done.  But our endio handlers that update the on
+		 * disk i_size never update past the in memory i_size.  So we
+		 * need one more update here to catch any additions to the
+		 * file
+		 */
+		if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
+			btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+			mark_inode_dirty(inode);
+		}

+		if (num_written < 0) {
+			ret = num_written;
+			num_written = 0;
+			goto out;
+		} else if (num_written == count) {
+			/* pick up pos changes done by the generic code */
+			pos = *ppos;
+			goto out;
+		}
+		/*
+		 * We are going to do buffered for the rest of the range, so we
+		 * need to make sure to invalidate the buffered pages when we're
+		 * done.
+		 */
+		buffered = 1;
+		pos += num_written;
+	}
+
+	iov_iter_init(&i, iov, nr_segs, count, num_written);
+	nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
+		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
+		     (sizeof(struct page *)));
 	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);

 	/* generic_write_checks can change our pos */
 	start_pos = pos;

-	BTRFS_I(inode)->sequence++;
 	first_index = pos >> PAGE_CACHE_SHIFT;
-	last_index = (pos + count) >> PAGE_CACHE_SHIFT;
+	last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;

 	/*
 	 * there are lots of better ways to do this, but this code
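The O_DIRECT branch above treats a short direct write as a signal to finish the request through the page cache, recording that fact so the buffered pages can be invalidated afterwards. Reduced to its control flow (write_direct and write_buffered are stand-ins, not kernel APIs):

#include <assert.h>

/* Stand-ins: a direct write that completes only half of the request. */
static long write_direct(long pos, long len)   { return len / 2; }
static long write_buffered(long pos, long len) { return len; }

/* Mirrors btrfs_file_aio_write: try O_DIRECT first; a short result is
 * not an error, the remainder goes through the page cache and the
 * caller remembers to invalidate those buffered pages later. */
static long do_write(long pos, long count, int *buffered)
{
	long written = write_direct(pos, count);

	if (written < 0)
		return written;         /* hard error */
	if (written == count)
		return written;         /* fully done, all direct */

	*buffered = 1;                  /* tail went through the cache */
	pos += written;
	written += write_buffered(pos, count - written);
	return written;
}

int main(void)
{
	int buffered = 0;
	long n = do_write(0, 4096, &buffered);

	assert(n == 4096 && buffered == 1);
	return 0;
}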
@@ -900,7 +948,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 			unlock_page(pinned[0]);
 		}
 	}
-	if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
+	if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
 		pinned[1] = grab_cache_page(inode->i_mapping, last_index);
 		if (!PageUptodate(pinned[1])) {
 			ret = btrfs_readpage(NULL, pinned[1]);
@@ -911,10 +959,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		}
 	}

-	while (count > 0) {
+	while (iov_iter_count(&i) > 0) {
 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
-		size_t write_bytes = min(count, nrptrs *
-					(size_t)PAGE_CACHE_SIZE -
-					offset);
+		size_t write_bytes = min(iov_iter_count(&i),
+					 nrptrs * (size_t)PAGE_CACHE_SIZE -
+					 offset);
 		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
 					PAGE_CACHE_SHIFT;
@@ -922,7 +970,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		WARN_ON(num_pages > nrptrs);
 		memset(pages, 0, sizeof(struct page *) * nrptrs);

-		ret = btrfs_check_data_free_space(root, inode, write_bytes);
+		ret = btrfs_delalloc_reserve_space(inode, write_bytes);
 		if (ret)
 			goto out;

@@ -930,26 +978,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 					pos, first_index, last_index,
 					write_bytes);
 		if (ret) {
-			btrfs_free_reserved_data_space(root, inode,
-						       write_bytes);
+			btrfs_delalloc_release_space(inode, write_bytes);
 			goto out;
 		}

 		ret = btrfs_copy_from_user(pos, num_pages,
-					   write_bytes, pages, buf);
-		if (ret) {
-			btrfs_free_reserved_data_space(root, inode,
-						       write_bytes);
-			btrfs_drop_pages(pages, num_pages);
-			goto out;
+					   write_bytes, pages, &i);
+		if (ret == 0) {
+			dirty_and_release_pages(NULL, root, file, pages,
+						num_pages, pos, write_bytes);
 		}

-		ret = dirty_and_release_pages(NULL, root, file, pages,
-					      num_pages, pos, write_bytes);
 		btrfs_drop_pages(pages, num_pages);
 		if (ret) {
-			btrfs_free_reserved_data_space(root, inode,
-						       write_bytes);
+			btrfs_delalloc_release_space(inode, write_bytes);
 			goto out;
 		}

@@ -965,8 +1007,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 			btrfs_throttle(root);
 		}

-		buf += write_bytes;
-		count -= write_bytes;
 		pos += write_bytes;
 		num_written += write_bytes;

@@ -976,9 +1016,7 @@ out:
 	mutex_unlock(&inode->i_mutex);
 	if (ret)
 		err = ret;
-	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);

-out_nolock:
 	kfree(pages);
 	if (pinned[0])
 		page_cache_release(pinned[0]);
@@ -1008,7 +1046,7 @@ out_nolock:
 		num_written = err;

 	if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
-		trans = btrfs_start_transaction(root, 1);
+		trans = btrfs_start_transaction(root, 0);
 		ret = btrfs_log_dentry_safe(trans, root,
 					    file->f_dentry);
 		if (ret == 0) {
@@ -1023,7 +1061,7 @@ out_nolock:
 			btrfs_end_transaction(trans, root);
 		}
 	}
-	if (file->f_flags & O_DIRECT) {
+	if (file->f_flags & O_DIRECT && buffered) {
 		invalidate_mapping_pages(inode->i_mapping,
 			start_pos >> PAGE_CACHE_SHIFT,
 			(start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
@@ -1063,8 +1101,9 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
  * important optimization for directories because holding the mutex prevents
  * new operations on the dir while we write to disk.
  */
-int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+int btrfs_sync_file(struct file *file, int datasync)
 {
+	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
@@ -1101,12 +1140,12 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
 	 */
-	if (file && file->private_data)
+	if (file->private_data)
 		btrfs_ioctl_trans_end(file);

-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
-		ret = -ENOMEM;
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
 		goto out;
 	}

@@ -1151,17 +1190,25 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {

 static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
 {
-	vma->vm_ops = &btrfs_file_vm_ops;
+	struct address_space *mapping = filp->f_mapping;
+
+	if (!mapping->a_ops->readpage)
+		return -ENOEXEC;
+
 	file_accessed(filp);
+	vma->vm_ops = &btrfs_file_vm_ops;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
+
 	return 0;
 }

 const struct file_operations btrfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
+	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
 	.splice_read	= generic_file_splice_read,
-	.write		= btrfs_file_write,
+	.aio_write	= btrfs_file_aio_write,
 	.mmap		= btrfs_file_mmap,
 	.open		= generic_file_open,
 	.release	= btrfs_release_file,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 72ce3c173d6a..64f1150bb48d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -49,6 +49,33 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
 	return 0;
 }

+struct btrfs_inode_ref *
+btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       struct btrfs_path *path,
+		       const char *name, int name_len,
+		       u64 inode_objectid, u64 ref_objectid, int mod)
+{
+	struct btrfs_key key;
+	struct btrfs_inode_ref *ref;
+	int ins_len = mod < 0 ? -1 : 0;
+	int cow = mod != 0;
+	int ret;
+
+	key.objectid = inode_objectid;
+	key.type = BTRFS_INODE_REF_KEY;
+	key.offset = ref_objectid;
+
+	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret > 0)
+		return NULL;
+	if (!find_name_in_backref(path, name, name_len, &ref))
+		return NULL;
+	return ref;
+}
+
 int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			const char *name, int name_len,
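The new btrfs_lookup_inode_ref follows the convention of ERR_PTR() for hard failures and NULL for a clean miss, which the unlink fast path added later in this diff depends on distinguishing. A self-contained caller skeleton under those conventions (the lookup and error values are illustrative):

#include <errno.h>
#include <stdio.h>

/* Minimal ERR_PTR machinery in the style of include/linux/err.h. */
static inline void *ERR_PTR(long e) { return (void *)e; }
static inline long PTR_ERR(void *p) { return (long)p; }
static inline int IS_ERR(void *p)
{
	return (unsigned long)p >= (unsigned long)-4095;
}

struct ref { int index; };

/* Stand-in lookup: errors become ERR_PTR, "not found" becomes NULL. */
static struct ref *lookup_ref(int key)
{
	static struct ref r = { 42 };

	if (key < 0)
		return ERR_PTR(-EIO);
	if (key == 0)
		return NULL;
	return &r;
}

int main(void)
{
	struct ref *ref = lookup_ref(1);

	if (IS_ERR(ref))
		return (int)-PTR_ERR(ref);  /* hard failure */
	if (!ref)
		return 0;                   /* clean miss */
	printf("index %d\n", ref->index);
	return 0;
}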
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d601629b85d1..1bff92ad4744 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -252,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 				   inline_len, compressed_size,
 				   compressed_pages);
 	BUG_ON(ret);
+	btrfs_delalloc_release_metadata(inode, end + 1 - start);
 	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 	return 0;
 }
@@ -414,6 +415,7 @@ again:
 		trans = btrfs_join_transaction(root, 1);
 		BUG_ON(!trans);
 		btrfs_set_trans_block_group(trans, inode);
+		trans->block_rsv = &root->fs_info->delalloc_block_rsv;

 		/* lets try to make an inline extent */
 		if (ret || total_in < (actual_end - start)) {
@@ -439,7 +441,6 @@ again:
 				     start, end, NULL,
 				     EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
 				     EXTENT_CLEAR_DELALLOC |
-				     EXTENT_CLEAR_ACCOUNTING |
 				     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);

 			btrfs_end_transaction(trans, root);
@@ -697,6 +698,38 @@ retry:
 	return 0;
 }

+static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
+				      u64 num_bytes)
+{
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
+	u64 alloc_hint = 0;
+
+	read_lock(&em_tree->lock);
+	em = search_extent_mapping(em_tree, start, num_bytes);
+	if (em) {
+		/*
+		 * if block start isn't an actual block number then find the
+		 * first block in this inode and use that as a hint.  If that
+		 * block is also bogus then just don't worry about it.
+		 */
+		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+			free_extent_map(em);
+			em = search_extent_mapping(em_tree, 0, 0);
+			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
+				alloc_hint = em->block_start;
+			if (em)
+				free_extent_map(em);
+		} else {
+			alloc_hint = em->block_start;
+			free_extent_map(em);
+		}
+	}
+	read_unlock(&em_tree->lock);
+
+	return alloc_hint;
+}
+
 /*
  * when extent_io.c finds a delayed allocation range in the file,
  * the call backs end up in this code.  The basic idea is to
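get_extent_allocation_hint, factored out above, is a three-step preference: a real block number for the range being written, else the inode's first real mapping, else no hint. The same decision tree over plain values (EXTENT_MAP_LAST_BYTE modeled as a sentinel):

#include <assert.h>

#define LAST_BYTE 1000   /* models EXTENT_MAP_LAST_BYTE: not a real block */

struct em { long block_start; };

/* Hint selection as in get_extent_allocation_hint: use the mapping for
 * this range if its block number is real, else try the inode's first
 * mapping, else offer no hint at all. */
static long pick_hint(const struct em *range, const struct em *first)
{
	if (range && range->block_start < LAST_BYTE)
		return range->block_start;
	if (first && first->block_start < LAST_BYTE)
		return first->block_start;
	return 0;
}

int main(void)
{
	struct em real = { 512 }, bogus = { LAST_BYTE + 1 };

	assert(pick_hint(&real, NULL) == 512);      /* direct hit */
	assert(pick_hint(&bogus, &real) == 512);    /* fall back */
	assert(pick_hint(&bogus, &bogus) == 0);     /* give up */
	return 0;
}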
@@ -734,6 +767,7 @@ static noinline int cow_file_range(struct inode *inode,
 	trans = btrfs_join_transaction(root, 1);
 	BUG_ON(!trans);
 	btrfs_set_trans_block_group(trans, inode);
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

 	actual_end = min_t(u64, isize, end + 1);

@@ -753,7 +787,6 @@ static noinline int cow_file_range(struct inode *inode,
 				     EXTENT_CLEAR_UNLOCK_PAGE |
 				     EXTENT_CLEAR_UNLOCK |
 				     EXTENT_CLEAR_DELALLOC |
-				     EXTENT_CLEAR_ACCOUNTING |
 				     EXTENT_CLEAR_DIRTY |
 				     EXTENT_SET_WRITEBACK |
 				     EXTENT_END_WRITEBACK);
@@ -769,29 +802,7 @@ static noinline int cow_file_range(struct inode *inode,
 	BUG_ON(disk_num_bytes >
 	       btrfs_super_total_bytes(&root->fs_info->super_copy));

-
-	read_lock(&BTRFS_I(inode)->extent_tree.lock);
-	em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
-				   start, num_bytes);
-	if (em) {
-		/*
-		 * if block start isn't an actual block number then find the
-		 * first block in this inode and use that as a hint.  If that
-		 * block is also bogus then just don't worry about it.
-		 */
-		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-			free_extent_map(em);
-			em = search_extent_mapping(em_tree, 0, 0);
-			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
-				alloc_hint = em->block_start;
-			if (em)
-				free_extent_map(em);
-		} else {
-			alloc_hint = em->block_start;
-			free_extent_map(em);
-		}
-	}
-	read_unlock(&BTRFS_I(inode)->extent_tree.lock);
+	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

 	while (disk_num_bytes > 0) {
@@ -1174,6 +1185,13 @@ out_check:
 					       num_bytes, num_bytes, type);
 		BUG_ON(ret);

+		if (root->root_key.objectid ==
+		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
+			ret = btrfs_reloc_clone_csums(inode, cur_offset,
+						      num_bytes);
+			BUG_ON(ret);
+		}
+
 		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 				cur_offset, cur_offset + num_bytes - 1,
 				locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@ -1226,15 +1244,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 }

 static int btrfs_split_extent_hook(struct inode *inode,
 				   struct extent_state *orig, u64 split)
 {
+	/* not delalloc, ignore it */
 	if (!(orig->state & EXTENT_DELALLOC))
 		return 0;

-	spin_lock(&BTRFS_I(inode)->accounting_lock);
-	BTRFS_I(inode)->outstanding_extents++;
-	spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
+	atomic_inc(&BTRFS_I(inode)->outstanding_extents);
 	return 0;
 }

@@ -1252,10 +1268,7 @@ static int btrfs_merge_extent_hook(struct inode *inode,
 	if (!(other->state & EXTENT_DELALLOC))
 		return 0;

-	spin_lock(&BTRFS_I(inode)->accounting_lock);
-	BTRFS_I(inode)->outstanding_extents--;
-	spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
+	atomic_dec(&BTRFS_I(inode)->outstanding_extents);
 	return 0;
 }

@@ -1264,8 +1277,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
  * bytes in this file, and to maintain the list of inodes that
  * have pending delalloc work to be done.
  */
-static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
-			      unsigned long old, unsigned long bits)
+static int btrfs_set_bit_hook(struct inode *inode,
+			      struct extent_state *state, int *bits)
 {

 	/*
@@ -1273,17 +1286,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 	 * but in this case, we are only testeing for the DELALLOC
 	 * bit, which is only set or cleared with irqs on
 	 */
-	if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
+		u64 len = state->end + 1 - state->start;

-		spin_lock(&BTRFS_I(inode)->accounting_lock);
-		BTRFS_I(inode)->outstanding_extents++;
-		spin_unlock(&BTRFS_I(inode)->accounting_lock);
-		btrfs_delalloc_reserve_space(root, inode, end - start + 1);
+		if (*bits & EXTENT_FIRST_DELALLOC)
+			*bits &= ~EXTENT_FIRST_DELALLOC;
+		else
+			atomic_inc(&BTRFS_I(inode)->outstanding_extents);

 		spin_lock(&root->fs_info->delalloc_lock);
-		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
-		root->fs_info->delalloc_bytes += end - start + 1;
+		BTRFS_I(inode)->delalloc_bytes += len;
+		root->fs_info->delalloc_bytes += len;
 		if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
 			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
 				      &root->fs_info->delalloc_inodes);
@@ -1297,45 +1311,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
  * extent_io.c clear_bit_hook, see set_bit_hook for why
  */
 static int btrfs_clear_bit_hook(struct inode *inode,
-				struct extent_state *state, unsigned long bits)
+				struct extent_state *state, int *bits)
 {
 	/*
 	 * set_bit and clear bit hooks normally require _irqsave/restore
 	 * but in this case, we are only testeing for the DELALLOC
 	 * bit, which is only set or cleared with irqs on
 	 */
-	if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
+		u64 len = state->end + 1 - state->start;

-		if (bits & EXTENT_DO_ACCOUNTING) {
-			spin_lock(&BTRFS_I(inode)->accounting_lock);
-			WARN_ON(!BTRFS_I(inode)->outstanding_extents);
-			BTRFS_I(inode)->outstanding_extents--;
-			spin_unlock(&BTRFS_I(inode)->accounting_lock);
-			btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
-		}
+		if (*bits & EXTENT_FIRST_DELALLOC)
+			*bits &= ~EXTENT_FIRST_DELALLOC;
+		else if (!(*bits & EXTENT_DO_ACCOUNTING))
+			atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+
+		if (*bits & EXTENT_DO_ACCOUNTING)
+			btrfs_delalloc_release_metadata(inode, len);
+
+		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+			btrfs_free_reserved_data_space(inode, len);

 		spin_lock(&root->fs_info->delalloc_lock);
-		if (state->end - state->start + 1 >
-		    root->fs_info->delalloc_bytes) {
-			printk(KERN_INFO "btrfs warning: delalloc account "
-			       "%llu %llu\n",
-			       (unsigned long long)
-			       state->end - state->start + 1,
-			       (unsigned long long)
-			       root->fs_info->delalloc_bytes);
-			btrfs_delalloc_free_space(root, inode, (u64)-1);
-			root->fs_info->delalloc_bytes = 0;
-			BTRFS_I(inode)->delalloc_bytes = 0;
-		} else {
-			btrfs_delalloc_free_space(root, inode,
-						  state->end -
-						  state->start + 1);
-			root->fs_info->delalloc_bytes -= state->end -
-				state->start + 1;
-			BTRFS_I(inode)->delalloc_bytes -= state->end -
-				state->start + 1;
-		}
+		root->fs_info->delalloc_bytes -= len;
+		BTRFS_I(inode)->delalloc_bytes -= len;
+
 		if (BTRFS_I(inode)->delalloc_bytes == 0 &&
 		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
 			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
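On the clear side, the rewritten hook decrements the atomic extent count only for ordinary clears — a clear flagged EXTENT_DO_ACCOUNTING instead releases the metadata reservation, and EXTENT_FIRST_DELALLOC is again consumed rather than counted. A compact model of that branching (flag values invented for the sketch):

#include <assert.h>

#define DELALLOC       (1 << 0)
#define FIRST_DELALLOC (1 << 1)
#define DO_ACCOUNTING  (1 << 2)

static int outstanding;
static long reserved;

/* Clear-side mirror of btrfs_clear_bit_hook: the extent count drops
 * only for a plain clear; an accounting clear releases the metadata
 * reservation instead of touching the count. */
static void clear_bit_hook(int *bits, long len)
{
	if (!(*bits & DELALLOC))
		return;
	if (*bits & FIRST_DELALLOC)
		*bits &= ~FIRST_DELALLOC;
	else if (!(*bits & DO_ACCOUNTING))
		outstanding--;
	if (*bits & DO_ACCOUNTING)
		reserved -= len;
}

int main(void)
{
	int bits;

	outstanding = 1;
	reserved = 4096;

	bits = DELALLOC | DO_ACCOUNTING;
	clear_bit_hook(&bits, 4096);   /* releases reservation, keeps count */
	assert(outstanding == 1 && reserved == 0);

	bits = DELALLOC;
	clear_bit_hook(&bits, 0);      /* plain clear drops the count */
	assert(outstanding == 0);
	return 0;
}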
@@ -1384,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  */
 static int __btrfs_submit_bio_start(struct inode *inode, int rw,
 				    struct bio *bio, int mirror_num,
-				    unsigned long bio_flags)
+				    unsigned long bio_flags,
+				    u64 bio_offset)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
@@ -1403,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
  * are inserted into the btree
  */
 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
-			  int mirror_num, unsigned long bio_flags)
+			  int mirror_num, unsigned long bio_flags,
+			  u64 bio_offset)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1414,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio
  * on write, or reading the csums from the tree before a read
  */
 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-			  int mirror_num, unsigned long bio_flags)
+			  int mirror_num, unsigned long bio_flags,
+			  u64 bio_offset)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
@@ -1439,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 		/* we're doing a write, do the async checksumming */
 		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num,
-				   bio_flags, __btrfs_submit_bio_start,
+				   bio_flags, bio_offset,
+				   __btrfs_submit_bio_start,
 				   __btrfs_submit_bio_done);
 	}

@@ -1520,6 +1525,7 @@ again:
 		goto again;
 	}

+	BUG();
 	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
 	ClearPageChecked(page);
 out:
@@ -1650,7 +1656,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
+	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_ordered_extent *ordered_extent = NULL;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_state *cached_state = NULL;
@@ -1668,9 +1674,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
 		if (!ret) {
 			trans = btrfs_join_transaction(root, 1);
+			btrfs_set_trans_block_group(trans, inode);
+			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 			ret = btrfs_update_inode(trans, root, inode);
 			BUG_ON(ret);
-			btrfs_end_transaction(trans, root);
 		}
 		goto out;
 	}
@@ -1680,6 +1687,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 			 0, &cached_state, GFP_NOFS);

 	trans = btrfs_join_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
 		compressed = 1;
@@ -1711,12 +1720,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
 			  &ordered_extent->list);

-	/* this also removes the ordered extent from the tree */
 	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
 	ret = btrfs_update_inode(trans, root, inode);
 	BUG_ON(ret);
-	btrfs_end_transaction(trans, root);
 out:
+	btrfs_delalloc_release_metadata(inode, ordered_extent->len);
+	if (trans)
+		btrfs_end_transaction(trans, root);
 	/* once for us */
 	btrfs_put_ordered_extent(ordered_extent);
 	/* once for the tree */
@@ -1838,7 +1848,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,

 	BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
 						failrec->last_mirror,
-						failrec->bio_flags);
+						failrec->bio_flags, 0);
 	return 0;
 }

@@ -1993,32 +2003,196 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
 }

 /*
+ * calculate extra metadata reservation when snapshotting a subvolume
+ * contains orphan files.
+ */
+void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
+				struct btrfs_pending_snapshot *pending,
+				u64 *bytes_to_reserve)
+{
+	struct btrfs_root *root;
+	struct btrfs_block_rsv *block_rsv;
+	u64 num_bytes;
+	int index;
+
+	root = pending->root;
+	if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+		return;
+
+	block_rsv = root->orphan_block_rsv;
+
+	/* orphan block reservation for the snapshot */
+	num_bytes = block_rsv->size;
+
+	/*
+	 * after the snapshot is created, COWing tree blocks may use more
+	 * space than it frees. So we should make sure there is enough
+	 * reserved space.
+	 */
+	index = trans->transid & 0x1;
+	if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+		num_bytes += block_rsv->size -
+			     (block_rsv->reserved + block_rsv->freed[index]);
+	}
+
+	*bytes_to_reserve += num_bytes;
+}
+
+void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
+				struct btrfs_pending_snapshot *pending)
+{
+	struct btrfs_root *root = pending->root;
+	struct btrfs_root *snap = pending->snap;
+	struct btrfs_block_rsv *block_rsv;
+	u64 num_bytes;
+	int index;
+	int ret;
+
+	if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+		return;
+
+	/* refill source subvolume's orphan block reservation */
+	block_rsv = root->orphan_block_rsv;
+	index = trans->transid & 0x1;
+	if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+		num_bytes = block_rsv->size -
+			    (block_rsv->reserved + block_rsv->freed[index]);
+		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+					      root->orphan_block_rsv,
+					      num_bytes);
+		BUG_ON(ret);
+	}
+
+	/* setup orphan block reservation for the snapshot */
+	block_rsv = btrfs_alloc_block_rsv(snap);
+	BUG_ON(!block_rsv);
+
+	btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
+	snap->orphan_block_rsv = block_rsv;
+
+	num_bytes = root->orphan_block_rsv->size;
+	ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+				      block_rsv, num_bytes);
+	BUG_ON(ret);
+
+#if 0
+	/* insert orphan item for the snapshot */
+	WARN_ON(!root->orphan_item_inserted);
+	ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+				       snap->root_key.objectid);
+	BUG_ON(ret);
+	snap->orphan_item_inserted = 1;
+#endif
+}
+
+enum btrfs_orphan_cleanup_state {
+	ORPHAN_CLEANUP_STARTED	= 1,
+	ORPHAN_CLEANUP_DONE	= 2,
+};
+
+/*
+ * This is called in transaction commmit time. If there are no orphan
+ * files in the subvolume, it removes orphan item and frees block_rsv
+ * structure.
+ */
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root)
+{
+	int ret;
+
+	if (!list_empty(&root->orphan_list) ||
+	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
+		return;
+
+	if (root->orphan_item_inserted &&
+	    btrfs_root_refs(&root->root_item) > 0) {
+		ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
+					    root->root_key.objectid);
+		BUG_ON(ret);
+		root->orphan_item_inserted = 0;
+	}
+
+	if (root->orphan_block_rsv) {
+		WARN_ON(root->orphan_block_rsv->size > 0);
+		btrfs_free_block_rsv(root, root->orphan_block_rsv);
+		root->orphan_block_rsv = NULL;
+	}
+}
+
+/*
  * This creates an orphan entry for the given inode in case something goes
  * wrong in the middle of an unlink/truncate.
+ *
+ * NOTE: caller of this function should reserve 5 units of metadata for
+ * this function.
  */
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret = 0;
+	struct btrfs_block_rsv *block_rsv = NULL;
+	int reserve = 0;
+	int insert = 0;
+	int ret;

-	spin_lock(&root->list_lock);
+	if (!root->orphan_block_rsv) {
+		block_rsv = btrfs_alloc_block_rsv(root);
+		BUG_ON(!block_rsv);
+	}

-	/* already on the orphan list, we're good */
-	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-		spin_unlock(&root->list_lock);
-		return 0;
+	spin_lock(&root->orphan_lock);
+	if (!root->orphan_block_rsv) {
+		root->orphan_block_rsv = block_rsv;
+	} else if (block_rsv) {
+		btrfs_free_block_rsv(root, block_rsv);
+		block_rsv = NULL;
+	}
+
+	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
+		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+#if 0
+		/*
+		 * For proper ENOSPC handling, we should do orphan
+		 * cleanup when mounting. But this introduces backward
+		 * compatibility issue.
+		 */
+		if (!xchg(&root->orphan_item_inserted, 1))
+			insert = 2;
+		else
+			insert = 1;
+#endif
+		insert = 1;
+	} else {
+		WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
 	}

-	list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+	if (!BTRFS_I(inode)->orphan_meta_reserved) {
+		BTRFS_I(inode)->orphan_meta_reserved = 1;
+		reserve = 1;
+	}
+	spin_unlock(&root->orphan_lock);

-	spin_unlock(&root->list_lock);
+	if (block_rsv)
+		btrfs_add_durable_block_rsv(root->fs_info, block_rsv);

-	/*
-	 * insert an orphan item to track this unlinked/truncated file
-	 */
-	ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+	/* grab metadata reservation from transaction handle */
+	if (reserve) {
+		ret = btrfs_orphan_reserve_metadata(trans, inode);
+		BUG_ON(ret);
+	}

-	return ret;
+	/* insert an orphan item to track this unlinked/truncated file */
+	if (insert >= 1) {
+		ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+		BUG_ON(ret);
+	}
+
+	/* insert an orphan item to track subvolume contains orphan files */
+	if (insert >= 2) {
+		ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+					       root->root_key.objectid);
+		BUG_ON(ret);
+	}
+	return 0;
 }

 /*
@@ -2028,26 +2202,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int delete_item = 0;
+	int release_rsv = 0;
 	int ret = 0;

-	spin_lock(&root->list_lock);
-
-	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
-		spin_unlock(&root->list_lock);
-		return 0;
+	spin_lock(&root->orphan_lock);
+	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+		list_del_init(&BTRFS_I(inode)->i_orphan);
+		delete_item = 1;
 	}

-	list_del_init(&BTRFS_I(inode)->i_orphan);
-	if (!trans) {
-		spin_unlock(&root->list_lock);
-		return 0;
+	if (BTRFS_I(inode)->orphan_meta_reserved) {
+		BTRFS_I(inode)->orphan_meta_reserved = 0;
+		release_rsv = 1;
 	}
+	spin_unlock(&root->orphan_lock);

-	spin_unlock(&root->list_lock);
+	if (trans && delete_item) {
+		ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+		BUG_ON(ret);
+	}

-	ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+	if (release_rsv)
+		btrfs_orphan_release_metadata(inode);

-	return ret;
+	return 0;
 }

 /*
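Both orphan helpers above follow the same shape: decide everything under orphan_lock by flipping flags, then do the expensive tree and reservation work after dropping it, which is what makes repeated add/del calls harmless. A reduced model with a pthread mutex standing in for the spinlock and comments marking where the real calls would go:

#include <assert.h>
#include <pthread.h>

static pthread_mutex_t orphan_lock = PTHREAD_MUTEX_INITIALIZER;
static int on_list, meta_reserved;

/* Decide under the lock, act outside it — the shape of
 * btrfs_orphan_add in the hunk above. */
static void orphan_add(void)
{
	int insert = 0, reserve = 0;

	pthread_mutex_lock(&orphan_lock);
	if (!on_list) {
		on_list = 1;
		insert = 1;
	}
	if (!meta_reserved) {
		meta_reserved = 1;
		reserve = 1;
	}
	pthread_mutex_unlock(&orphan_lock);

	if (reserve) { /* btrfs_orphan_reserve_metadata(...) */ }
	if (insert)  { /* btrfs_insert_orphan_item(...) */ }
}

/* Mirror image of orphan_add, as in btrfs_orphan_del. */
static void orphan_del(void)
{
	int del = 0, release = 0;

	pthread_mutex_lock(&orphan_lock);
	if (on_list) {
		on_list = 0;
		del = 1;
	}
	if (meta_reserved) {
		meta_reserved = 0;
		release = 1;
	}
	pthread_mutex_unlock(&orphan_lock);

	if (del)     { /* btrfs_del_orphan_item(...) */ }
	if (release) { /* btrfs_orphan_release_metadata(...) */ }
}

int main(void)
{
	orphan_add();
	orphan_add();            /* second add is a no-op */
	assert(on_list && meta_reserved);
	orphan_del();
	assert(!on_list && !meta_reserved);
	return 0;
}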
@@ -2064,7 +2243,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 	struct inode *inode;
 	int ret = 0, nr_unlink = 0, nr_truncate = 0;

-	if (!xchg(&root->clean_orphans, 0))
+	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
 		return;

 	path = btrfs_alloc_path();
@@ -2117,16 +2296,15 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		found_key.type = BTRFS_INODE_ITEM_KEY;
 		found_key.offset = 0;
 		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
-		if (IS_ERR(inode))
-			break;
+		BUG_ON(IS_ERR(inode));

 		/*
 		 * add this inode to the orphan list so btrfs_orphan_del does
 		 * the proper thing when we hit it
 		 */
-		spin_lock(&root->list_lock);
+		spin_lock(&root->orphan_lock);
 		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
-		spin_unlock(&root->list_lock);
+		spin_unlock(&root->orphan_lock);

 		/*
 		 * if this is a bad inode, means we actually succeeded in
@@ -2135,7 +2313,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		 * do a destroy_inode
 		 */
 		if (is_bad_inode(inode)) {
-			trans = btrfs_start_transaction(root, 1);
+			trans = btrfs_start_transaction(root, 0);
 			btrfs_orphan_del(trans, inode);
 			btrfs_end_transaction(trans, root);
 			iput(inode);
@@ -2153,13 +2331,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		/* this will do delete_inode and everything for us */
 		iput(inode);
 	}
+	btrfs_free_path(path);
+
+	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
+
+	if (root->orphan_block_rsv)
+		btrfs_block_rsv_release(root, root->orphan_block_rsv,
+					(u64)-1);
+
+	if (root->orphan_block_rsv || root->orphan_item_inserted) {
+		trans = btrfs_join_transaction(root, 1);
+		btrfs_end_transaction(trans, root);
+	}

 	if (nr_unlink)
 		printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
 	if (nr_truncate)
 		printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
-
-	btrfs_free_path(path);
 }

 /*
@@ -2478,29 +2666,201 @@ out:
2478 return ret; 2666 return ret;
2479} 2667}
2480 2668
2481static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2669/* helper to check if there is any shared block in the path */
2670static int check_path_shared(struct btrfs_root *root,
2671 struct btrfs_path *path)
2672{
2673 struct extent_buffer *eb;
2674 int level;
2675 int ret;
2676 u64 refs = 1;
2677
2678 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2679 if (!path->nodes[level])
2680 break;
2681 eb = path->nodes[level];
2682 if (!btrfs_block_can_be_shared(root, eb))
2683 continue;
2684 ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
2685 &refs, NULL);
2686 if (refs > 1)
2687 return 1;
2688 }
2689 return 0;
2690}
2691
2692/*
2693 * helper to start transaction for unlink and rmdir.
2694 *
2695 * unlink and rmdir are special in btrfs, they do not always free space.
2696 * so in enospc case, we should make sure they will free space before
2697 * allowing them to use the global metadata reservation.
2698 */
2699static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2700 struct dentry *dentry)
2482{ 2701{
2483 struct btrfs_root *root;
2484 struct btrfs_trans_handle *trans; 2702 struct btrfs_trans_handle *trans;
2703 struct btrfs_root *root = BTRFS_I(dir)->root;
2704 struct btrfs_path *path;
2705 struct btrfs_inode_ref *ref;
2706 struct btrfs_dir_item *di;
2485 struct inode *inode = dentry->d_inode; 2707 struct inode *inode = dentry->d_inode;
2708 u64 index;
2709 int check_link = 1;
2710 int err = -ENOSPC;
2486 int ret; 2711 int ret;
2487 unsigned long nr = 0;
2488 2712
2489 root = BTRFS_I(dir)->root; 2713 trans = btrfs_start_transaction(root, 10);
2714 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2715 return trans;
2490 2716
2491 /* 2717 if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
2492 * 5 items for unlink inode 2718 return ERR_PTR(-ENOSPC);
2493 * 1 for orphan 2719
2494 */ 2720 /* check if there is someone else holds reference */
2495 ret = btrfs_reserve_metadata_space(root, 6); 2721 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
2496 if (ret) 2722 return ERR_PTR(-ENOSPC);
2497 return ret; 2723
2724 if (atomic_read(&inode->i_count) > 2)
2725 return ERR_PTR(-ENOSPC);
2726
2727 if (xchg(&root->fs_info->enospc_unlink, 1))
2728 return ERR_PTR(-ENOSPC);
2498 2729
2499 trans = btrfs_start_transaction(root, 1); 2730 path = btrfs_alloc_path();
2731 if (!path) {
2732 root->fs_info->enospc_unlink = 0;
2733 return ERR_PTR(-ENOMEM);
2734 }
2735
2736 trans = btrfs_start_transaction(root, 0);
2500 if (IS_ERR(trans)) { 2737 if (IS_ERR(trans)) {
2501 btrfs_unreserve_metadata_space(root, 6); 2738 btrfs_free_path(path);
2502 return PTR_ERR(trans); 2739 root->fs_info->enospc_unlink = 0;
2740 return trans;
2741 }
2742
2743 path->skip_locking = 1;
2744 path->search_commit_root = 1;
2745
2746 ret = btrfs_lookup_inode(trans, root, path,
2747 &BTRFS_I(dir)->location, 0);
2748 if (ret < 0) {
2749 err = ret;
2750 goto out;
2751 }
2752 if (ret == 0) {
2753 if (check_path_shared(root, path))
2754 goto out;
2755 } else {
2756 check_link = 0;
2757 }
2758 btrfs_release_path(root, path);
2759
2760 ret = btrfs_lookup_inode(trans, root, path,
2761 &BTRFS_I(inode)->location, 0);
2762 if (ret < 0) {
2763 err = ret;
2764 goto out;
2765 }
2766 if (ret == 0) {
2767 if (check_path_shared(root, path))
2768 goto out;
2769 } else {
2770 check_link = 0;
2771 }
2772 btrfs_release_path(root, path);
2773
2774 if (ret == 0 && S_ISREG(inode->i_mode)) {
2775 ret = btrfs_lookup_file_extent(trans, root, path,
2776 inode->i_ino, (u64)-1, 0);
2777 if (ret < 0) {
2778 err = ret;
2779 goto out;
2780 }
2781 BUG_ON(ret == 0);
2782 if (check_path_shared(root, path))
2783 goto out;
2784 btrfs_release_path(root, path);
2785 }
2786
2787 if (!check_link) {
2788 err = 0;
2789 goto out;
2790 }
2791
2792 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2793 dentry->d_name.name, dentry->d_name.len, 0);
2794 if (IS_ERR(di)) {
2795 err = PTR_ERR(di);
2796 goto out;
2797 }
2798 if (di) {
2799 if (check_path_shared(root, path))
2800 goto out;
2801 } else {
2802 err = 0;
2803 goto out;
2503 } 2804 }
2805 btrfs_release_path(root, path);
2806
2807 ref = btrfs_lookup_inode_ref(trans, root, path,
2808 dentry->d_name.name, dentry->d_name.len,
2809 inode->i_ino, dir->i_ino, 0);
2810 if (IS_ERR(ref)) {
2811 err = PTR_ERR(ref);
2812 goto out;
2813 }
2814 BUG_ON(!ref);
2815 if (check_path_shared(root, path))
2816 goto out;
2817 index = btrfs_inode_ref_index(path->nodes[0], ref);
2818 btrfs_release_path(root, path);
2819
2820 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
2821 dentry->d_name.name, dentry->d_name.len, 0);
2822 if (IS_ERR(di)) {
2823 err = PTR_ERR(di);
2824 goto out;
2825 }
2826 BUG_ON(ret == -ENOENT);
2827 if (check_path_shared(root, path))
2828 goto out;
2829
2830 err = 0;
2831 out:
2832 btrfs_free_path(path);
2833 if (err) {
2834 btrfs_end_transaction(trans, root);
2835 root->fs_info->enospc_unlink = 0;
2836 return ERR_PTR(err);
2837 }
2838
2839 trans->block_rsv = &root->fs_info->global_block_rsv;
2840 return trans;
2841 }
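
The new __unlink_start_trans() never returns NULL: it hands back either a usable transaction handle or an errno encoded into the pointer itself, which is why its callers only test IS_ERR() and unwrap with PTR_ERR(). A minimal user-space sketch of that kernel idiom, with ERR_PTR() and friends re-implemented for illustration (start_transaction and struct trans are stand-ins, not the kernel API):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)     { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	/* errno-encoded pointers occupy the top 4095 addresses */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct trans { int num_items; };

static struct trans *start_transaction(int num_items)
{
	struct trans *t;

	if (num_items > 5)              /* pretend the reservation fails */
		return ERR_PTR(-ENOSPC);
	t = malloc(sizeof(*t));
	if (!t)
		return ERR_PTR(-ENOMEM);
	t->num_items = num_items;
	return t;
}

int main(void)
{
	struct trans *t = start_transaction(10);

	if (IS_ERR(t))
		printf("start_transaction failed: %ld\n", PTR_ERR(t));
	else
		free(t);
	return 0;
}
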
2842
2843 static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2844 struct btrfs_root *root)
2845 {
2846 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2847 BUG_ON(!root->fs_info->enospc_unlink);
2848 root->fs_info->enospc_unlink = 0;
2849 }
2850 btrfs_end_transaction_throttle(trans, root);
2851 }
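
The xchg() on root->fs_info->enospc_unlink above acts as a one-word try-lock: only one unlink at a time may walk the expensive ENOSPC checks, and any concurrent caller is simply bounced with -ENOSPC until __unlink_end_trans() clears the word. The same shape in portable C11 atomics (toy names, not the kernel's primitives):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int enospc_unlink;   /* stands in for fs_info->enospc_unlink */

/* Returns 1 if this caller won the flag, 0 if somebody already holds it. */
static int try_enter_enospc_path(void)
{
	return atomic_exchange(&enospc_unlink, 1) == 0;
}

static void leave_enospc_path(void)
{
	atomic_store(&enospc_unlink, 0);
}

int main(void)
{
	if (try_enter_enospc_path()) {
		printf("first caller owns the slow path\n");
		if (!try_enter_enospc_path())
			printf("second caller would get -ENOSPC\n");
		leave_enospc_path();
	}
	return 0;
}
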
2852
2853 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2854 {
2855 struct btrfs_root *root = BTRFS_I(dir)->root;
2856 struct btrfs_trans_handle *trans;
2857 struct inode *inode = dentry->d_inode;
2858 int ret;
2859 unsigned long nr = 0;
2860
2861 trans = __unlink_start_trans(dir, dentry);
2862 if (IS_ERR(trans))
2863 return PTR_ERR(trans);
2504 2864
2505 btrfs_set_trans_block_group(trans, dir); 2865 btrfs_set_trans_block_group(trans, dir);
2506 2866
@@ -2508,14 +2868,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2508 2868
2509 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2869 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2510 dentry->d_name.name, dentry->d_name.len); 2870 dentry->d_name.name, dentry->d_name.len);
2871 BUG_ON(ret);
2511 2872
2512 if (inode->i_nlink == 0) 2873 if (inode->i_nlink == 0) {
2513 ret = btrfs_orphan_add(trans, inode); 2874 ret = btrfs_orphan_add(trans, inode);
2875 BUG_ON(ret);
2876 }
2514 2877
2515 nr = trans->blocks_used; 2878 nr = trans->blocks_used;
2516 2879 __unlink_end_trans(trans, root);
2517 btrfs_end_transaction_throttle(trans, root);
2518 btrfs_unreserve_metadata_space(root, 6);
2519 btrfs_btree_balance_dirty(root, nr); 2880 btrfs_btree_balance_dirty(root, nr);
2520 return ret; 2881 return ret;
2521 } 2882 }
@@ -2587,7 +2948,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2587 { 2948 {
2588 struct inode *inode = dentry->d_inode; 2949 struct inode *inode = dentry->d_inode;
2589 int err = 0; 2950 int err = 0;
2590 int ret;
2591 struct btrfs_root *root = BTRFS_I(dir)->root; 2951 struct btrfs_root *root = BTRFS_I(dir)->root;
2592 struct btrfs_trans_handle *trans; 2952 struct btrfs_trans_handle *trans;
2593 unsigned long nr = 0; 2953 unsigned long nr = 0;
@@ -2596,15 +2956,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2596 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 2956 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
2597 return -ENOTEMPTY; 2957 return -ENOTEMPTY;
2598 2958
2599 ret = btrfs_reserve_metadata_space(root, 5); 2959 trans = __unlink_start_trans(dir, dentry);
2600 if (ret) 2960 if (IS_ERR(trans))
2601 return ret;
2602
2603 trans = btrfs_start_transaction(root, 1);
2604 if (IS_ERR(trans)) {
2605 btrfs_unreserve_metadata_space(root, 5);
2606 return PTR_ERR(trans); 2961 return PTR_ERR(trans);
2607 }
2608 2962
2609 btrfs_set_trans_block_group(trans, dir); 2963 btrfs_set_trans_block_group(trans, dir);
2610 2964
@@ -2627,12 +2981,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2627 btrfs_i_size_write(inode, 0); 2981 btrfs_i_size_write(inode, 0);
2629 out: 2982 out:
2629 nr = trans->blocks_used; 2983 nr = trans->blocks_used;
2630 ret = btrfs_end_transaction_throttle(trans, root); 2984 __unlink_end_trans(trans, root);
2631 btrfs_unreserve_metadata_space(root, 5);
2632 btrfs_btree_balance_dirty(root, nr); 2985 btrfs_btree_balance_dirty(root, nr);
2633 2986
2634 if (ret && !err)
2635 err = ret;
2636 return err; 2987 return err;
2637 } 2988 }
2638 2989
@@ -3029,6 +3380,7 @@ out:
3029 if (pending_del_nr) { 3380 if (pending_del_nr) {
3030 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3381 ret = btrfs_del_items(trans, root, path, pending_del_slot,
3031 pending_del_nr); 3382 pending_del_nr);
3383 BUG_ON(ret);
3032 } 3384 }
3033 btrfs_free_path(path); 3385 btrfs_free_path(path);
3034 return err; 3386 return err;
@@ -3056,11 +3408,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3056 3408
3057 if ((offset & (blocksize - 1)) == 0) 3409 if ((offset & (blocksize - 1)) == 0)
3058 goto out; 3410 goto out;
3059 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 3411 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
3060 if (ret)
3061 goto out;
3062
3063 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
3064 if (ret) 3412 if (ret)
3065 goto out; 3413 goto out;
3066 3414
@@ -3068,8 +3416,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3068 again: 3416 again:
3069 page = grab_cache_page(mapping, index); 3417 page = grab_cache_page(mapping, index);
3070 if (!page) { 3418 if (!page) {
3071 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3419 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3072 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3073 goto out; 3420 goto out;
3074 } 3421 }
3075 3422
@@ -3132,8 +3479,7 @@ again:
3132 3479
3133 out_unlock: 3480 out_unlock:
3134 if (ret) 3481 if (ret)
3135 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3482 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3136 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3137 unlock_page(page); 3483 unlock_page(page);
3138 page_cache_release(page); 3484 page_cache_release(page);
3139 out: 3485 out:
@@ -3145,7 +3491,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3145 struct btrfs_trans_handle *trans; 3491 struct btrfs_trans_handle *trans;
3146 struct btrfs_root *root = BTRFS_I(inode)->root; 3492 struct btrfs_root *root = BTRFS_I(inode)->root;
3147 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3493 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3148 struct extent_map *em; 3494 struct extent_map *em = NULL;
3149 struct extent_state *cached_state = NULL; 3495 struct extent_state *cached_state = NULL;
3150 u64 mask = root->sectorsize - 1; 3496 u64 mask = root->sectorsize - 1;
3151 u64 hole_start = (inode->i_size + mask) & ~mask; 3497 u64 hole_start = (inode->i_size + mask) & ~mask;
@@ -3183,11 +3529,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3183 u64 hint_byte = 0; 3529 u64 hint_byte = 0;
3184 hole_size = last_byte - cur_offset; 3530 hole_size = last_byte - cur_offset;
3185 3531
3186 err = btrfs_reserve_metadata_space(root, 2); 3532 trans = btrfs_start_transaction(root, 2);
3187 if (err) 3533 if (IS_ERR(trans)) {
3534 err = PTR_ERR(trans);
3188 break; 3535 break;
3189 3536 }
3190 trans = btrfs_start_transaction(root, 1);
3191 btrfs_set_trans_block_group(trans, inode); 3537 btrfs_set_trans_block_group(trans, inode);
3192 3538
3193 err = btrfs_drop_extents(trans, inode, cur_offset, 3539 err = btrfs_drop_extents(trans, inode, cur_offset,
@@ -3205,14 +3551,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3205 last_byte - 1, 0); 3551 last_byte - 1, 0);
3206 3552
3207 btrfs_end_transaction(trans, root); 3553 btrfs_end_transaction(trans, root);
3208 btrfs_unreserve_metadata_space(root, 2);
3209 } 3554 }
3210 free_extent_map(em); 3555 free_extent_map(em);
3556 em = NULL;
3211 cur_offset = last_byte; 3557 cur_offset = last_byte;
3212 if (cur_offset >= block_end) 3558 if (cur_offset >= block_end)
3213 break; 3559 break;
3214 } 3560 }
3215 3561
3562 free_extent_map(em);
3216 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, 3563 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
3217 GFP_NOFS); 3564 GFP_NOFS);
3218 return err; 3565 return err;
@@ -3239,11 +3586,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3239 } 3586 }
3240 } 3587 }
3241 3588
3242 ret = btrfs_reserve_metadata_space(root, 1); 3589 trans = btrfs_start_transaction(root, 5);
3243 if (ret) 3590 if (IS_ERR(trans))
3244 return ret; 3591 return PTR_ERR(trans);
3245 3592
3246 trans = btrfs_start_transaction(root, 1);
3247 btrfs_set_trans_block_group(trans, inode); 3593 btrfs_set_trans_block_group(trans, inode);
3248 3594
3249 ret = btrfs_orphan_add(trans, inode); 3595 ret = btrfs_orphan_add(trans, inode);
@@ -3251,7 +3597,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3251 3597
3252 nr = trans->blocks_used; 3598 nr = trans->blocks_used;
3253 btrfs_end_transaction(trans, root); 3599 btrfs_end_transaction(trans, root);
3254 btrfs_unreserve_metadata_space(root, 1);
3255 btrfs_btree_balance_dirty(root, nr); 3600 btrfs_btree_balance_dirty(root, nr);
3256 3601
3257 if (attr->ia_size > inode->i_size) { 3602 if (attr->ia_size > inode->i_size) {
@@ -3264,8 +3609,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3264 i_size_write(inode, attr->ia_size); 3609 i_size_write(inode, attr->ia_size);
3265 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 3610 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
3266 3611
3267 trans = btrfs_start_transaction(root, 1); 3612 trans = btrfs_start_transaction(root, 0);
3613 BUG_ON(IS_ERR(trans));
3268 btrfs_set_trans_block_group(trans, inode); 3614 btrfs_set_trans_block_group(trans, inode);
3615 trans->block_rsv = root->orphan_block_rsv;
3616 BUG_ON(!trans->block_rsv);
3269 3617
3270 ret = btrfs_update_inode(trans, root, inode); 3618 ret = btrfs_update_inode(trans, root, inode);
3271 BUG_ON(ret); 3619 BUG_ON(ret);
@@ -3345,10 +3693,21 @@ void btrfs_delete_inode(struct inode *inode)
3345 btrfs_i_size_write(inode, 0); 3693 btrfs_i_size_write(inode, 0);
3346 3694
3347 while (1) { 3695 while (1) {
3348 trans = btrfs_start_transaction(root, 1); 3696 trans = btrfs_start_transaction(root, 0);
3697 BUG_ON(IS_ERR(trans));
3349 btrfs_set_trans_block_group(trans, inode); 3698 btrfs_set_trans_block_group(trans, inode);
3350 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3699 trans->block_rsv = root->orphan_block_rsv;
3700
3701 ret = btrfs_block_rsv_check(trans, root,
3702 root->orphan_block_rsv, 0, 5);
3703 if (ret) {
3704 BUG_ON(ret != -EAGAIN);
3705 ret = btrfs_commit_transaction(trans, root);
3706 BUG_ON(ret);
3707 continue;
3708 }
3351 3709
3710 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3352 if (ret != -EAGAIN) 3711 if (ret != -EAGAIN)
3353 break; 3712 break;
3354 3713
@@ -3356,6 +3715,7 @@ void btrfs_delete_inode(struct inode *inode)
3356 btrfs_end_transaction(trans, root); 3715 btrfs_end_transaction(trans, root);
3357 trans = NULL; 3716 trans = NULL;
3358 btrfs_btree_balance_dirty(root, nr); 3717 btrfs_btree_balance_dirty(root, nr);
3718
3359 } 3719 }
3360 3720
3361 if (ret == 0) { 3721 if (ret == 0) {
@@ -3596,40 +3956,10 @@ again:
3596 return 0; 3956 return 0;
3597 } 3957 }
3598 3958
3599 static noinline void init_btrfs_i(struct inode *inode)
3600 {
3601 struct btrfs_inode *bi = BTRFS_I(inode);
3602
3603 bi->generation = 0;
3604 bi->sequence = 0;
3605 bi->last_trans = 0;
3606 bi->last_sub_trans = 0;
3607 bi->logged_trans = 0;
3608 bi->delalloc_bytes = 0;
3609 bi->reserved_bytes = 0;
3610 bi->disk_i_size = 0;
3611 bi->flags = 0;
3612 bi->index_cnt = (u64)-1;
3613 bi->last_unlink_trans = 0;
3614 bi->ordered_data_close = 0;
3615 bi->force_compress = 0;
3616 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3617 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3618 inode->i_mapping, GFP_NOFS);
3619 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
3620 inode->i_mapping, GFP_NOFS);
3621 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3622 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3623 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3624 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3625 mutex_init(&BTRFS_I(inode)->log_mutex);
3626 }
3627
3628 static int btrfs_init_locked_inode(struct inode *inode, void *p) 3959 static int btrfs_init_locked_inode(struct inode *inode, void *p)
3629 { 3960 {
3630 struct btrfs_iget_args *args = p; 3961 struct btrfs_iget_args *args = p;
3631 inode->i_ino = args->ino; 3962 inode->i_ino = args->ino;
3632 init_btrfs_i(inode);
3633 BTRFS_I(inode)->root = args->root; 3963 BTRFS_I(inode)->root = args->root;
3634 btrfs_set_inode_space_info(args->root, inode); 3964 btrfs_set_inode_space_info(args->root, inode);
3635 return 0; 3965 return 0;
@@ -3692,8 +4022,6 @@ static struct inode *new_simple_dir(struct super_block *s,
3692 if (!inode) 4022 if (!inode)
3693 return ERR_PTR(-ENOMEM); 4023 return ERR_PTR(-ENOMEM);
3694 4024
3695 init_btrfs_i(inode);
3696
3697 BTRFS_I(inode)->root = root; 4025 BTRFS_I(inode)->root = root;
3698 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4026 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
3699 BTRFS_I(inode)->dummy_inode = 1; 4027 BTRFS_I(inode)->dummy_inode = 1;
@@ -3950,7 +4278,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
3950 struct btrfs_trans_handle *trans; 4278 struct btrfs_trans_handle *trans;
3951 int ret = 0; 4279 int ret = 0;
3952 4280
3953 if (root->fs_info->btree_inode == inode) 4281 if (BTRFS_I(inode)->dummy_inode)
3954 return 0; 4282 return 0;
3955 4283
3956 if (wbc->sync_mode == WB_SYNC_ALL) { 4284 if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -3971,10 +4299,38 @@ void btrfs_dirty_inode(struct inode *inode)
3971 { 4299 {
3972 struct btrfs_root *root = BTRFS_I(inode)->root; 4300 struct btrfs_root *root = BTRFS_I(inode)->root;
3973 struct btrfs_trans_handle *trans; 4301 struct btrfs_trans_handle *trans;
4302 int ret;
4303
4304 if (BTRFS_I(inode)->dummy_inode)
4305 return;
3974 4306
3975 trans = btrfs_join_transaction(root, 1); 4307 trans = btrfs_join_transaction(root, 1);
3976 btrfs_set_trans_block_group(trans, inode); 4308 btrfs_set_trans_block_group(trans, inode);
3977 btrfs_update_inode(trans, root, inode); 4309
4310 ret = btrfs_update_inode(trans, root, inode);
4311 if (ret && ret == -ENOSPC) {
4312 /* whoops, let's try again with the full transaction */
4313 btrfs_end_transaction(trans, root);
4314 trans = btrfs_start_transaction(root, 1);
4315 if (IS_ERR(trans)) {
4316 if (printk_ratelimit()) {
4317 printk(KERN_ERR "btrfs: failed to "
4318 "dirty inode %lu error %ld\n",
4319 inode->i_ino, PTR_ERR(trans));
4320 }
4321 return;
4322 }
4323 btrfs_set_trans_block_group(trans, inode);
4324
4325 ret = btrfs_update_inode(trans, root, inode);
4326 if (ret) {
4327 if (printk_ratelimit()) {
4328 printk(KERN_ERR "btrfs: failed to "
4329 "dirty inode %lu error %d\n",
4330 inode->i_ino, ret);
4331 }
4332 }
4333 }
3978 btrfs_end_transaction(trans, root); 4334 btrfs_end_transaction(trans, root);
3979 } 4335 }
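
btrfs_dirty_inode() now tries the cheap join-transaction path first and only restarts with a fully reserved transaction when the update fails for lack of space. A schematic of that two-step retry (the two update functions are placeholders that fake the first failure):

#include <errno.h>
#include <stdio.h>

static int update_inode_cheap(void)    { return -ENOSPC; } /* pretend it fails */
static int update_inode_reserved(void) { return 0; }

static void dirty_inode(unsigned long ino)
{
	int ret = update_inode_cheap();

	if (ret == -ENOSPC) {
		/* whoops, retry with a real reservation */
		ret = update_inode_reserved();
		if (ret)
			fprintf(stderr, "failed to dirty inode %lu: %d\n",
				ino, ret);
	}
}

int main(void)
{
	dirty_inode(42);
	return 0;
}
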
3980 4336
@@ -4092,7 +4448,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4092 * btrfs_get_inode_index_count has an explanation for the magic 4448 * btrfs_get_inode_index_count has an explanation for the magic
4093 * number 4449 * number
4094 */ 4450 */
4095 init_btrfs_i(inode);
4096 BTRFS_I(inode)->index_cnt = 2; 4451 BTRFS_I(inode)->index_cnt = 2;
4097 BTRFS_I(inode)->root = root; 4452 BTRFS_I(inode)->root = root;
4098 BTRFS_I(inode)->generation = trans->transid; 4453 BTRFS_I(inode)->generation = trans->transid;
@@ -4247,26 +4602,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4247 if (!new_valid_dev(rdev)) 4602 if (!new_valid_dev(rdev))
4248 return -EINVAL; 4603 return -EINVAL;
4249 4604
4605 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4606 if (err)
4607 return err;
4608
4250 /* 4609 /*
4251 * 2 for inode item and ref 4610 * 2 for inode item and ref
4252 * 2 for dir items 4611 * 2 for dir items
4253 * 1 for xattr if selinux is on 4612 * 1 for xattr if selinux is on
4254 */ 4613 */
4255 err = btrfs_reserve_metadata_space(root, 5); 4614 trans = btrfs_start_transaction(root, 5);
4256 if (err) 4615 if (IS_ERR(trans))
4257 return err; 4616 return PTR_ERR(trans);
4258 4617
4259 trans = btrfs_start_transaction(root, 1);
4260 if (!trans)
4261 goto fail;
4262 btrfs_set_trans_block_group(trans, dir); 4618 btrfs_set_trans_block_group(trans, dir);
4263 4619
4264 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4265 if (err) {
4266 err = -ENOSPC;
4267 goto out_unlock;
4268 }
4269
4270 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4620 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4271 dentry->d_name.len, 4621 dentry->d_name.len,
4272 dentry->d_parent->d_inode->i_ino, objectid, 4622 dentry->d_parent->d_inode->i_ino, objectid,
@@ -4295,13 +4645,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4295 out_unlock: 4645 out_unlock:
4296 nr = trans->blocks_used; 4646 nr = trans->blocks_used;
4297 btrfs_end_transaction_throttle(trans, root); 4647 btrfs_end_transaction_throttle(trans, root);
4298 fail: 4648 btrfs_btree_balance_dirty(root, nr);
4299 btrfs_unreserve_metadata_space(root, 5);
4300 if (drop_inode) { 4649 if (drop_inode) {
4301 inode_dec_link_count(inode); 4650 inode_dec_link_count(inode);
4302 iput(inode); 4651 iput(inode);
4303 } 4652 }
4304 btrfs_btree_balance_dirty(root, nr);
4305 return err; 4653 return err;
4306 } 4654 }
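
The counts handed to btrfs_start_transaction() come straight from the comment above: one unit per metadata item the operation may touch, reserved up front so mknod cannot die of ENOSPC halfway through. Making that arithmetic explicit (the enum is purely illustrative, not a kernel type):

#include <stdio.h>

enum {
	ITEMS_INODE_AND_REF = 2, /* inode item + inode ref */
	ITEMS_DIR_ENTRIES   = 2, /* dir item + dir index item */
	ITEMS_SELINUX_XATTR = 1, /* security xattr, if selinux is on */
};

int main(void)
{
	/* matches the "5" passed to btrfs_start_transaction() above */
	printf("reserve %d items for mknod\n",
	       ITEMS_INODE_AND_REF + ITEMS_DIR_ENTRIES + ITEMS_SELINUX_XATTR);
	return 0;
}
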
4307 4655
@@ -4311,32 +4659,26 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4311 struct btrfs_trans_handle *trans; 4659 struct btrfs_trans_handle *trans;
4312 struct btrfs_root *root = BTRFS_I(dir)->root; 4660 struct btrfs_root *root = BTRFS_I(dir)->root;
4313 struct inode *inode = NULL; 4661 struct inode *inode = NULL;
4314 int err;
4315 int drop_inode = 0; 4662 int drop_inode = 0;
4663 int err;
4316 unsigned long nr = 0; 4664 unsigned long nr = 0;
4317 u64 objectid; 4665 u64 objectid;
4318 u64 index = 0; 4666 u64 index = 0;
4319 4667
4668 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4669 if (err)
4670 return err;
4320 /* 4671 /*
4321 * 2 for inode item and ref 4672 * 2 for inode item and ref
4322 * 2 for dir items 4673 * 2 for dir items
4323 * 1 for xattr if selinux is on 4674 * 1 for xattr if selinux is on
4324 */ 4675 */
4325 err = btrfs_reserve_metadata_space(root, 5); 4676 trans = btrfs_start_transaction(root, 5);
4326 if (err) 4677 if (IS_ERR(trans))
4327 return err; 4678 return PTR_ERR(trans);
4328 4679
4329 trans = btrfs_start_transaction(root, 1);
4330 if (!trans)
4331 goto fail;
4332 btrfs_set_trans_block_group(trans, dir); 4680 btrfs_set_trans_block_group(trans, dir);
4333 4681
4334 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4335 if (err) {
4336 err = -ENOSPC;
4337 goto out_unlock;
4338 }
4339
4340 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4682 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4341 dentry->d_name.len, 4683 dentry->d_name.len,
4342 dentry->d_parent->d_inode->i_ino, 4684 dentry->d_parent->d_inode->i_ino,
@@ -4368,8 +4710,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4368 out_unlock: 4710 out_unlock:
4369 nr = trans->blocks_used; 4711 nr = trans->blocks_used;
4370 btrfs_end_transaction_throttle(trans, root); 4712 btrfs_end_transaction_throttle(trans, root);
4371 fail:
4372 btrfs_unreserve_metadata_space(root, 5);
4373 if (drop_inode) { 4713 if (drop_inode) {
4374 inode_dec_link_count(inode); 4714 inode_dec_link_count(inode);
4375 iput(inode); 4715 iput(inode);
@@ -4396,21 +4736,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4396 if (root->objectid != BTRFS_I(inode)->root->objectid) 4736 if (root->objectid != BTRFS_I(inode)->root->objectid)
4397 return -EPERM; 4737 return -EPERM;
4398 4738
4399 /*
4400 * 1 item for inode ref
4401 * 2 items for dir items
4402 */
4403 err = btrfs_reserve_metadata_space(root, 3);
4404 if (err)
4405 return err;
4406
4407 btrfs_inc_nlink(inode); 4739 btrfs_inc_nlink(inode);
4408 4740
4409 err = btrfs_set_inode_index(dir, &index); 4741 err = btrfs_set_inode_index(dir, &index);
4410 if (err) 4742 if (err)
4411 goto fail; 4743 goto fail;
4412 4744
4413 trans = btrfs_start_transaction(root, 1); 4745 /*
4746 * 1 item for inode ref
4747 * 2 items for dir items
4748 */
4749 trans = btrfs_start_transaction(root, 3);
4750 if (IS_ERR(trans)) {
4751 err = PTR_ERR(trans);
4752 goto fail;
4753 }
4414 4754
4415 btrfs_set_trans_block_group(trans, dir); 4755 btrfs_set_trans_block_group(trans, dir);
4416 atomic_inc(&inode->i_count); 4756 atomic_inc(&inode->i_count);
@@ -4429,7 +4769,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4429 nr = trans->blocks_used; 4769 nr = trans->blocks_used;
4430 btrfs_end_transaction_throttle(trans, root); 4770 btrfs_end_transaction_throttle(trans, root);
4431 fail: 4771 fail:
4432 btrfs_unreserve_metadata_space(root, 3);
4433 if (drop_inode) { 4772 if (drop_inode) {
4434 inode_dec_link_count(inode); 4773 inode_dec_link_count(inode);
4435 iput(inode); 4774 iput(inode);
@@ -4449,28 +4788,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4449 u64 index = 0; 4788 u64 index = 0;
4450 unsigned long nr = 1; 4789 unsigned long nr = 1;
4451 4790
4791 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4792 if (err)
4793 return err;
4794
4452 /* 4795 /*
4453 * 2 items for inode and ref 4796 * 2 items for inode and ref
4454 * 2 items for dir items 4797 * 2 items for dir items
4455 * 1 for xattr if selinux is on 4798 * 1 for xattr if selinux is on
4456 */ 4799 */
4457 err = btrfs_reserve_metadata_space(root, 5); 4800 trans = btrfs_start_transaction(root, 5);
4458 if (err) 4801 if (IS_ERR(trans))
4459 return err; 4802 return PTR_ERR(trans);
4460
4461 trans = btrfs_start_transaction(root, 1);
4462 if (!trans) {
4463 err = -ENOMEM;
4464 goto out_unlock;
4465 }
4466 btrfs_set_trans_block_group(trans, dir); 4803 btrfs_set_trans_block_group(trans, dir);
4467 4804
4468 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4469 if (err) {
4470 err = -ENOSPC;
4471 goto out_fail;
4472 }
4473
4474 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4805 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4475 dentry->d_name.len, 4806 dentry->d_name.len,
4476 dentry->d_parent->d_inode->i_ino, objectid, 4807 dentry->d_parent->d_inode->i_ino, objectid,
@@ -4510,9 +4841,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4510 out_fail: 4841 out_fail:
4511 nr = trans->blocks_used; 4842 nr = trans->blocks_used;
4512 btrfs_end_transaction_throttle(trans, root); 4843 btrfs_end_transaction_throttle(trans, root);
4513
4514 out_unlock:
4515 btrfs_unreserve_metadata_space(root, 5);
4516 if (drop_on_err) 4844 if (drop_on_err)
4517 iput(inode); 4845 iput(inode);
4518 btrfs_btree_balance_dirty(root, nr); 4846 btrfs_btree_balance_dirty(root, nr);
@@ -4770,6 +5098,7 @@ again:
4770 } 5098 }
4771 flush_dcache_page(page); 5099 flush_dcache_page(page);
4772 } else if (create && PageUptodate(page)) { 5100 } else if (create && PageUptodate(page)) {
5101 WARN_ON(1);
4773 if (!trans) { 5102 if (!trans) {
4774 kunmap(page); 5103 kunmap(page);
4775 free_extent_map(em); 5104 free_extent_map(em);
@@ -4866,11 +5195,651 @@ out:
4866 return em; 5195 return em;
4867 } 5196 }
4868 5197
5198 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5199 u64 start, u64 len)
5200 {
5201 struct btrfs_root *root = BTRFS_I(inode)->root;
5202 struct btrfs_trans_handle *trans;
5203 struct extent_map *em;
5204 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5205 struct btrfs_key ins;
5206 u64 alloc_hint;
5207 int ret;
5208
5209 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5210
5211 trans = btrfs_join_transaction(root, 0);
5212 if (!trans)
5213 return ERR_PTR(-ENOMEM);
5214
5215 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5216
5217 alloc_hint = get_extent_allocation_hint(inode, start, len);
5218 ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
5219 alloc_hint, (u64)-1, &ins, 1);
5220 if (ret) {
5221 em = ERR_PTR(ret);
5222 goto out;
5223 }
5224
5225 em = alloc_extent_map(GFP_NOFS);
5226 if (!em) {
5227 em = ERR_PTR(-ENOMEM);
5228 goto out;
5229 }
5230
5231 em->start = start;
5232 em->orig_start = em->start;
5233 em->len = ins.offset;
5234
5235 em->block_start = ins.objectid;
5236 em->block_len = ins.offset;
5237 em->bdev = root->fs_info->fs_devices->latest_bdev;
5238 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5239
5240 while (1) {
5241 write_lock(&em_tree->lock);
5242 ret = add_extent_mapping(em_tree, em);
5243 write_unlock(&em_tree->lock);
5244 if (ret != -EEXIST)
5245 break;
5246 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
5247 }
5248
5249 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
5250 ins.offset, ins.offset, 0);
5251 if (ret) {
5252 btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
5253 em = ERR_PTR(ret);
5254 }
5255 out:
5256 btrfs_end_transaction(trans, root);
5257 return em;
5258 }
5259
5260 /*
5261 * returns 1 when the nocow is safe, < 0 on error, 0 if the
5262 * block must be cow'd
5263 */
5264 static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
5265 struct inode *inode, u64 offset, u64 len)
5266 {
5267 struct btrfs_path *path;
5268 int ret;
5269 struct extent_buffer *leaf;
5270 struct btrfs_root *root = BTRFS_I(inode)->root;
5271 struct btrfs_file_extent_item *fi;
5272 struct btrfs_key key;
5273 u64 disk_bytenr;
5274 u64 backref_offset;
5275 u64 extent_end;
5276 u64 num_bytes;
5277 int slot;
5278 int found_type;
5279
5280 path = btrfs_alloc_path();
5281 if (!path)
5282 return -ENOMEM;
5283
5284 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
5285 offset, 0);
5286 if (ret < 0)
5287 goto out;
5288
5289 slot = path->slots[0];
5290 if (ret == 1) {
5291 if (slot == 0) {
5292 /* can't find the item, must cow */
5293 ret = 0;
5294 goto out;
5295 }
5296 slot--;
5297 }
5298 ret = 0;
5299 leaf = path->nodes[0];
5300 btrfs_item_key_to_cpu(leaf, &key, slot);
5301 if (key.objectid != inode->i_ino ||
5302 key.type != BTRFS_EXTENT_DATA_KEY) {
5303 /* not our file or wrong item type, must cow */
5304 goto out;
5305 }
5306
5307 if (key.offset > offset) {
5308 /* Wrong offset, must cow */
5309 goto out;
5310 }
5311
5312 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5313 found_type = btrfs_file_extent_type(leaf, fi);
5314 if (found_type != BTRFS_FILE_EXTENT_REG &&
5315 found_type != BTRFS_FILE_EXTENT_PREALLOC) {
5316 /* not a regular extent, must cow */
5317 goto out;
5318 }
5319 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
5320 backref_offset = btrfs_file_extent_offset(leaf, fi);
5321
5322 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
5323 if (extent_end < offset + len) {
5324 /* extent doesn't include our full range, must cow */
5325 goto out;
5326 }
5327
5328 if (btrfs_extent_readonly(root, disk_bytenr))
5329 goto out;
5330
5331 /*
5332 * look for other files referencing this extent, if we
5333 * find any we must cow
5334 */
5335 if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
5336 key.offset - backref_offset, disk_bytenr))
5337 goto out;
5338
5339 /*
5340 * adjust disk_bytenr and num_bytes to cover just the bytes
5341 * in this extent we are about to write. If there
5342 * are any csums in that range we have to cow in order
5343 * to keep the csums correct
5344 */
5345 disk_bytenr += backref_offset;
5346 disk_bytenr += offset - key.offset;
5347 num_bytes = min(offset + len, extent_end) - offset;
5348 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
5349 goto out;
5350 /*
5351 * all of the above have passed, it is safe to overwrite this extent
5352 * without cow
5353 */
5354 ret = 1;
5355 out:
5356 btrfs_free_path(path);
5357 return ret;
5358 }
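
The heart of can_nocow_odirect() is range arithmetic: the file extent found in the tree must start at or before the write and end at or after it, and the disk target of the write is the extent's disk bytenr shifted by the offset into the extent. A simplified stand-alone model of that math (the struct only approximates the btrfs file extent item):

#include <stdint.h>
#include <stdio.h>

struct file_extent {
	uint64_t key_offset;     /* file offset where the item starts */
	uint64_t num_bytes;      /* logical length */
	uint64_t disk_bytenr;    /* where the data lives on disk */
	uint64_t backref_offset; /* offset into the on-disk extent */
};

/*
 * The write [offset, offset + len) may reuse the extent only if the
 * extent fully covers it; *where then receives the disk address of
 * the first byte actually overwritten.
 */
static int covers(const struct file_extent *fe, uint64_t offset,
		  uint64_t len, uint64_t *where)
{
	uint64_t extent_end = fe->key_offset + fe->num_bytes;

	if (fe->key_offset > offset || extent_end < offset + len)
		return 0; /* partial cover: must cow */
	*where = fe->disk_bytenr + fe->backref_offset +
		 (offset - fe->key_offset);
	return 1;
}

int main(void)
{
	struct file_extent fe = { 4096, 16384, 1 << 20, 0 };
	uint64_t where;

	if (covers(&fe, 8192, 4096, &where))
		printf("nocow write lands at disk bytenr %llu\n",
		       (unsigned long long)where);
	return 0;
}
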
5359
5360 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5361 struct buffer_head *bh_result, int create)
5362 {
5363 struct extent_map *em;
5364 struct btrfs_root *root = BTRFS_I(inode)->root;
5365 u64 start = iblock << inode->i_blkbits;
5366 u64 len = bh_result->b_size;
5367 struct btrfs_trans_handle *trans;
5368
5369 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
5370 if (IS_ERR(em))
5371 return PTR_ERR(em);
5372
5373 /*
5374 * Ok, for INLINE and COMPRESSED extents we need to fall back on buffered
5375 * io. INLINE is special, and we could probably kludge it in here, but
5376 * it's still buffered so for safety let's just fall back to the generic
5377 * buffered path.
5378 *
5379 * For COMPRESSED we _have_ to read the entire extent in so we can
5380 * decompress it, so there will be buffering required no matter what we
5381 * do, so go ahead and fall back to buffered.
5382 *
5383 * We return -ENOTBLK because that's what makes DIO go ahead and go back
5384 * to buffered IO. Don't blame me, this is the price we pay for using
5385 * the generic code.
5386 */
5387 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
5388 em->block_start == EXTENT_MAP_INLINE) {
5389 free_extent_map(em);
5390 return -ENOTBLK;
5391 }
5392
5393 /* Just a good old-fashioned hole, return */
5394 if (!create && (em->block_start == EXTENT_MAP_HOLE ||
5395 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5396 free_extent_map(em);
5397 /* DIO will do one hole at a time, so just unlock a sector */
5398 unlock_extent(&BTRFS_I(inode)->io_tree, start,
5399 start + root->sectorsize - 1, GFP_NOFS);
5400 return 0;
5401 }
5402
5403 /*
5404 * We don't allocate a new extent in the following cases
5405 *
5406 * 1) The inode is marked as NODATACOW. In this case we'll just use the
5407 * existing extent.
5408 * 2) The extent is marked as PREALLOC. We're good to go here and can
5409 * just use the extent.
5410 *
5411 */
5412 if (!create) {
5413 len = em->len - (start - em->start);
5414 goto map;
5415 }
5416
5417 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
5418 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
5419 em->block_start != EXTENT_MAP_HOLE)) {
5420 int type;
5421 int ret;
5422 u64 block_start;
5423
5424 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5425 type = BTRFS_ORDERED_PREALLOC;
5426 else
5427 type = BTRFS_ORDERED_NOCOW;
5428 len = min(len, em->len - (start - em->start));
5429 block_start = em->block_start + (start - em->start);
5430
5431 /*
5432 * we're not going to log anything, but we do need
5433 * to make sure the current transaction stays open
5434 * while we look for nocow cross refs
5435 */
5436 trans = btrfs_join_transaction(root, 0);
5437 if (!trans)
5438 goto must_cow;
5439
5440 if (can_nocow_odirect(trans, inode, start, len) == 1) {
5441 ret = btrfs_add_ordered_extent_dio(inode, start,
5442 block_start, len, len, type);
5443 btrfs_end_transaction(trans, root);
5444 if (ret) {
5445 free_extent_map(em);
5446 return ret;
5447 }
5448 goto unlock;
5449 }
5450 btrfs_end_transaction(trans, root);
5451 }
5452 must_cow:
5453 /*
5454 * this will cow the extent, reset the len in case we changed
5455 * it above
5456 */
5457 len = bh_result->b_size;
5458 free_extent_map(em);
5459 em = btrfs_new_extent_direct(inode, start, len);
5460 if (IS_ERR(em))
5461 return PTR_ERR(em);
5462 len = min(len, em->len - (start - em->start));
5463 unlock:
5464 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
5465 EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
5466 0, NULL, GFP_NOFS);
5467 map:
5468 bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
5469 inode->i_blkbits;
5470 bh_result->b_size = len;
5471 bh_result->b_bdev = em->bdev;
5472 set_buffer_mapped(bh_result);
5473 if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5474 set_buffer_new(bh_result);
5475
5476 free_extent_map(em);
5477
5478 return 0;
5479 }
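
At the map: label above, an extent-map hit is converted into the buffer_head fields that __blockdev_direct_IO() consumes: b_blocknr is the on-disk byte address of the requested file position scaled down to block units. The same arithmetic in isolation (toy struct; 4K blocks, i.e. blkbits = 12, assumed):

#include <stdint.h>
#include <stdio.h>

struct toy_extent_map { uint64_t start, len, block_start; };

static uint64_t map_to_blocknr(const struct toy_extent_map *em,
			       uint64_t file_pos, unsigned int blkbits)
{
	/* disk byte address of file_pos, then scaled to block units */
	return (em->block_start + (file_pos - em->start)) >> blkbits;
}

int main(void)
{
	struct toy_extent_map em = { .start = 0x10000, .len = 0x8000,
				     .block_start = 0x400000 };

	printf("b_blocknr = %llu\n",
	       (unsigned long long)map_to_blocknr(&em, 0x12000, 12));
	return 0;
}
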
5480
5481 struct btrfs_dio_private {
5482 struct inode *inode;
5483 u64 logical_offset;
5484 u64 disk_bytenr;
5485 u64 bytes;
5486 u32 *csums;
5487 void *private;
5488 };
5489
5490 static void btrfs_endio_direct_read(struct bio *bio, int err)
5491 {
5492 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
5493 struct bio_vec *bvec = bio->bi_io_vec;
5494 struct btrfs_dio_private *dip = bio->bi_private;
5495 struct inode *inode = dip->inode;
5496 struct btrfs_root *root = BTRFS_I(inode)->root;
5497 u64 start;
5498 u32 *private = dip->csums;
5499
5500 start = dip->logical_offset;
5501 do {
5502 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
5503 struct page *page = bvec->bv_page;
5504 char *kaddr;
5505 u32 csum = ~(u32)0;
5506 unsigned long flags;
5507
5508 local_irq_save(flags);
5509 kaddr = kmap_atomic(page, KM_IRQ0);
5510 csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
5511 csum, bvec->bv_len);
5512 btrfs_csum_final(csum, (char *)&csum);
5513 kunmap_atomic(kaddr, KM_IRQ0);
5514 local_irq_restore(flags);
5515
5516 flush_dcache_page(bvec->bv_page);
5517 if (csum != *private) {
5518 printk(KERN_ERR "btrfs csum failed ino %lu off"
5519 " %llu csum %u private %u\n",
5520 inode->i_ino, (unsigned long long)start,
5521 csum, *private);
5522 err = -EIO;
5523 }
5524 }
5525
5526 start += bvec->bv_len;
5527 private++;
5528 bvec++;
5529 } while (bvec <= bvec_end);
5530
5531 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
5532 dip->logical_offset + dip->bytes - 1, GFP_NOFS);
5533 bio->bi_private = dip->private;
5534
5535 kfree(dip->csums);
5536 kfree(dip);
5537 dio_end_io(bio, err);
5538 }
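
btrfs_endio_direct_read() walks every segment of the completed bio, recomputes the data checksum, and compares it with the value looked up at submit time, turning any mismatch into -EIO. A user-space sketch of that verification walk, with a toy rotate-xor checksum standing in for btrfs_csum_data()/crc32c:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct segment { const uint8_t *buf; size_t len; };

static uint32_t toy_csum(const uint8_t *p, size_t len)
{
	uint32_t c = ~0u;

	while (len--)
		c = (c << 1 | c >> 31) ^ *p++;
	return ~c;
}

static int verify_segments(const struct segment *seg, int nr,
			   const uint32_t *expected)
{
	int err = 0;

	for (int i = 0; i < nr; i++) {
		uint32_t c = toy_csum(seg[i].buf, seg[i].len);

		if (c != expected[i]) {
			fprintf(stderr, "csum failed seg %d: %u vs %u\n",
				i, c, expected[i]);
			err = -EIO; /* keep checking the rest, like the bio loop */
		}
	}
	return err;
}

int main(void)
{
	uint8_t data[16] = "direct io data!";
	struct segment seg[1] = { { data, sizeof(data) } };
	uint32_t good[1] = { toy_csum(data, sizeof(data)) };

	return verify_segments(seg, 1, good) ? 1 : 0;
}
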
5539
5540 static void btrfs_endio_direct_write(struct bio *bio, int err)
5541 {
5542 struct btrfs_dio_private *dip = bio->bi_private;
5543 struct inode *inode = dip->inode;
5544 struct btrfs_root *root = BTRFS_I(inode)->root;
5545 struct btrfs_trans_handle *trans;
5546 struct btrfs_ordered_extent *ordered = NULL;
5547 struct extent_state *cached_state = NULL;
5548 int ret;
5549
5550 if (err)
5551 goto out_done;
5552
5553 ret = btrfs_dec_test_ordered_pending(inode, &ordered,
5554 dip->logical_offset, dip->bytes);
5555 if (!ret)
5556 goto out_done;
5557
5558 BUG_ON(!ordered);
5559
5560 trans = btrfs_join_transaction(root, 1);
5561 if (!trans) {
5562 err = -ENOMEM;
5563 goto out;
5564 }
5565 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5566
5567 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5568 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5569 if (!ret)
5570 ret = btrfs_update_inode(trans, root, inode);
5571 err = ret;
5572 goto out;
5573 }
5574
5575 lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5576 ordered->file_offset + ordered->len - 1, 0,
5577 &cached_state, GFP_NOFS);
5578
5579 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
5580 ret = btrfs_mark_extent_written(trans, inode,
5581 ordered->file_offset,
5582 ordered->file_offset +
5583 ordered->len);
5584 if (ret) {
5585 err = ret;
5586 goto out_unlock;
5587 }
5588 } else {
5589 ret = insert_reserved_file_extent(trans, inode,
5590 ordered->file_offset,
5591 ordered->start,
5592 ordered->disk_len,
5593 ordered->len,
5594 ordered->len,
5595 0, 0, 0,
5596 BTRFS_FILE_EXTENT_REG);
5597 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
5598 ordered->file_offset, ordered->len);
5599 if (ret) {
5600 err = ret;
5601 WARN_ON(1);
5602 goto out_unlock;
5603 }
5604 }
5605
5606 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5607 btrfs_ordered_update_i_size(inode, 0, ordered);
5608 btrfs_update_inode(trans, root, inode);
5609 out_unlock:
5610 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5611 ordered->file_offset + ordered->len - 1,
5612 &cached_state, GFP_NOFS);
5613 out:
5614 btrfs_delalloc_release_metadata(inode, ordered->len);
5615 btrfs_end_transaction(trans, root);
5616 btrfs_put_ordered_extent(ordered);
5617 btrfs_put_ordered_extent(ordered);
5618 out_done:
5619 bio->bi_private = dip->private;
5620
5621 kfree(dip->csums);
5622 kfree(dip);
5623 dio_end_io(bio, err);
5624 }
5625
5626 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
5627 struct bio *bio, int mirror_num,
5628 unsigned long bio_flags, u64 offset)
5629 {
5630 int ret;
5631 struct btrfs_root *root = BTRFS_I(inode)->root;
5632 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
5633 BUG_ON(ret);
5634 return 0;
5635 }
5636
5637 static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5638 loff_t file_offset)
5639 {
5640 struct btrfs_root *root = BTRFS_I(inode)->root;
5641 struct btrfs_dio_private *dip;
5642 struct bio_vec *bvec = bio->bi_io_vec;
5643 u64 start;
5644 int skip_sum;
5645 int write = rw & (1 << BIO_RW);
5646 int ret = 0;
5647
5648 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
5649
5650 dip = kmalloc(sizeof(*dip), GFP_NOFS);
5651 if (!dip) {
5652 ret = -ENOMEM;
5653 goto free_ordered;
5654 }
5655 dip->csums = NULL;
5656
5657 if (!skip_sum) {
5658 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
5659 if (!dip->csums) {
5660 ret = -ENOMEM;
5661 goto free_ordered;
5662 }
5663 }
5664
5665 dip->private = bio->bi_private;
5666 dip->inode = inode;
5667 dip->logical_offset = file_offset;
5668
5669 start = dip->logical_offset;
5670 dip->bytes = 0;
5671 do {
5672 dip->bytes += bvec->bv_len;
5673 bvec++;
5674 } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
5675
5676 dip->disk_bytenr = (u64)bio->bi_sector << 9;
5677 bio->bi_private = dip;
5678
5679 if (write)
5680 bio->bi_end_io = btrfs_endio_direct_write;
5681 else
5682 bio->bi_end_io = btrfs_endio_direct_read;
5683
5684 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
5685 if (ret)
5686 goto out_err;
5687
5688 if (write && !skip_sum) {
5689 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
5690 inode, rw, bio, 0, 0,
5691 dip->logical_offset,
5692 __btrfs_submit_bio_start_direct_io,
5693 __btrfs_submit_bio_done);
5694 if (ret)
5695 goto out_err;
5696 return;
5697 } else if (!skip_sum)
5698 btrfs_lookup_bio_sums_dio(root, inode, bio,
5699 dip->logical_offset, dip->csums);
5700
5701 ret = btrfs_map_bio(root, rw, bio, 0, 1);
5702 if (ret)
5703 goto out_err;
5704 return;
5705 out_err:
5706 kfree(dip->csums);
5707 kfree(dip);
5708 free_ordered:
5709 /*
5710 * If this is a write, we need to clean up the reserved space and kill
5711 * the ordered extent.
5712 */
5713 if (write) {
5714 struct btrfs_ordered_extent *ordered;
5715 ordered = btrfs_lookup_ordered_extent(inode,
5716 dip->logical_offset);
5717 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
5718 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
5719 btrfs_free_reserved_extent(root, ordered->start,
5720 ordered->disk_len);
5721 btrfs_put_ordered_extent(ordered);
5722 btrfs_put_ordered_extent(ordered);
5723 }
5724 bio_endio(bio, ret);
5725 }
5726
5727 static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
5728 const struct iovec *iov, loff_t offset,
5729 unsigned long nr_segs)
5730 {
5731 int seg;
5732 size_t size;
5733 unsigned long addr;
5734 unsigned blocksize_mask = root->sectorsize - 1;
5735 ssize_t retval = -EINVAL;
5736 loff_t end = offset;
5737
5738 if (offset & blocksize_mask)
5739 goto out;
5740
5741 /* Check the memory alignment. Blocks cannot straddle pages */
5742 for (seg = 0; seg < nr_segs; seg++) {
5743 addr = (unsigned long)iov[seg].iov_base;
5744 size = iov[seg].iov_len;
5745 end += size;
5746 if ((addr & blocksize_mask) || (size & blocksize_mask))
5747 goto out;
5748 }
5749 retval = 0;
5750 out:
5751 return retval;
5752 }
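
check_direct_IO() enforces the O_DIRECT contract: the file offset and every iovec base address and length must be sector-aligned, so no block can straddle a page. The same test as a stand-alone function (sectorsize assumed to be a power of two):

#include <stdio.h>
#include <sys/uio.h>

static int dio_aligned(unsigned int sectorsize, long long offset,
		       const struct iovec *iov, unsigned long nr_segs)
{
	unsigned long mask = sectorsize - 1;

	if (offset & mask)
		return 0;
	for (unsigned long seg = 0; seg < nr_segs; seg++) {
		if (((unsigned long)iov[seg].iov_base & mask) ||
		    (iov[seg].iov_len & mask))
			return 0;
	}
	return 1;
}

int main(void)
{
	char buf[8192] __attribute__((aligned(4096)));
	struct iovec iov = { buf, 4096 };

	printf("aligned: %d\n", dio_aligned(4096, 0, &iov, 1));
	printf("misaligned: %d\n", dio_aligned(4096, 512, &iov, 1));
	return 0;
}
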
4869 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, 5753 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4870 const struct iovec *iov, loff_t offset, 5754 const struct iovec *iov, loff_t offset,
4871 unsigned long nr_segs) 5755 unsigned long nr_segs)
4872 { 5756 {
4873 return -EINVAL; 5757 struct file *file = iocb->ki_filp;
5758 struct inode *inode = file->f_mapping->host;
5759 struct btrfs_ordered_extent *ordered;
5760 struct extent_state *cached_state = NULL;
5761 u64 lockstart, lockend;
5762 ssize_t ret;
5763 int writing = rw & WRITE;
5764 int write_bits = 0;
5765 size_t count = iov_length(iov, nr_segs);
5766
5767 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
5768 offset, nr_segs)) {
5769 return 0;
5770 }
5771
5772 lockstart = offset;
5773 lockend = offset + count - 1;
5774
5775 if (writing) {
5776 ret = btrfs_delalloc_reserve_space(inode, count);
5777 if (ret)
5778 goto out;
5779 }
5780
5781 while (1) {
5782 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5783 0, &cached_state, GFP_NOFS);
5784 /*
5785 * We're concerned with the entire range that we're going to be
5786 * doing DIO to, so we need to make sure there are no ordered
5787 * extents in this range.
5788 */
5789 ordered = btrfs_lookup_ordered_range(inode, lockstart,
5790 lockend - lockstart + 1);
5791 if (!ordered)
5792 break;
5793 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5794 &cached_state, GFP_NOFS);
5795 btrfs_start_ordered_extent(inode, ordered, 1);
5796 btrfs_put_ordered_extent(ordered);
5797 cond_resched();
5798 }
5799
5800 /*
5801 * we don't use btrfs_set_extent_delalloc because we don't want
5802 * the dirty or uptodate bits
5803 */
5804 if (writing) {
5805 write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
5806 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5807 EXTENT_DELALLOC, 0, NULL, &cached_state,
5808 GFP_NOFS);
5809 if (ret) {
5810 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
5811 lockend, EXTENT_LOCKED | write_bits,
5812 1, 0, &cached_state, GFP_NOFS);
5813 goto out;
5814 }
5815 }
5816
5817 free_extent_state(cached_state);
5818 cached_state = NULL;
5819
5820 ret = __blockdev_direct_IO(rw, iocb, inode,
5821 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
5822 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
5823 btrfs_submit_direct, 0);
5824
5825 if (ret < 0 && ret != -EIOCBQUEUED) {
5826 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
5827 offset + iov_length(iov, nr_segs) - 1,
5828 EXTENT_LOCKED | write_bits, 1, 0,
5829 &cached_state, GFP_NOFS);
5830 } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
5831 /*
5832 * We're falling back to buffered, unlock the section we didn't
5833 * do IO on.
5834 */
5835 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
5836 offset + iov_length(iov, nr_segs) - 1,
5837 EXTENT_LOCKED | write_bits, 1, 0,
5838 &cached_state, GFP_NOFS);
5839 }
5840 out:
5841 free_extent_state(cached_state);
5842 return ret;
4874 } 5843 }
4875 5844
4876 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 5845 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -5034,7 +6003,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5034 u64 page_start; 6003 u64 page_start;
5035 u64 page_end; 6004 u64 page_end;
5036 6005
5037 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 6006 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
5038 if (ret) { 6007 if (ret) {
5039 if (ret == -ENOMEM) 6008 if (ret == -ENOMEM)
5040 ret = VM_FAULT_OOM; 6009 ret = VM_FAULT_OOM;
@@ -5043,13 +6012,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5043 goto out; 6012 goto out;
5044 } 6013 }
5045 6014
5046 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
5047 if (ret) {
5048 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5049 ret = VM_FAULT_SIGBUS;
5050 goto out;
5051 }
5052
5053 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 6015 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
5054 again: 6016 again:
5055 lock_page(page); 6017 lock_page(page);
@@ -5059,7 +6021,6 @@ again:
5059 6021
5060 if ((page->mapping != inode->i_mapping) || 6022 if ((page->mapping != inode->i_mapping) ||
5061 (page_start >= size)) { 6023 (page_start >= size)) {
5062 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5063 /* page got truncated out from underneath us */ 6024 /* page got truncated out from underneath us */
5064 goto out_unlock; 6025 goto out_unlock;
5065 } 6026 }
@@ -5100,7 +6061,6 @@ again:
5100 unlock_extent_cached(io_tree, page_start, page_end, 6061 unlock_extent_cached(io_tree, page_start, page_end,
5101 &cached_state, GFP_NOFS); 6062 &cached_state, GFP_NOFS);
5102 ret = VM_FAULT_SIGBUS; 6063 ret = VM_FAULT_SIGBUS;
5103 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5104 goto out_unlock; 6064 goto out_unlock;
5105 } 6065 }
5106 ret = 0; 6066 ret = 0;
@@ -5127,10 +6087,10 @@ again:
5127 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 6087 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
5128 6088
5129 out_unlock: 6089 out_unlock:
5130 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
5131 if (!ret) 6090 if (!ret)
5132 return VM_FAULT_LOCKED; 6091 return VM_FAULT_LOCKED;
5133 unlock_page(page); 6092 unlock_page(page);
6093 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
5134 out: 6094 out:
5135 return ret; 6095 return ret;
5136 } 6096 }
@@ -5155,8 +6115,10 @@ static void btrfs_truncate(struct inode *inode)
5155 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 6115 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
5156 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 6116 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
5157 6117
5158 trans = btrfs_start_transaction(root, 1); 6118 trans = btrfs_start_transaction(root, 0);
6119 BUG_ON(IS_ERR(trans));
5159 btrfs_set_trans_block_group(trans, inode); 6120 btrfs_set_trans_block_group(trans, inode);
6121 trans->block_rsv = root->orphan_block_rsv;
5160 6122
5161 /* 6123 /*
5162 * setattr is responsible for setting the ordered_data_close flag, 6124 * setattr is responsible for setting the ordered_data_close flag,
@@ -5179,6 +6141,23 @@ static void btrfs_truncate(struct inode *inode)
5179 btrfs_add_ordered_operation(trans, root, inode); 6141 btrfs_add_ordered_operation(trans, root, inode);
5180 6142
5181 while (1) { 6143 while (1) {
6144 if (!trans) {
6145 trans = btrfs_start_transaction(root, 0);
6146 BUG_ON(IS_ERR(trans));
6147 btrfs_set_trans_block_group(trans, inode);
6148 trans->block_rsv = root->orphan_block_rsv;
6149 }
6150
6151 ret = btrfs_block_rsv_check(trans, root,
6152 root->orphan_block_rsv, 0, 5);
6153 if (ret) {
6154 BUG_ON(ret != -EAGAIN);
6155 ret = btrfs_commit_transaction(trans, root);
6156 BUG_ON(ret);
6157 trans = NULL;
6158 continue;
6159 }
6160
5182 ret = btrfs_truncate_inode_items(trans, root, inode, 6161 ret = btrfs_truncate_inode_items(trans, root, inode,
5183 inode->i_size, 6162 inode->i_size,
5184 BTRFS_EXTENT_DATA_KEY); 6163 BTRFS_EXTENT_DATA_KEY);
@@ -5190,10 +6169,8 @@ static void btrfs_truncate(struct inode *inode)
5190 6169
5191 nr = trans->blocks_used; 6170 nr = trans->blocks_used;
5192 btrfs_end_transaction(trans, root); 6171 btrfs_end_transaction(trans, root);
6172 trans = NULL;
5193 btrfs_btree_balance_dirty(root, nr); 6173 btrfs_btree_balance_dirty(root, nr);
5194
5195 trans = btrfs_start_transaction(root, 1);
5196 btrfs_set_trans_block_group(trans, inode);
5197 } 6174 }
5198 6175
5199 if (ret == 0 && inode->i_nlink > 0) { 6176 if (ret == 0 && inode->i_nlink > 0) {
@@ -5254,21 +6231,47 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
5254 struct inode *btrfs_alloc_inode(struct super_block *sb) 6231 struct inode *btrfs_alloc_inode(struct super_block *sb)
5255{ 6232{
5256 struct btrfs_inode *ei; 6233 struct btrfs_inode *ei;
6234 struct inode *inode;
5257 6235
5258 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); 6236 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
5259 if (!ei) 6237 if (!ei)
5260 return NULL; 6238 return NULL;
6239
6240 ei->root = NULL;
6241 ei->space_info = NULL;
6242 ei->generation = 0;
6243 ei->sequence = 0;
5261 ei->last_trans = 0; 6244 ei->last_trans = 0;
5262 ei->last_sub_trans = 0; 6245 ei->last_sub_trans = 0;
5263 ei->logged_trans = 0; 6246 ei->logged_trans = 0;
5264 ei->outstanding_extents = 0; 6247 ei->delalloc_bytes = 0;
5265 ei->reserved_extents = 0; 6248 ei->reserved_bytes = 0;
5266 ei->root = NULL; 6249 ei->disk_i_size = 0;
6250 ei->flags = 0;
6251 ei->index_cnt = (u64)-1;
6252 ei->last_unlink_trans = 0;
6253
5267 spin_lock_init(&ei->accounting_lock); 6254 spin_lock_init(&ei->accounting_lock);
6255 atomic_set(&ei->outstanding_extents, 0);
6256 ei->reserved_extents = 0;
6257
6258 ei->ordered_data_close = 0;
6259 ei->orphan_meta_reserved = 0;
6260 ei->dummy_inode = 0;
6261 ei->force_compress = 0;
6262
6263 inode = &ei->vfs_inode;
6264 extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
6265 extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
6266 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
6267 mutex_init(&ei->log_mutex);
5268 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6268 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
5269 INIT_LIST_HEAD(&ei->i_orphan); 6269 INIT_LIST_HEAD(&ei->i_orphan);
6270 INIT_LIST_HEAD(&ei->delalloc_inodes);
5270 INIT_LIST_HEAD(&ei->ordered_operations); 6271 INIT_LIST_HEAD(&ei->ordered_operations);
5271 return &ei->vfs_inode; 6272 RB_CLEAR_NODE(&ei->rb_node);
6273
6274 return inode;
5272 } 6275 }
5273 6276
5274 void btrfs_destroy_inode(struct inode *inode) 6277 void btrfs_destroy_inode(struct inode *inode)
@@ -5278,6 +6281,8 @@ void btrfs_destroy_inode(struct inode *inode)
5278 6281
5279 WARN_ON(!list_empty(&inode->i_dentry)); 6282 WARN_ON(!list_empty(&inode->i_dentry));
5280 WARN_ON(inode->i_data.nrpages); 6283 WARN_ON(inode->i_data.nrpages);
6284 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
6285 WARN_ON(BTRFS_I(inode)->reserved_extents);
5281 6286
5282 /* 6287 /*
5283 * This can happen where we create an inode, but somebody else also 6288 * This can happen where we create an inode, but somebody else also
@@ -5298,13 +6303,13 @@ void btrfs_destroy_inode(struct inode *inode)
5298 spin_unlock(&root->fs_info->ordered_extent_lock); 6303 spin_unlock(&root->fs_info->ordered_extent_lock);
5299 } 6304 }
5300 6305
5301 spin_lock(&root->list_lock); 6306 spin_lock(&root->orphan_lock);
5302 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 6307 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
5303 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", 6308 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
5304 inode->i_ino); 6309 inode->i_ino);
5305 list_del_init(&BTRFS_I(inode)->i_orphan); 6310 list_del_init(&BTRFS_I(inode)->i_orphan);
5306 } 6311 }
5307 spin_unlock(&root->list_lock); 6312 spin_unlock(&root->orphan_lock);
5308 6313
5309 while (1) { 6314 while (1) {
5310 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 6315 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -5425,19 +6430,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5425 if (S_ISDIR(old_inode->i_mode) && new_inode && 6430 if (S_ISDIR(old_inode->i_mode) && new_inode &&
5426 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 6431 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
5427 return -ENOTEMPTY; 6432 return -ENOTEMPTY;
5428
5429 /*
5430 * We want to reserve the absolute worst case amount of items. So if
5431 * both inodes are subvols and we need to unlink them then that would
5432 * require 4 item modifications, but if they are both normal inodes it
5433 * would require 5 item modifications, so we'll assume they're normal
5434 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
5435 * should cover the worst case number of items we'll modify.
5436 */
5437 ret = btrfs_reserve_metadata_space(root, 11);
5438 if (ret)
5439 return ret;
5440
5441 /* 6433 /*
5442 * we're using rename to replace one file with another. 6434 * we're using rename to replace one file with another.
5443 * and the replacement file is large. Start IO on it now so 6435 * and the replacement file is large. Start IO on it now so
@@ -5450,8 +6442,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5450 /* close the racy window with snapshot create/destroy ioctl */ 6442 /* close the racy window with snapshot create/destroy ioctl */
5451 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 6443 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5452 down_read(&root->fs_info->subvol_sem); 6444 down_read(&root->fs_info->subvol_sem);
6445 /*
6446 * We want to reserve the absolute worst case amount of items. So if
6447 * both inodes are subvols and we need to unlink them then that would
6448 * require 4 item modifications, but if they are both normal inodes it
6449 * would require 5 item modifications, so we'll assume their normal
6450 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
6451 * should cover the worst case number of items we'll modify.
6452 */
6453 trans = btrfs_start_transaction(root, 20);
6454 if (IS_ERR(trans))
6455 return PTR_ERR(trans);
5453 6456
5454 trans = btrfs_start_transaction(root, 1);
5455 btrfs_set_trans_block_group(trans, new_dir); 6457 btrfs_set_trans_block_group(trans, new_dir);
5456 6458
5457 if (dest != root) 6459 if (dest != root)
@@ -5550,7 +6552,6 @@ out_fail:
5550 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 6552 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5551 up_read(&root->fs_info->subvol_sem); 6553 up_read(&root->fs_info->subvol_sem);
5552 6554
5553 btrfs_unreserve_metadata_space(root, 11);
5554 return ret; 6555 return ret;
5555 } 6556 }
5556 6557
@@ -5602,6 +6603,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
5602 return 0; 6603 return 0;
5603 } 6604 }
5604 6605
6606 int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
6607 {
6608 struct btrfs_inode *binode;
6609 struct inode *inode = NULL;
6610
6611 spin_lock(&root->fs_info->delalloc_lock);
6612 while (!list_empty(&root->fs_info->delalloc_inodes)) {
6613 binode = list_entry(root->fs_info->delalloc_inodes.next,
6614 struct btrfs_inode, delalloc_inodes);
6615 inode = igrab(&binode->vfs_inode);
6616 if (inode) {
6617 list_move_tail(&binode->delalloc_inodes,
6618 &root->fs_info->delalloc_inodes);
6619 break;
6620 }
6621
6622 list_del_init(&binode->delalloc_inodes);
6623 cond_resched_lock(&root->fs_info->delalloc_lock);
6624 }
6625 spin_unlock(&root->fs_info->delalloc_lock);
6626
6627 if (inode) {
6628 write_inode_now(inode, 0);
6629 if (delay_iput)
6630 btrfs_add_delayed_iput(inode);
6631 else
6632 iput(inode);
6633 return 1;
6634 }
6635 return 0;
6636 }
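
btrfs_start_one_delalloc_inode() holds the shared lock only long enough to pick a single inode off the delalloc list, re-queueing it at the tail, and does the actual writeback after the lock is dropped. A pthread skeleton of the same take-one-under-the-lock pattern (toy list node; the kernel pins the inode with igrab()/iput() under a spinlock):

#include <pthread.h>
#include <stdio.h>

struct node { struct node *next; int id; int live; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static struct node *take_one(void)
{
	struct node *n = NULL;

	pthread_mutex_lock(&list_lock);
	while (head) {
		if (head->live) {       /* igrab() succeeded, in the original */
			n = head;
			break;
		}
		head = head->next;      /* stale entry: drop it, keep looking */
	}
	pthread_mutex_unlock(&list_lock);
	return n;
}

int main(void)
{
	struct node b = { NULL, 2, 1 }, a = { &b, 1, 0 };
	struct node *n;

	head = &a;
	n = take_one();
	if (n)
		printf("flushing inode %d outside the lock\n", n->id);
	return 0;
}
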
6637
5605 static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 6638 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5606 const char *symname) 6639 const char *symname)
5607 { 6640 {
@@ -5625,26 +6658,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5625 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 6658 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
5626 return -ENAMETOOLONG; 6659 return -ENAMETOOLONG;
5627 6660
6661 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
6662 if (err)
6663 return err;
5628 /* 6664 /*
5629 * 2 items for inode item and ref 6665 * 2 items for inode item and ref
5630 * 2 items for dir items 6666 * 2 items for dir items
5631 * 1 item for xattr if selinux is on 6667 * 1 item for xattr if selinux is on
5632 */ 6668 */
5633 err = btrfs_reserve_metadata_space(root, 5); 6669 trans = btrfs_start_transaction(root, 5);
5634 if (err) 6670 if (IS_ERR(trans))
5635 return err; 6671 return PTR_ERR(trans);
5636 6672
5637 trans = btrfs_start_transaction(root, 1);
5638 if (!trans)
5639 goto out_fail;
5640 btrfs_set_trans_block_group(trans, dir); 6673 btrfs_set_trans_block_group(trans, dir);
5641 6674
5642 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
5643 if (err) {
5644 err = -ENOSPC;
5645 goto out_unlock;
5646 }
5647
5648 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6675 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5649 dentry->d_name.len, 6676 dentry->d_name.len,
5650 dentry->d_parent->d_inode->i_ino, objectid, 6677 dentry->d_parent->d_inode->i_ino, objectid,
@@ -5716,8 +6743,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5716out_unlock: 6743out_unlock:
5717 nr = trans->blocks_used; 6744 nr = trans->blocks_used;
5718 btrfs_end_transaction_throttle(trans, root); 6745 btrfs_end_transaction_throttle(trans, root);
5719out_fail:
5720 btrfs_unreserve_metadata_space(root, 5);
5721 if (drop_inode) { 6746 if (drop_inode) {
5722 inode_dec_link_count(inode); 6747 inode_dec_link_count(inode);
5723 iput(inode); 6748 iput(inode);
@@ -5726,33 +6751,28 @@ out_fail:
5726 return err; 6751 return err;
5727} 6752}
5728 6753
5729 static int prealloc_file_range(struct inode *inode, u64 start, u64 end, 6754 int btrfs_prealloc_file_range(struct inode *inode, int mode,
5730 u64 alloc_hint, int mode, loff_t actual_len) 6755 u64 start, u64 num_bytes, u64 min_size,
6756 loff_t actual_len, u64 *alloc_hint)
5731 { 6757 {
5732 struct btrfs_trans_handle *trans; 6758 struct btrfs_trans_handle *trans;
5733 struct btrfs_root *root = BTRFS_I(inode)->root; 6759 struct btrfs_root *root = BTRFS_I(inode)->root;
5734 struct btrfs_key ins; 6760 struct btrfs_key ins;
5735 u64 cur_offset = start; 6761 u64 cur_offset = start;
5736 u64 num_bytes = end - start;
5737 int ret = 0; 6762 int ret = 0;
5738 u64 i_size;
5739 6763
5740 while (num_bytes > 0) { 6764 while (num_bytes > 0) {
5741 trans = btrfs_start_transaction(root, 1); 6765 trans = btrfs_start_transaction(root, 3);
5742 6766 if (IS_ERR(trans)) {
5743 ret = btrfs_reserve_extent(trans, root, num_bytes, 6767 ret = PTR_ERR(trans);
5744 root->sectorsize, 0, alloc_hint, 6768 break;
5745 (u64)-1, &ins, 1);
5746 if (ret) {
5747 WARN_ON(1);
5748 goto stop_trans;
5749 } 6769 }
5750 6770
5751 ret = btrfs_reserve_metadata_space(root, 3); 6771 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
6772 0, *alloc_hint, (u64)-1, &ins, 1);
5752 if (ret) { 6773 if (ret) {
5753 btrfs_free_reserved_extent(root, ins.objectid, 6774 btrfs_end_transaction(trans, root);
5754 ins.offset); 6775 break;
5755 goto stop_trans;
5756 } 6776 }
5757 6777
5758 ret = insert_reserved_file_extent(trans, inode, 6778 ret = insert_reserved_file_extent(trans, inode,
@@ -5766,34 +6786,27 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
5766 6786
5767 num_bytes -= ins.offset; 6787 num_bytes -= ins.offset;
5768 cur_offset += ins.offset; 6788 cur_offset += ins.offset;
5769 alloc_hint = ins.objectid + ins.offset; 6789 *alloc_hint = ins.objectid + ins.offset;
5770 6790
5771 inode->i_ctime = CURRENT_TIME; 6791 inode->i_ctime = CURRENT_TIME;
5772 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 6792 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
5773 if (!(mode & FALLOC_FL_KEEP_SIZE) && 6793 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
5774 (actual_len > inode->i_size) && 6794 (actual_len > inode->i_size) &&
5775 (cur_offset > inode->i_size)) { 6795 (cur_offset > inode->i_size)) {
5776
5777 if (cur_offset > actual_len) 6796 if (cur_offset > actual_len)
5778 i_size = actual_len; 6797 i_size_write(inode, actual_len);
5779 else 6798 else
5780 i_size = cur_offset; 6799 i_size_write(inode, cur_offset);
5781 i_size_write(inode, i_size); 6800 i_size_write(inode, cur_offset);
5782 btrfs_ordered_update_i_size(inode, i_size, NULL); 6801 btrfs_ordered_update_i_size(inode, cur_offset, NULL);
5783 } 6802 }
5784 6803
5785 ret = btrfs_update_inode(trans, root, inode); 6804 ret = btrfs_update_inode(trans, root, inode);
5786 BUG_ON(ret); 6805 BUG_ON(ret);
5787 6806
5788 btrfs_end_transaction(trans, root); 6807 btrfs_end_transaction(trans, root);
5789 btrfs_unreserve_metadata_space(root, 3);
5790 } 6808 }
5791 return ret; 6809 return ret;
5792
5793stop_trans:
5794 btrfs_end_transaction(trans, root);
5795 return ret;
5796
5797} 6810}
5798 6811
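prealloc_file_range() becomes the exported btrfs_prealloc_file_range(): the (start, end) pair turns into (start, num_bytes), a min_size for the smallest acceptable extent is added, and the allocation hint is passed by pointer so it carries over between calls. A hedged sketch of a caller; the sizes and mode below are illustrative only:

        u64 alloc_hint = 0;
        int ret;

        /* preallocate 1MiB at offset 0, illustrative values */
        ret = btrfs_prealloc_file_range(inode, 0 /* mode */, 0 /* start */,
                                        1024 * 1024 /* num_bytes */,
                                        BTRFS_I(inode)->root->sectorsize,
                                        1024 * 1024 /* actual_len */,
                                        &alloc_hint);
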
5799static long btrfs_fallocate(struct inode *inode, int mode, 6812static long btrfs_fallocate(struct inode *inode, int mode,
@@ -5826,8 +6839,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5826 goto out; 6839 goto out;
5827 } 6840 }
5828 6841
5829 ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode, 6842 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
5830 alloc_end - alloc_start);
5831 if (ret) 6843 if (ret)
5832 goto out; 6844 goto out;
5833 6845
@@ -5872,16 +6884,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5872 if (em->block_start == EXTENT_MAP_HOLE || 6884 if (em->block_start == EXTENT_MAP_HOLE ||
5873 (cur_offset >= inode->i_size && 6885 (cur_offset >= inode->i_size &&
5874 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 6886 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5875 ret = prealloc_file_range(inode, 6887 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
5876 cur_offset, last_byte, 6888 last_byte - cur_offset,
5877 alloc_hint, mode, offset+len); 6889 1 << inode->i_blkbits,
6890 offset + len,
6891 &alloc_hint);
5878 if (ret < 0) { 6892 if (ret < 0) {
5879 free_extent_map(em); 6893 free_extent_map(em);
5880 break; 6894 break;
5881 } 6895 }
5882 } 6896 }
5883 if (em->block_start <= EXTENT_MAP_LAST_BYTE)
5884 alloc_hint = em->block_start;
5885 free_extent_map(em); 6897 free_extent_map(em);
5886 6898
5887 cur_offset = last_byte; 6899 cur_offset = last_byte;
@@ -5893,8 +6905,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5893 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 6905 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5894 &cached_state, GFP_NOFS); 6906 &cached_state, GFP_NOFS);
5895 6907
5896 btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, 6908 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
5897 alloc_end - alloc_start);
5898out: 6909out:
5899 mutex_unlock(&inode->i_mutex); 6910 mutex_unlock(&inode->i_mutex);
5900 return ret; 6911 return ret;
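btrfs_check_data_free_space() and btrfs_free_reserved_data_space() drop their root argument and take just the inode and a byte count, as the fallocate hunks above show; a reservation must still be matched by a release on every path:

        ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
        if (ret)
                goto out;
        /* ... allocate the extents ... */
        btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
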
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 97a97839a867..4dbaf89b1337 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -239,23 +239,19 @@ static noinline int create_subvol(struct btrfs_root *root,
239 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; 239 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
240 u64 index = 0; 240 u64 index = 0;
241 241
242 ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
243 0, &objectid);
244 if (ret)
245 return ret;
242 /* 246 /*
243 * 1 - inode item 247 * 1 - inode item
244 * 2 - refs 248 * 2 - refs
245 * 1 - root item 249 * 1 - root item
246 * 2 - dir items 250 * 2 - dir items
247 */ 251 */
248 ret = btrfs_reserve_metadata_space(root, 6); 252 trans = btrfs_start_transaction(root, 6);
249 if (ret) 253 if (IS_ERR(trans))
250 return ret; 254 return PTR_ERR(trans);
251
252 trans = btrfs_start_transaction(root, 1);
253 BUG_ON(!trans);
254
255 ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
256 0, &objectid);
257 if (ret)
258 goto fail;
259 255
260 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 256 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
261 0, objectid, NULL, 0, 0, 0); 257 0, objectid, NULL, 0, 0, 0);
@@ -345,13 +341,10 @@ fail:
345 err = btrfs_commit_transaction(trans, root); 341 err = btrfs_commit_transaction(trans, root);
346 if (err && !ret) 342 if (err && !ret)
347 ret = err; 343 ret = err;
348
349 btrfs_unreserve_metadata_space(root, 6);
350 return ret; 344 return ret;
351} 345}
352 346
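create_subvol() gets the same conversion: the objectid lookup moves ahead of the transaction (passing a NULL handle), so a lookup failure needs no transaction unwind, and the six-item reservation rides on btrfs_start_transaction():

        ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
                                       0, &objectid);
        if (ret)
                return ret;

        /* 1 inode item + 2 refs + 1 root item + 2 dir items */
        trans = btrfs_start_transaction(root, 6);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
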
353static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 347static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
354 char *name, int namelen)
355{ 348{
356 struct inode *inode; 349 struct inode *inode;
357 struct btrfs_pending_snapshot *pending_snapshot; 350 struct btrfs_pending_snapshot *pending_snapshot;
@@ -361,40 +354,33 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
361 if (!root->ref_cows) 354 if (!root->ref_cows)
362 return -EINVAL; 355 return -EINVAL;
363 356
364 /*
365 * 1 - inode item
366 * 2 - refs
367 * 1 - root item
368 * 2 - dir items
369 */
370 ret = btrfs_reserve_metadata_space(root, 6);
371 if (ret)
372 goto fail;
373
374 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 357 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
375 if (!pending_snapshot) { 358 if (!pending_snapshot)
376 ret = -ENOMEM; 359 return -ENOMEM;
377 btrfs_unreserve_metadata_space(root, 6); 360
378 goto fail; 361 btrfs_init_block_rsv(&pending_snapshot->block_rsv);
379 }
380 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
381 if (!pending_snapshot->name) {
382 ret = -ENOMEM;
383 kfree(pending_snapshot);
384 btrfs_unreserve_metadata_space(root, 6);
385 goto fail;
386 }
387 memcpy(pending_snapshot->name, name, namelen);
388 pending_snapshot->name[namelen] = '\0';
389 pending_snapshot->dentry = dentry; 362 pending_snapshot->dentry = dentry;
390 trans = btrfs_start_transaction(root, 1);
391 BUG_ON(!trans);
392 pending_snapshot->root = root; 363 pending_snapshot->root = root;
364
365 trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
366 if (IS_ERR(trans)) {
367 ret = PTR_ERR(trans);
368 goto fail;
369 }
370
371 ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
372 BUG_ON(ret);
373
393 list_add(&pending_snapshot->list, 374 list_add(&pending_snapshot->list,
394 &trans->transaction->pending_snapshots); 375 &trans->transaction->pending_snapshots);
395 ret = btrfs_commit_transaction(trans, root); 376 ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
396 BUG_ON(ret); 377 BUG_ON(ret);
397 btrfs_unreserve_metadata_space(root, 6); 378
379 ret = pending_snapshot->error;
380 if (ret)
381 goto fail;
382
383 btrfs_orphan_cleanup(pending_snapshot->snap);
398 384
399 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 385 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
400 if (IS_ERR(inode)) { 386 if (IS_ERR(inode)) {
@@ -405,6 +391,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
405 d_instantiate(dentry, inode); 391 d_instantiate(dentry, inode);
406 ret = 0; 392 ret = 0;
407fail: 393fail:
394 kfree(pending_snapshot);
408 return ret; 395 return ret;
409} 396}
410 397
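create_snapshot() loses its name/namelen arguments, since the dentry already carries the name, and struct btrfs_pending_snapshot gains a block reservation plus an error field filled in by the commit path. The new flow, condensed from the hunk above:

        btrfs_init_block_rsv(&pending_snapshot->block_rsv);
        pending_snapshot->dentry = dentry;      /* name comes from the dentry */
        pending_snapshot->root = root;

        ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
        list_add(&pending_snapshot->list,
                 &trans->transaction->pending_snapshots);
        ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
        if (!ret)
                ret = pending_snapshot->error;  /* recorded during commit */
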
@@ -456,8 +443,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
456 goto out_up_read; 443 goto out_up_read;
457 444
458 if (snap_src) { 445 if (snap_src) {
459 error = create_snapshot(snap_src, dentry, 446 error = create_snapshot(snap_src, dentry);
460 name, namelen);
461 } else { 447 } else {
462 error = create_subvol(BTRFS_I(dir)->root, dentry, 448 error = create_subvol(BTRFS_I(dir)->root, dentry,
463 name, namelen); 449 name, namelen);
@@ -601,19 +587,9 @@ static int btrfs_defrag_file(struct file *file,
601 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 587 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
602 BTRFS_I(inode)->force_compress = 1; 588 BTRFS_I(inode)->force_compress = 1;
603 589
604 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 590 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
605 if (ret) { 591 if (ret)
606 ret = -ENOSPC; 592 goto err_unlock;
607 break;
608 }
609
610 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
611 if (ret) {
612 btrfs_free_reserved_data_space(root, inode,
613 PAGE_CACHE_SIZE);
614 ret = -ENOSPC;
615 break;
616 }
617again: 593again:
618 if (inode->i_size == 0 || 594 if (inode->i_size == 0 ||
619 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) { 595 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
@@ -622,8 +598,10 @@ again:
622 } 598 }
623 599
624 page = grab_cache_page(inode->i_mapping, i); 600 page = grab_cache_page(inode->i_mapping, i);
625 if (!page) 601 if (!page) {
602 ret = -ENOMEM;
626 goto err_reservations; 603 goto err_reservations;
604 }
627 605
628 if (!PageUptodate(page)) { 606 if (!PageUptodate(page)) {
629 btrfs_readpage(NULL, page); 607 btrfs_readpage(NULL, page);
@@ -631,6 +609,7 @@ again:
631 if (!PageUptodate(page)) { 609 if (!PageUptodate(page)) {
632 unlock_page(page); 610 unlock_page(page);
633 page_cache_release(page); 611 page_cache_release(page);
612 ret = -EIO;
634 goto err_reservations; 613 goto err_reservations;
635 } 614 }
636 } 615 }
@@ -644,8 +623,7 @@ again:
644 wait_on_page_writeback(page); 623 wait_on_page_writeback(page);
645 624
646 if (PageDirty(page)) { 625 if (PageDirty(page)) {
647 btrfs_free_reserved_data_space(root, inode, 626 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
648 PAGE_CACHE_SIZE);
649 goto loop_unlock; 627 goto loop_unlock;
650 } 628 }
651 629
@@ -683,7 +661,6 @@ loop_unlock:
683 page_cache_release(page); 661 page_cache_release(page);
684 mutex_unlock(&inode->i_mutex); 662 mutex_unlock(&inode->i_mutex);
685 663
686 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
687 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); 664 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
688 i++; 665 i++;
689 } 666 }
@@ -713,9 +690,9 @@ loop_unlock:
713 return 0; 690 return 0;
714 691
715err_reservations: 692err_reservations:
693 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
694err_unlock:
716 mutex_unlock(&inode->i_mutex); 695 mutex_unlock(&inode->i_mutex);
717 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
718 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
719 return ret; 696 return ret;
720} 697}
721 698
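In the defrag loop the separate data-space check and per-inode metadata reservation collapse into the combined delalloc helpers; one call now reserves both, and one call releases both on failure:

        ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
        if (ret)
                goto err_unlock;
        /* ... page work; on any later failure ... */
        btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
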
@@ -811,7 +788,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
811 device->name, (unsigned long long)new_size); 788 device->name, (unsigned long long)new_size);
812 789
813 if (new_size > old_size) { 790 if (new_size > old_size) {
814 trans = btrfs_start_transaction(root, 1); 791 trans = btrfs_start_transaction(root, 0);
815 ret = btrfs_grow_device(trans, device, new_size); 792 ret = btrfs_grow_device(trans, device, new_size);
816 btrfs_commit_transaction(trans, root); 793 btrfs_commit_transaction(trans, root);
817 } else { 794 } else {
@@ -1300,7 +1277,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1300 if (err) 1277 if (err)
1301 goto out_up_write; 1278 goto out_up_write;
1302 1279
1303 trans = btrfs_start_transaction(root, 1); 1280 trans = btrfs_start_transaction(root, 0);
1281 if (IS_ERR(trans)) {
1282 err = PTR_ERR(trans);
1283 goto out_up_write;
1284 }
1285 trans->block_rsv = &root->fs_info->global_block_rsv;
1286
1304 ret = btrfs_unlink_subvol(trans, root, dir, 1287 ret = btrfs_unlink_subvol(trans, root, dir,
1305 dest->root_key.objectid, 1288 dest->root_key.objectid,
1306 dentry->d_name.name, 1289 dentry->d_name.name,
@@ -1314,10 +1297,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1314 dest->root_item.drop_level = 0; 1297 dest->root_item.drop_level = 0;
1315 btrfs_set_root_refs(&dest->root_item, 0); 1298 btrfs_set_root_refs(&dest->root_item, 0);
1316 1299
1317 ret = btrfs_insert_orphan_item(trans, 1300 if (!xchg(&dest->orphan_item_inserted, 1)) {
1318 root->fs_info->tree_root, 1301 ret = btrfs_insert_orphan_item(trans,
1319 dest->root_key.objectid); 1302 root->fs_info->tree_root,
1320 BUG_ON(ret); 1303 dest->root_key.objectid);
1304 BUG_ON(ret);
1305 }
1321 1306
1322 ret = btrfs_commit_transaction(trans, root); 1307 ret = btrfs_commit_transaction(trans, root);
1323 BUG_ON(ret); 1308 BUG_ON(ret);
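The xchg() above is a once-only guard: the first caller to flip orphan_item_inserted from 0 to 1 inserts the orphan item, and anyone who finds the flag already set skips the insert, so the subvolume never gets a duplicate orphan item:

        if (!xchg(&dest->orphan_item_inserted, 1)) {
                ret = btrfs_insert_orphan_item(trans,
                                               root->fs_info->tree_root,
                                               dest->root_key.objectid);
                BUG_ON(ret);
        }
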
@@ -1358,8 +1343,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1358 ret = -EPERM; 1343 ret = -EPERM;
1359 goto out; 1344 goto out;
1360 } 1345 }
1361 btrfs_defrag_root(root, 0); 1346 ret = btrfs_defrag_root(root, 0);
1362 btrfs_defrag_root(root->fs_info->extent_root, 0); 1347 if (ret)
1348 goto out;
1349 ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
1363 break; 1350 break;
1364 case S_IFREG: 1351 case S_IFREG:
1365 if (!(file->f_mode & FMODE_WRITE)) { 1352 if (!(file->f_mode & FMODE_WRITE)) {
@@ -1389,9 +1376,11 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1389 /* the rest are all set to zero by kzalloc */ 1376 /* the rest are all set to zero by kzalloc */
1390 range->len = (u64)-1; 1377 range->len = (u64)-1;
1391 } 1378 }
1392 btrfs_defrag_file(file, range); 1379 ret = btrfs_defrag_file(file, range);
1393 kfree(range); 1380 kfree(range);
1394 break; 1381 break;
1382 default:
1383 ret = -EINVAL;
1395 } 1384 }
1396out: 1385out:
1397 mnt_drop_write(file->f_path.mnt); 1386 mnt_drop_write(file->f_path.mnt);
@@ -1550,12 +1539,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1550 btrfs_wait_ordered_range(src, off, off+len); 1539 btrfs_wait_ordered_range(src, off, off+len);
1551 } 1540 }
1552 1541
1553 trans = btrfs_start_transaction(root, 1);
1554 BUG_ON(!trans);
1555
1556 /* punch hole in destination first */
1557 btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
1558
1559 /* clone data */ 1542 /* clone data */
1560 key.objectid = src->i_ino; 1543 key.objectid = src->i_ino;
1561 key.type = BTRFS_EXTENT_DATA_KEY; 1544 key.type = BTRFS_EXTENT_DATA_KEY;
@@ -1566,7 +1549,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1566 * note the key will change type as we walk through the 1549 * note the key will change type as we walk through the
1567 * tree. 1550 * tree.
1568 */ 1551 */
1569 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 1552 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1570 if (ret < 0) 1553 if (ret < 0)
1571 goto out; 1554 goto out;
1572 1555
@@ -1629,12 +1612,31 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1629 new_key.objectid = inode->i_ino; 1612 new_key.objectid = inode->i_ino;
1630 new_key.offset = key.offset + destoff - off; 1613 new_key.offset = key.offset + destoff - off;
1631 1614
1615 trans = btrfs_start_transaction(root, 1);
1616 if (IS_ERR(trans)) {
1617 ret = PTR_ERR(trans);
1618 goto out;
1619 }
1620
1632 if (type == BTRFS_FILE_EXTENT_REG || 1621 if (type == BTRFS_FILE_EXTENT_REG ||
1633 type == BTRFS_FILE_EXTENT_PREALLOC) { 1622 type == BTRFS_FILE_EXTENT_PREALLOC) {
1623 if (off > key.offset) {
1624 datao += off - key.offset;
1625 datal -= off - key.offset;
1626 }
1627
1628 if (key.offset + datal > off + len)
1629 datal = off + len - key.offset;
1630
1631 ret = btrfs_drop_extents(trans, inode,
1632 new_key.offset,
1633 new_key.offset + datal,
1634 &hint_byte, 1);
1635 BUG_ON(ret);
1636
1634 ret = btrfs_insert_empty_item(trans, root, path, 1637 ret = btrfs_insert_empty_item(trans, root, path,
1635 &new_key, size); 1638 &new_key, size);
1636 if (ret) 1639 BUG_ON(ret);
1637 goto out;
1638 1640
1639 leaf = path->nodes[0]; 1641 leaf = path->nodes[0];
1640 slot = path->slots[0]; 1642 slot = path->slots[0];
@@ -1645,14 +1647,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1645 extent = btrfs_item_ptr(leaf, slot, 1647 extent = btrfs_item_ptr(leaf, slot,
1646 struct btrfs_file_extent_item); 1648 struct btrfs_file_extent_item);
1647 1649
1648 if (off > key.offset) {
1649 datao += off - key.offset;
1650 datal -= off - key.offset;
1651 }
1652
1653 if (key.offset + datal > off + len)
1654 datal = off + len - key.offset;
1655
1656 /* disko == 0 means it's a hole */ 1650 /* disko == 0 means it's a hole */
1657 if (!disko) 1651 if (!disko)
1658 datao = 0; 1652 datao = 0;
@@ -1683,14 +1677,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1683 1677
1684 if (comp && (skip || trim)) { 1678 if (comp && (skip || trim)) {
1685 ret = -EINVAL; 1679 ret = -EINVAL;
1680 btrfs_end_transaction(trans, root);
1686 goto out; 1681 goto out;
1687 } 1682 }
1688 size -= skip + trim; 1683 size -= skip + trim;
1689 datal -= skip + trim; 1684 datal -= skip + trim;
1685
1686 ret = btrfs_drop_extents(trans, inode,
1687 new_key.offset,
1688 new_key.offset + datal,
1689 &hint_byte, 1);
1690 BUG_ON(ret);
1691
1690 ret = btrfs_insert_empty_item(trans, root, path, 1692 ret = btrfs_insert_empty_item(trans, root, path,
1691 &new_key, size); 1693 &new_key, size);
1692 if (ret) 1694 BUG_ON(ret);
1693 goto out;
1694 1695
1695 if (skip) { 1696 if (skip) {
1696 u32 start = 1697 u32 start =
@@ -1708,8 +1709,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1708 } 1709 }
1709 1710
1710 btrfs_mark_buffer_dirty(leaf); 1711 btrfs_mark_buffer_dirty(leaf);
1711 } 1712 btrfs_release_path(root, path);
1712 1713
1714 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1715 if (new_key.offset + datal > inode->i_size)
1716 btrfs_i_size_write(inode,
1717 new_key.offset + datal);
1718 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1719 ret = btrfs_update_inode(trans, root, inode);
1720 BUG_ON(ret);
1721 btrfs_end_transaction(trans, root);
1722 }
1713next: 1723next:
1714 btrfs_release_path(root, path); 1724 btrfs_release_path(root, path);
1715 key.offset++; 1725 key.offset++;
@@ -1717,17 +1727,7 @@ next:
1717 ret = 0; 1727 ret = 0;
1718out: 1728out:
1719 btrfs_release_path(root, path); 1729 btrfs_release_path(root, path);
1720 if (ret == 0) {
1721 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1722 if (destoff + olen > inode->i_size)
1723 btrfs_i_size_write(inode, destoff + olen);
1724 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1725 ret = btrfs_update_inode(trans, root, inode);
1726 }
1727 btrfs_end_transaction(trans, root);
1728 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 1730 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1729 if (ret)
1730 vmtruncate(inode, 0);
1731out_unlock: 1731out_unlock:
1732 mutex_unlock(&src->i_mutex); 1732 mutex_unlock(&src->i_mutex);
1733 mutex_unlock(&inode->i_mutex); 1733 mutex_unlock(&inode->i_mutex);
@@ -1845,7 +1845,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
1845 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 1845 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
1846 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, 1846 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
1847 dir_id, "default", 7, 1); 1847 dir_id, "default", 7, 1);
1848 if (!di) { 1848 if (IS_ERR_OR_NULL(di)) {
1849 btrfs_free_path(path); 1849 btrfs_free_path(path);
1850 btrfs_end_transaction(trans, root); 1850 btrfs_end_transaction(trans, root);
1851 printk(KERN_ERR "Umm, you don't have the default dir item, " 1851 printk(KERN_ERR "Umm, you don't have the default dir item, "
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a127c0ebb2dc..e56c72bc5add 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -124,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
124 return 1; 124 return 1;
125} 125}
126 126
127static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
128 u64 len)
129{
130 if (file_offset + len <= entry->file_offset ||
131 entry->file_offset + entry->len <= file_offset)
132 return 0;
133 return 1;
134}
135
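range_overlaps() treats an ordered extent as the half-open byte range [file_offset, file_offset + len): two ranges overlap exactly when neither ends at or before the other begins. A standalone userspace illustration of the same predicate (not from the patch):

        #include <assert.h>

        static int ranges_overlap(unsigned long long a, unsigned long long alen,
                                  unsigned long long b, unsigned long long blen)
        {
                /* same test as range_overlaps() above, on plain integers */
                return !(a + alen <= b || b + blen <= a);
        }

        int main(void)
        {
                assert(ranges_overlap(0, 10, 5, 10));   /* [0,10) vs [5,15) */
                assert(!ranges_overlap(0, 10, 10, 5));  /* touching ends do not */
                return 0;
        }
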
127/* 136/*
128 * look for the first ordered struct that has this offset, otherwise 137
129 * the first one less than this offset 138 * the first one less than this offset
@@ -161,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
161 * The tree is given a single reference on the ordered extent that was 170 * The tree is given a single reference on the ordered extent that was
162 * inserted. 171 * inserted.
163 */ 172 */
164int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
165 u64 start, u64 len, u64 disk_len, int type) 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio)
166{ 176{
167 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
168 struct rb_node *node; 178 struct rb_node *node;
@@ -182,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
182 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
183 set_bit(type, &entry->flags); 193 set_bit(type, &entry->flags);
184 194
195 if (dio)
196 set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
197
185 /* one ref for the tree */ 198 /* one ref for the tree */
186 atomic_set(&entry->refs, 1); 199 atomic_set(&entry->refs, 1);
187 init_waitqueue_head(&entry->wait); 200 init_waitqueue_head(&entry->wait);
@@ -203,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
203 return 0; 216 return 0;
204} 217}
205 218
219int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type)
221{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0);
224}
225
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type)
228{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1);
231}
232
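The two exported wrappers differ only in the trailing dio flag, which __btrfs_add_ordered_extent() turns into the new BTRFS_ORDERED_DIRECT bit; a later hunk in this file then uses that bit in btrfs_start_ordered_extent() to skip kicking writeback, since direct I/O bypasses the page cache:

        if (dio)
                set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
        /* ... in btrfs_start_ordered_extent(): */
        if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
                filemap_fdatawrite_range(inode->i_mapping, start, end);
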
206/* 233/*
207 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted 234 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
208 * when an ordered extent is finished. If the list covers more than one 235 * when an ordered extent is finished. If the list covers more than one
@@ -311,13 +338,6 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
311 tree->last = NULL; 338 tree->last = NULL;
312 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 339 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
313 340
314 spin_lock(&BTRFS_I(inode)->accounting_lock);
315 WARN_ON(!BTRFS_I(inode)->outstanding_extents);
316 BTRFS_I(inode)->outstanding_extents--;
317 spin_unlock(&BTRFS_I(inode)->accounting_lock);
318 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
319 inode, 1);
320
321 spin_lock(&root->fs_info->ordered_extent_lock); 341 spin_lock(&root->fs_info->ordered_extent_lock);
322 list_del_init(&entry->root_extent_list); 342 list_del_init(&entry->root_extent_list);
323 343
@@ -491,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
491 * start IO on any dirty ones so the wait doesn't stall waiting 511 * start IO on any dirty ones so the wait doesn't stall waiting
492 * for pdflush to find them 512 * for pdflush to find them
493 */ 513 */
494 filemap_fdatawrite_range(inode->i_mapping, start, end); 514 if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
515 filemap_fdatawrite_range(inode->i_mapping, start, end);
495 if (wait) { 516 if (wait) {
496 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 517 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
497 &entry->flags)); 518 &entry->flags));
@@ -588,6 +609,47 @@ out:
588 return entry; 609 return entry;
589} 610}
590 611
612/* Since the DIO code tries to lock a wide area, we need to look for any ordered
613 * extents that exist in the range, rather than just the start of the range.
614 */
615struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
616 u64 file_offset,
617 u64 len)
618{
619 struct btrfs_ordered_inode_tree *tree;
620 struct rb_node *node;
621 struct btrfs_ordered_extent *entry = NULL;
622
623 tree = &BTRFS_I(inode)->ordered_tree;
624 spin_lock(&tree->lock);
625 node = tree_search(tree, file_offset);
626 if (!node) {
627 node = tree_search(tree, file_offset + len);
628 if (!node)
629 goto out;
630 }
631
632 while (1) {
633 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
634 if (range_overlaps(entry, file_offset, len))
635 break;
636
637 if (entry->file_offset >= file_offset + len) {
638 entry = NULL;
639 break;
640 }
641 entry = NULL;
642 node = rb_next(node);
643 if (!node)
644 break;
645 }
646out:
647 if (entry)
648 atomic_inc(&entry->refs);
649 spin_unlock(&tree->lock);
650 return entry;
651}
652
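A hedged sketch of how a direct-I/O caller might use the new range lookup; lockstart and locklen are placeholders, and the wait and put calls are the existing ordered-data API, not new code. The reference the lookup returns must be dropped with btrfs_put_ordered_extent():

        struct btrfs_ordered_extent *ordered;

        ordered = btrfs_lookup_ordered_range(inode, lockstart, locklen);
        if (ordered) {
                btrfs_start_ordered_extent(inode, ordered, 1 /* wait */);
                btrfs_put_ordered_extent(ordered);
        }
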
591/* 653/*
592 * lookup and return any extent before 'file_offset'. NULL is returned 654 * lookup and return any extent before 'file_offset'. NULL is returned
593 * if none is found 655 * if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c82f76a9f040..8ac365492a3f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -72,6 +72,8 @@ struct btrfs_ordered_sum {
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
75#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
76
75struct btrfs_ordered_extent { 77struct btrfs_ordered_extent {
76 /* logical offset in the file */ 78 /* logical offset in the file */
77 u64 file_offset; 79 u64 file_offset;
@@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
140 struct btrfs_ordered_extent **cached, 142 struct btrfs_ordered_extent **cached,
141 u64 file_offset, u64 io_size); 143 u64 file_offset, u64 io_size);
142int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 144int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
143 u64 start, u64 len, u64 disk_len, int tyep); 145 u64 start, u64 len, u64 disk_len, int type);
146int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
147 u64 start, u64 len, u64 disk_len, int type);
144int btrfs_add_ordered_sum(struct inode *inode, 148int btrfs_add_ordered_sum(struct inode *inode,
145 struct btrfs_ordered_extent *entry, 149 struct btrfs_ordered_extent *entry,
146 struct btrfs_ordered_sum *sum); 150 struct btrfs_ordered_sum *sum);
@@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
151int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); 155int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
152struct btrfs_ordered_extent * 156struct btrfs_ordered_extent *
153btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); 157btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
158struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
159 u64 file_offset,
160 u64 len);
154int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 161int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
155 struct btrfs_ordered_extent *ordered); 162 struct btrfs_ordered_extent *ordered);
156int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 163int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e558dd941ded..b37d723b9d4a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -44,8 +44,12 @@ struct tree_entry {
44struct backref_node { 44struct backref_node {
45 struct rb_node rb_node; 45 struct rb_node rb_node;
46 u64 bytenr; 46 u64 bytenr;
47 /* objectid tree block owner */ 47
48 u64 new_bytenr;
49 /* objectid of tree block owner; may not be up to date */
48 u64 owner; 50 u64 owner;
51 /* link to pending, changed or detached list */
52 struct list_head list;
49 /* list of upper level blocks reference this block */ 53 /* list of upper level blocks reference this block */
50 struct list_head upper; 54 struct list_head upper;
51 /* list of child blocks in the cache */ 55 /* list of child blocks in the cache */
@@ -56,9 +60,9 @@ struct backref_node {
56 struct extent_buffer *eb; 60 struct extent_buffer *eb;
57 /* level of tree block */ 61 /* level of tree block */
58 unsigned int level:8; 62 unsigned int level:8;
59 /* 1 if the block is root of old snapshot */ 63 /* 1 if the block is in a non-reference-counted tree */
60 unsigned int old_root:1; 64 unsigned int cowonly:1;
61 /* 1 if no child blocks in the cache */ 65 /* 1 if no child node in the cache */
62 unsigned int lowest:1; 66 unsigned int lowest:1;
63 /* is the extent buffer locked */ 67 /* is the extent buffer locked */
64 unsigned int locked:1; 68 unsigned int locked:1;
@@ -66,6 +70,16 @@ struct backref_node {
66 unsigned int processed:1; 70 unsigned int processed:1;
67 /* have backrefs of this block been checked */ 71 /* have backrefs of this block been checked */
68 unsigned int checked:1; 72 unsigned int checked:1;
73 /*
74 * 1 if corresponding block has been cowed but some upper
75 * level block pointers may not point to the new location
76 */
77 unsigned int pending:1;
78 /*
79 * 1 if the backref node isn't connected to any other
80 * backref node.
81 */
82 unsigned int detached:1;
69}; 83};
70 84
71/* 85/*
@@ -74,7 +88,6 @@ struct backref_node {
74struct backref_edge { 88struct backref_edge {
75 struct list_head list[2]; 89 struct list_head list[2];
76 struct backref_node *node[2]; 90 struct backref_node *node[2];
77 u64 blockptr;
78}; 91};
79 92
80#define LOWER 0 93#define LOWER 0
@@ -83,9 +96,25 @@ struct backref_edge {
83struct backref_cache { 96struct backref_cache {
84 /* red black tree of all backref nodes in the cache */ 97 /* red black tree of all backref nodes in the cache */
85 struct rb_root rb_root; 98 struct rb_root rb_root;
86 /* list of backref nodes with no child block in the cache */ 99 /* for passing backref nodes to btrfs_reloc_cow_block */
100 struct backref_node *path[BTRFS_MAX_LEVEL];
101 /*
102 * list of blocks that have been cowed but some block
103 * pointers in upper level blocks may not reflect the
104 * new location
105 */
87 struct list_head pending[BTRFS_MAX_LEVEL]; 106 struct list_head pending[BTRFS_MAX_LEVEL];
88 spinlock_t lock; 107 /* list of backref nodes with no child node */
108 struct list_head leaves;
109 /* list of blocks that have been cowed in current transaction */
110 struct list_head changed;
111 /* list of detached backref nodes */
112 struct list_head detached;
113
114 u64 last_trans;
115
116 int nr_nodes;
117 int nr_edges;
89}; 118};
90 119
91/* 120/*
@@ -113,15 +142,6 @@ struct tree_block {
113 unsigned int key_ready:1; 142 unsigned int key_ready:1;
114}; 143};
115 144
116/* inode vector */
117#define INODEVEC_SIZE 16
118
119struct inodevec {
120 struct list_head list;
121 struct inode *inode[INODEVEC_SIZE];
122 int nr;
123};
124
125#define MAX_EXTENTS 128 145#define MAX_EXTENTS 128
126 146
127struct file_extent_cluster { 147struct file_extent_cluster {
@@ -138,36 +158,43 @@ struct reloc_control {
138 struct btrfs_root *extent_root; 158 struct btrfs_root *extent_root;
139 /* inode for moving data */ 159 /* inode for moving data */
140 struct inode *data_inode; 160 struct inode *data_inode;
141 struct btrfs_workers workers; 161
162 struct btrfs_block_rsv *block_rsv;
163
164 struct backref_cache backref_cache;
165
166 struct file_extent_cluster cluster;
142 /* tree blocks have been processed */ 167 /* tree blocks have been processed */
143 struct extent_io_tree processed_blocks; 168 struct extent_io_tree processed_blocks;
144 /* map start of tree root to corresponding reloc tree */ 169 /* map start of tree root to corresponding reloc tree */
145 struct mapping_tree reloc_root_tree; 170 struct mapping_tree reloc_root_tree;
146 /* list of reloc trees */ 171 /* list of reloc trees */
147 struct list_head reloc_roots; 172 struct list_head reloc_roots;
173 /* size of metadata reservation for merging reloc trees */
174 u64 merging_rsv_size;
175 /* size of relocated tree nodes */
176 u64 nodes_relocated;
177
148 u64 search_start; 178 u64 search_start;
149 u64 extents_found; 179 u64 extents_found;
150 u64 extents_skipped; 180
151 int stage; 181 int block_rsv_retries;
152 int create_reloc_root; 182
183 unsigned int stage:8;
184 unsigned int create_reloc_tree:1;
185 unsigned int merge_reloc_tree:1;
153 unsigned int found_file_extent:1; 186 unsigned int found_file_extent:1;
154 unsigned int found_old_snapshot:1; 187 unsigned int commit_transaction:1;
155}; 188};
156 189
157/* stages of data relocation */ 190/* stages of data relocation */
158#define MOVE_DATA_EXTENTS 0 191#define MOVE_DATA_EXTENTS 0
159#define UPDATE_DATA_PTRS 1 192#define UPDATE_DATA_PTRS 1
160 193
161/* 194static void remove_backref_node(struct backref_cache *cache,
162 * merge reloc tree to corresponding fs tree in worker threads 195 struct backref_node *node);
163 */ 196static void __mark_block_processed(struct reloc_control *rc,
164struct async_merge { 197 struct backref_node *node);
165 struct btrfs_work work;
166 struct reloc_control *rc;
167 struct btrfs_root *root;
168 struct completion *done;
169 atomic_t *num_pending;
170};
171 198
172static void mapping_tree_init(struct mapping_tree *tree) 199static void mapping_tree_init(struct mapping_tree *tree)
173{ 200{
@@ -181,15 +208,80 @@ static void backref_cache_init(struct backref_cache *cache)
181 cache->rb_root = RB_ROOT; 208 cache->rb_root = RB_ROOT;
182 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 209 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
183 INIT_LIST_HEAD(&cache->pending[i]); 210 INIT_LIST_HEAD(&cache->pending[i]);
184 spin_lock_init(&cache->lock); 211 INIT_LIST_HEAD(&cache->changed);
212 INIT_LIST_HEAD(&cache->detached);
213 INIT_LIST_HEAD(&cache->leaves);
214}
215
216static void backref_cache_cleanup(struct backref_cache *cache)
217{
218 struct backref_node *node;
219 int i;
220
221 while (!list_empty(&cache->detached)) {
222 node = list_entry(cache->detached.next,
223 struct backref_node, list);
224 remove_backref_node(cache, node);
225 }
226
227 while (!list_empty(&cache->leaves)) {
228 node = list_entry(cache->leaves.next,
229 struct backref_node, lower);
230 remove_backref_node(cache, node);
231 }
232
233 cache->last_trans = 0;
234
235 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
236 BUG_ON(!list_empty(&cache->pending[i]));
237 BUG_ON(!list_empty(&cache->changed));
238 BUG_ON(!list_empty(&cache->detached));
239 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
240 BUG_ON(cache->nr_nodes);
241 BUG_ON(cache->nr_edges);
242}
243
244static struct backref_node *alloc_backref_node(struct backref_cache *cache)
245{
246 struct backref_node *node;
247
248 node = kzalloc(sizeof(*node), GFP_NOFS);
249 if (node) {
250 INIT_LIST_HEAD(&node->list);
251 INIT_LIST_HEAD(&node->upper);
252 INIT_LIST_HEAD(&node->lower);
253 RB_CLEAR_NODE(&node->rb_node);
254 cache->nr_nodes++;
255 }
256 return node;
257}
258
259static void free_backref_node(struct backref_cache *cache,
260 struct backref_node *node)
261{
262 if (node) {
263 cache->nr_nodes--;
264 kfree(node);
265 }
266}
267
268static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
269{
270 struct backref_edge *edge;
271
272 edge = kzalloc(sizeof(*edge), GFP_NOFS);
273 if (edge)
274 cache->nr_edges++;
275 return edge;
185} 276}
186 277
187static void backref_node_init(struct backref_node *node) 278static void free_backref_edge(struct backref_cache *cache,
279 struct backref_edge *edge)
188{ 280{
189 memset(node, 0, sizeof(*node)); 281 if (edge) {
190 INIT_LIST_HEAD(&node->upper); 282 cache->nr_edges--;
191 INIT_LIST_HEAD(&node->lower); 283 kfree(edge);
192 RB_CLEAR_NODE(&node->rb_node); 284 }
193} 285}
194 286
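All backref node and edge memory now flows through these four helpers so the cache can count live objects; backref_cache_cleanup() above BUG_ON()s if either counter is nonzero, which turns a leak into a loud failure. Both free helpers accept NULL, so error unwinding stays simple:

        node = alloc_backref_node(cache);
        edge = alloc_backref_edge(cache);
        if (!node || !edge) {
                free_backref_edge(cache, edge); /* NULL-safe */
                free_backref_node(cache, node); /* NULL-safe */
                return -ENOMEM;
        }
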
195static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, 287static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
@@ -250,6 +342,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node,
250 edges[idx++] = edge; 342 edges[idx++] = edge;
251 node = edge->node[UPPER]; 343 node = edge->node[UPPER];
252 } 344 }
345 BUG_ON(node->detached);
253 *index = idx; 346 *index = idx;
254 return node; 347 return node;
255} 348}
@@ -281,13 +374,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[],
281 return NULL; 374 return NULL;
282} 375}
283 376
377static void unlock_node_buffer(struct backref_node *node)
378{
379 if (node->locked) {
380 btrfs_tree_unlock(node->eb);
381 node->locked = 0;
382 }
383}
384
284static void drop_node_buffer(struct backref_node *node) 385static void drop_node_buffer(struct backref_node *node)
285{ 386{
286 if (node->eb) { 387 if (node->eb) {
287 if (node->locked) { 388 unlock_node_buffer(node);
288 btrfs_tree_unlock(node->eb);
289 node->locked = 0;
290 }
291 free_extent_buffer(node->eb); 389 free_extent_buffer(node->eb);
292 node->eb = NULL; 390 node->eb = NULL;
293 } 391 }
@@ -296,14 +394,14 @@ static void drop_node_buffer(struct backref_node *node)
296static void drop_backref_node(struct backref_cache *tree, 394static void drop_backref_node(struct backref_cache *tree,
297 struct backref_node *node) 395 struct backref_node *node)
298{ 396{
299 BUG_ON(!node->lowest);
300 BUG_ON(!list_empty(&node->upper)); 397 BUG_ON(!list_empty(&node->upper));
301 398
302 drop_node_buffer(node); 399 drop_node_buffer(node);
400 list_del(&node->list);
303 list_del(&node->lower); 401 list_del(&node->lower);
304 402 if (!RB_EMPTY_NODE(&node->rb_node))
305 rb_erase(&node->rb_node, &tree->rb_root); 403 rb_erase(&node->rb_node, &tree->rb_root);
306 kfree(node); 404 free_backref_node(tree, node);
307} 405}
308 406
309/* 407/*
@@ -318,27 +416,121 @@ static void remove_backref_node(struct backref_cache *cache,
318 if (!node) 416 if (!node)
319 return; 417 return;
320 418
321 BUG_ON(!node->lowest); 419 BUG_ON(!node->lowest && !node->detached);
322 while (!list_empty(&node->upper)) { 420 while (!list_empty(&node->upper)) {
323 edge = list_entry(node->upper.next, struct backref_edge, 421 edge = list_entry(node->upper.next, struct backref_edge,
324 list[LOWER]); 422 list[LOWER]);
325 upper = edge->node[UPPER]; 423 upper = edge->node[UPPER];
326 list_del(&edge->list[LOWER]); 424 list_del(&edge->list[LOWER]);
327 list_del(&edge->list[UPPER]); 425 list_del(&edge->list[UPPER]);
328 kfree(edge); 426 free_backref_edge(cache, edge);
427
428 if (RB_EMPTY_NODE(&upper->rb_node)) {
429 BUG_ON(!list_empty(&node->upper));
430 drop_backref_node(cache, node);
431 node = upper;
432 node->lowest = 1;
433 continue;
434 }
329 /* 435 /*
330 * add the node to pending list if no other 436 * add the node to leaf node list if no other
331 * child block cached. 437 * child block cached.
332 */ 438 */
333 if (list_empty(&upper->lower)) { 439 if (list_empty(&upper->lower)) {
334 list_add_tail(&upper->lower, 440 list_add_tail(&upper->lower, &cache->leaves);
335 &cache->pending[upper->level]);
336 upper->lowest = 1; 441 upper->lowest = 1;
337 } 442 }
338 } 443 }
444
339 drop_backref_node(cache, node); 445 drop_backref_node(cache, node);
340} 446}
341 447
448static void update_backref_node(struct backref_cache *cache,
449 struct backref_node *node, u64 bytenr)
450{
451 struct rb_node *rb_node;
452 rb_erase(&node->rb_node, &cache->rb_root);
453 node->bytenr = bytenr;
454 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
455 BUG_ON(rb_node);
456}
457
458/*
459 * update backref cache after a transaction commit
460 */
461static int update_backref_cache(struct btrfs_trans_handle *trans,
462 struct backref_cache *cache)
463{
464 struct backref_node *node;
465 int level = 0;
466
467 if (cache->last_trans == 0) {
468 cache->last_trans = trans->transid;
469 return 0;
470 }
471
472 if (cache->last_trans == trans->transid)
473 return 0;
474
475 /*
476 * detached nodes are used to avoid unnecessary backref
477 * lookup. a transaction commit changes the extent tree,
478 * so the detached nodes are no longer useful.
479 */
480 while (!list_empty(&cache->detached)) {
481 node = list_entry(cache->detached.next,
482 struct backref_node, list);
483 remove_backref_node(cache, node);
484 }
485
486 while (!list_empty(&cache->changed)) {
487 node = list_entry(cache->changed.next,
488 struct backref_node, list);
489 list_del_init(&node->list);
490 BUG_ON(node->pending);
491 update_backref_node(cache, node, node->new_bytenr);
492 }
493
494 /*
495 * some nodes can be left in the pending list if there were
496 * errors during processing the pending nodes.
497 */
498 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
499 list_for_each_entry(node, &cache->pending[level], list) {
500 BUG_ON(!node->pending);
501 if (node->bytenr == node->new_bytenr)
502 continue;
503 update_backref_node(cache, node, node->new_bytenr);
504 }
505 }
506
507 cache->last_trans = 0;
508 return 1;
509}
510
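update_backref_cache() returns 1 when a transaction commit since last_trans has invalidated cached state: detached nodes are dropped and changed or pending nodes are re-keyed to new_bytenr. A hedged, hypothetical caller pattern would restart its search when that happens:

        if (update_backref_cache(trans, &rc->backref_cache)) {
                /* cached block addresses were remapped; restart the walk */
                btrfs_release_path(rc->extent_root, path);
                goto again;     /* hypothetical restart label */
        }
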
511static int should_ignore_root(struct btrfs_root *root)
512{
513 struct btrfs_root *reloc_root;
514
515 if (!root->ref_cows)
516 return 0;
517
518 reloc_root = root->reloc_root;
519 if (!reloc_root)
520 return 0;
521
522 if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
523 root->fs_info->running_transaction->transid - 1)
524 return 0;
525 /*
526 * if there is a reloc tree and it was created in a previous
527 * transaction, backref lookup can find the reloc tree,
528 * so the backref node for the fs tree root is useless for
529 * relocation.
530 */
531 return 1;
532}
533
342/* 534/*
343 * find reloc tree by address of tree root 535 * find reloc tree by address of tree root
344 */ 536 */
@@ -453,11 +645,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
453 * for all upper level blocks that directly/indirectly reference the 645 * for all upper level blocks that directly/indirectly reference the
454 * block are also cached. 646 * block are also cached.
455 */ 647 */
456static struct backref_node *build_backref_tree(struct reloc_control *rc, 648static noinline_for_stack
457 struct backref_cache *cache, 649struct backref_node *build_backref_tree(struct reloc_control *rc,
458 struct btrfs_key *node_key, 650 struct btrfs_key *node_key,
459 int level, u64 bytenr) 651 int level, u64 bytenr)
460{ 652{
653 struct backref_cache *cache = &rc->backref_cache;
461 struct btrfs_path *path1; 654 struct btrfs_path *path1;
462 struct btrfs_path *path2; 655 struct btrfs_path *path2;
463 struct extent_buffer *eb; 656 struct extent_buffer *eb;
@@ -473,6 +666,8 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
473 unsigned long end; 666 unsigned long end;
474 unsigned long ptr; 667 unsigned long ptr;
475 LIST_HEAD(list); 668 LIST_HEAD(list);
669 LIST_HEAD(useless);
670 int cowonly;
476 int ret; 671 int ret;
477 int err = 0; 672 int err = 0;
478 673
@@ -483,15 +678,13 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
483 goto out; 678 goto out;
484 } 679 }
485 680
486 node = kmalloc(sizeof(*node), GFP_NOFS); 681 node = alloc_backref_node(cache);
487 if (!node) { 682 if (!node) {
488 err = -ENOMEM; 683 err = -ENOMEM;
489 goto out; 684 goto out;
490 } 685 }
491 686
492 backref_node_init(node);
493 node->bytenr = bytenr; 687 node->bytenr = bytenr;
494 node->owner = 0;
495 node->level = level; 688 node->level = level;
496 node->lowest = 1; 689 node->lowest = 1;
497 cur = node; 690 cur = node;
@@ -587,17 +780,21 @@ again:
587#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 780#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
588 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || 781 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
589 key.type == BTRFS_EXTENT_REF_V0_KEY) { 782 key.type == BTRFS_EXTENT_REF_V0_KEY) {
590 if (key.objectid == key.offset && 783 if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
591 key.type == BTRFS_EXTENT_REF_V0_KEY) {
592 struct btrfs_extent_ref_v0 *ref0; 784 struct btrfs_extent_ref_v0 *ref0;
593 ref0 = btrfs_item_ptr(eb, path1->slots[0], 785 ref0 = btrfs_item_ptr(eb, path1->slots[0],
594 struct btrfs_extent_ref_v0); 786 struct btrfs_extent_ref_v0);
595 root = find_tree_root(rc, eb, ref0); 787 if (key.objectid == key.offset) {
596 if (root) 788 root = find_tree_root(rc, eb, ref0);
597 cur->root = root; 789 if (root && !should_ignore_root(root))
598 else 790 cur->root = root;
599 cur->old_root = 1; 791 else
600 break; 792 list_add(&cur->list, &useless);
793 break;
794 }
795 if (is_cowonly_root(btrfs_ref_root_v0(eb,
796 ref0)))
797 cur->cowonly = 1;
601 } 798 }
602#else 799#else
603 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 800 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
@@ -614,22 +811,20 @@ again:
614 break; 811 break;
615 } 812 }
616 813
617 edge = kzalloc(sizeof(*edge), GFP_NOFS); 814 edge = alloc_backref_edge(cache);
618 if (!edge) { 815 if (!edge) {
619 err = -ENOMEM; 816 err = -ENOMEM;
620 goto out; 817 goto out;
621 } 818 }
622 rb_node = tree_search(&cache->rb_root, key.offset); 819 rb_node = tree_search(&cache->rb_root, key.offset);
623 if (!rb_node) { 820 if (!rb_node) {
624 upper = kmalloc(sizeof(*upper), GFP_NOFS); 821 upper = alloc_backref_node(cache);
625 if (!upper) { 822 if (!upper) {
626 kfree(edge); 823 free_backref_edge(cache, edge);
627 err = -ENOMEM; 824 err = -ENOMEM;
628 goto out; 825 goto out;
629 } 826 }
630 backref_node_init(upper);
631 upper->bytenr = key.offset; 827 upper->bytenr = key.offset;
632 upper->owner = 0;
633 upper->level = cur->level + 1; 828 upper->level = cur->level + 1;
634 /* 829 /*
635 * backrefs for the upper level block isn't 830 * backrefs for the upper level block isn't
@@ -639,11 +834,12 @@ again:
639 } else { 834 } else {
640 upper = rb_entry(rb_node, struct backref_node, 835 upper = rb_entry(rb_node, struct backref_node,
641 rb_node); 836 rb_node);
837 BUG_ON(!upper->checked);
642 INIT_LIST_HEAD(&edge->list[UPPER]); 838 INIT_LIST_HEAD(&edge->list[UPPER]);
643 } 839 }
644 list_add(&edge->list[LOWER], &cur->upper); 840 list_add_tail(&edge->list[LOWER], &cur->upper);
645 edge->node[UPPER] = upper;
646 edge->node[LOWER] = cur; 841 edge->node[LOWER] = cur;
842 edge->node[UPPER] = upper;
647 843
648 goto next; 844 goto next;
649 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { 845 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
@@ -657,11 +853,17 @@ again:
657 goto out; 853 goto out;
658 } 854 }
659 855
856 if (!root->ref_cows)
857 cur->cowonly = 1;
858
660 if (btrfs_root_level(&root->root_item) == cur->level) { 859 if (btrfs_root_level(&root->root_item) == cur->level) {
661 /* tree root */ 860 /* tree root */
662 BUG_ON(btrfs_root_bytenr(&root->root_item) != 861 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
663 cur->bytenr); 862 cur->bytenr);
664 cur->root = root; 863 if (should_ignore_root(root))
864 list_add(&cur->list, &useless);
865 else
866 cur->root = root;
665 break; 867 break;
666 } 868 }
667 869
@@ -692,11 +894,14 @@ again:
692 if (!path2->nodes[level]) { 894 if (!path2->nodes[level]) {
693 BUG_ON(btrfs_root_bytenr(&root->root_item) != 895 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
694 lower->bytenr); 896 lower->bytenr);
695 lower->root = root; 897 if (should_ignore_root(root))
898 list_add(&lower->list, &useless);
899 else
900 lower->root = root;
696 break; 901 break;
697 } 902 }
698 903
699 edge = kzalloc(sizeof(*edge), GFP_NOFS); 904 edge = alloc_backref_edge(cache);
700 if (!edge) { 905 if (!edge) {
701 err = -ENOMEM; 906 err = -ENOMEM;
702 goto out; 907 goto out;
@@ -705,16 +910,17 @@ again:
705 eb = path2->nodes[level]; 910 eb = path2->nodes[level];
706 rb_node = tree_search(&cache->rb_root, eb->start); 911 rb_node = tree_search(&cache->rb_root, eb->start);
707 if (!rb_node) { 912 if (!rb_node) {
708 upper = kmalloc(sizeof(*upper), GFP_NOFS); 913 upper = alloc_backref_node(cache);
709 if (!upper) { 914 if (!upper) {
710 kfree(edge); 915 free_backref_edge(cache, edge);
711 err = -ENOMEM; 916 err = -ENOMEM;
712 goto out; 917 goto out;
713 } 918 }
714 backref_node_init(upper);
715 upper->bytenr = eb->start; 919 upper->bytenr = eb->start;
716 upper->owner = btrfs_header_owner(eb); 920 upper->owner = btrfs_header_owner(eb);
717 upper->level = lower->level + 1; 921 upper->level = lower->level + 1;
922 if (!root->ref_cows)
923 upper->cowonly = 1;
718 924
719 /* 925 /*
720 * if we know the block isn't shared 926 * if we know the block isn't shared
@@ -744,10 +950,12 @@ again:
744 rb_node); 950 rb_node);
745 BUG_ON(!upper->checked); 951 BUG_ON(!upper->checked);
746 INIT_LIST_HEAD(&edge->list[UPPER]); 952 INIT_LIST_HEAD(&edge->list[UPPER]);
953 if (!upper->owner)
954 upper->owner = btrfs_header_owner(eb);
747 } 955 }
748 list_add_tail(&edge->list[LOWER], &lower->upper); 956 list_add_tail(&edge->list[LOWER], &lower->upper);
749 edge->node[UPPER] = upper;
750 edge->node[LOWER] = lower; 957 edge->node[LOWER] = lower;
958 edge->node[UPPER] = upper;
751 959
752 if (rb_node) 960 if (rb_node)
753 break; 961 break;
@@ -785,8 +993,13 @@ next:
785 * into the cache. 993 * into the cache.
786 */ 994 */
787 BUG_ON(!node->checked); 995 BUG_ON(!node->checked);
788 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); 996 cowonly = node->cowonly;
789 BUG_ON(rb_node); 997 if (!cowonly) {
998 rb_node = tree_insert(&cache->rb_root, node->bytenr,
999 &node->rb_node);
1000 BUG_ON(rb_node);
1001 list_add_tail(&node->lower, &cache->leaves);
1002 }
790 1003
791 list_for_each_entry(edge, &node->upper, list[LOWER]) 1004 list_for_each_entry(edge, &node->upper, list[LOWER])
792 list_add_tail(&edge->list[UPPER], &list); 1005 list_add_tail(&edge->list[UPPER], &list);
@@ -795,6 +1008,14 @@ next:
795 edge = list_entry(list.next, struct backref_edge, list[UPPER]); 1008 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
796 list_del_init(&edge->list[UPPER]); 1009 list_del_init(&edge->list[UPPER]);
797 upper = edge->node[UPPER]; 1010 upper = edge->node[UPPER];
1011 if (upper->detached) {
1012 list_del(&edge->list[LOWER]);
1013 lower = edge->node[LOWER];
1014 free_backref_edge(cache, edge);
1015 if (list_empty(&lower->upper))
1016 list_add(&lower->list, &useless);
1017 continue;
1018 }
798 1019
799 if (!RB_EMPTY_NODE(&upper->rb_node)) { 1020 if (!RB_EMPTY_NODE(&upper->rb_node)) {
800 if (upper->lowest) { 1021 if (upper->lowest) {
@@ -807,25 +1028,69 @@ next:
807 } 1028 }
808 1029
809 BUG_ON(!upper->checked); 1030 BUG_ON(!upper->checked);
810 rb_node = tree_insert(&cache->rb_root, upper->bytenr, 1031 BUG_ON(cowonly != upper->cowonly);
811 &upper->rb_node); 1032 if (!cowonly) {
812 BUG_ON(rb_node); 1033 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1034 &upper->rb_node);
1035 BUG_ON(rb_node);
1036 }
813 1037
814 list_add_tail(&edge->list[UPPER], &upper->lower); 1038 list_add_tail(&edge->list[UPPER], &upper->lower);
815 1039
816 list_for_each_entry(edge, &upper->upper, list[LOWER]) 1040 list_for_each_entry(edge, &upper->upper, list[LOWER])
817 list_add_tail(&edge->list[UPPER], &list); 1041 list_add_tail(&edge->list[UPPER], &list);
818 } 1042 }
1043 /*
1044 * process useless backref nodes. backref nodes for tree leaves
1045 * are deleted from the cache. backref nodes for upper level
1046 * tree blocks are left in the cache to avoid unnecessary backref
1047 * lookup.
1048 */
1049 while (!list_empty(&useless)) {
1050 upper = list_entry(useless.next, struct backref_node, list);
1051 list_del_init(&upper->list);
1052 BUG_ON(!list_empty(&upper->upper));
1053 if (upper == node)
1054 node = NULL;
1055 if (upper->lowest) {
1056 list_del_init(&upper->lower);
1057 upper->lowest = 0;
1058 }
1059 while (!list_empty(&upper->lower)) {
1060 edge = list_entry(upper->lower.next,
1061 struct backref_edge, list[UPPER]);
1062 list_del(&edge->list[UPPER]);
1063 list_del(&edge->list[LOWER]);
1064 lower = edge->node[LOWER];
1065 free_backref_edge(cache, edge);
1066
1067 if (list_empty(&lower->upper))
1068 list_add(&lower->list, &useless);
1069 }
1070 __mark_block_processed(rc, upper);
1071 if (upper->level > 0) {
1072 list_add(&upper->list, &cache->detached);
1073 upper->detached = 1;
1074 } else {
1075 rb_erase(&upper->rb_node, &cache->rb_root);
1076 free_backref_node(cache, upper);
1077 }
1078 }
819out: 1079out:
820 btrfs_free_path(path1); 1080 btrfs_free_path(path1);
821 btrfs_free_path(path2); 1081 btrfs_free_path(path2);
822 if (err) { 1082 if (err) {
823 INIT_LIST_HEAD(&list); 1083 while (!list_empty(&useless)) {
1084 lower = list_entry(useless.next,
1085 struct backref_node, upper);
1086 list_del_init(&lower->upper);
1087 }
824 upper = node; 1088 upper = node;
1089 INIT_LIST_HEAD(&list);
825 while (upper) { 1090 while (upper) {
826 if (RB_EMPTY_NODE(&upper->rb_node)) { 1091 if (RB_EMPTY_NODE(&upper->rb_node)) {
827 list_splice_tail(&upper->upper, &list); 1092 list_splice_tail(&upper->upper, &list);
828 kfree(upper); 1093 free_backref_node(cache, upper);
829 } 1094 }
830 1095
831 if (list_empty(&list)) 1096 if (list_empty(&list))
@@ -833,15 +1098,104 @@ out:
833 1098
834 edge = list_entry(list.next, struct backref_edge, 1099 edge = list_entry(list.next, struct backref_edge,
835 list[LOWER]); 1100 list[LOWER]);
1101 list_del(&edge->list[LOWER]);
836 upper = edge->node[UPPER]; 1102 upper = edge->node[UPPER];
837 kfree(edge); 1103 free_backref_edge(cache, edge);
838 } 1104 }
839 return ERR_PTR(err); 1105 return ERR_PTR(err);
840 } 1106 }
1107 BUG_ON(node && node->detached);
841 return node; 1108 return node;
842} 1109}
843 1110
844/* 1111/*
1112 * helper to add a backref node for the newly created snapshot.
1113 * the backref node is created by cloning the backref node that
1114 * corresponds to the root of the source tree
1115 */
1116static int clone_backref_node(struct btrfs_trans_handle *trans,
1117 struct reloc_control *rc,
1118 struct btrfs_root *src,
1119 struct btrfs_root *dest)
1120{
1121 struct btrfs_root *reloc_root = src->reloc_root;
1122 struct backref_cache *cache = &rc->backref_cache;
1123 struct backref_node *node = NULL;
1124 struct backref_node *new_node;
1125 struct backref_edge *edge;
1126 struct backref_edge *new_edge;
1127 struct rb_node *rb_node;
1128
1129 if (cache->last_trans > 0)
1130 update_backref_cache(trans, cache);
1131
1132 rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1133 if (rb_node) {
1134 node = rb_entry(rb_node, struct backref_node, rb_node);
1135 if (node->detached)
1136 node = NULL;
1137 else
1138 BUG_ON(node->new_bytenr != reloc_root->node->start);
1139 }
1140
1141 if (!node) {
1142 rb_node = tree_search(&cache->rb_root,
1143 reloc_root->commit_root->start);
1144 if (rb_node) {
1145 node = rb_entry(rb_node, struct backref_node,
1146 rb_node);
1147 BUG_ON(node->detached);
1148 }
1149 }
1150
1151 if (!node)
1152 return 0;
1153
1154 new_node = alloc_backref_node(cache);
1155 if (!new_node)
1156 return -ENOMEM;
1157
1158 new_node->bytenr = dest->node->start;
1159 new_node->level = node->level;
1160 new_node->lowest = node->lowest;
1161 new_node->root = dest;
1162
1163 if (!node->lowest) {
1164 list_for_each_entry(edge, &node->lower, list[UPPER]) {
1165 new_edge = alloc_backref_edge(cache);
1166 if (!new_edge)
1167 goto fail;
1168
1169 new_edge->node[UPPER] = new_node;
1170 new_edge->node[LOWER] = edge->node[LOWER];
1171 list_add_tail(&new_edge->list[UPPER],
1172 &new_node->lower);
1173 }
1174 }
1175
1176 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1177 &new_node->rb_node);
1178 BUG_ON(rb_node);
1179
1180 if (!new_node->lowest) {
1181 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1182 list_add_tail(&new_edge->list[LOWER],
1183 &new_edge->node[LOWER]->upper);
1184 }
1185 }
1186 return 0;
1187fail:
1188 while (!list_empty(&new_node->lower)) {
1189 new_edge = list_entry(new_node->lower.next,
1190 struct backref_edge, list[UPPER]);
1191 list_del(&new_edge->list[UPPER]);
1192 free_backref_edge(cache, new_edge);
1193 }
1194 free_backref_node(cache, new_node);
1195 return -ENOMEM;
1196}
1197
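To see why clone_backref_node() links list[UPPER] while building the new node but defers list[LOWER] until after the rb-tree insert succeeds, it helps to remember that each backref_edge lives on two lists at once. A minimal user-space sketch of that double linkage follows; the list helpers and struct node/struct edge are simplified stand-ins for the kernel's list.h and btrfs types, not the real API.

#include <stdio.h>
#include <stdlib.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }
static void list_add_tail(struct list_head *e, struct list_head *h)
{
	e->prev = h->prev; e->next = h;
	h->prev->next = e; h->prev = e;
}

enum { LOWER = 0, UPPER = 1 };

struct node {
	int level;
	struct list_head upper;		/* edges to parents */
	struct list_head lower;		/* edges to children */
};

struct edge {
	struct node *node[2];
	struct list_head list[2];	/* list[UPPER] sits on upper->lower,
					   list[LOWER] on lower->upper */
};

static void link_edge(struct edge *e, struct node *up, struct node *down)
{
	e->node[UPPER] = up;
	e->node[LOWER] = down;
	list_add_tail(&e->list[UPPER], &up->lower);
	list_add_tail(&e->list[LOWER], &down->upper);
}

int main(void)
{
	struct node root = { .level = 1 }, leaf = { .level = 0 };
	struct edge e;

	list_init(&root.upper); list_init(&root.lower);
	list_init(&leaf.upper); list_init(&leaf.lower);

	link_edge(&e, &root, &leaf);
	printf("leaf has parent edge: %d\n", leaf.upper.next == &e.list[LOWER]);
	return 0;
}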
1198/*
845 * helper to add 'address of tree root -> reloc tree' mapping 1199 * helper to add 'address of tree root -> reloc tree' mapping
846 */ 1200 */
847static int __add_reloc_root(struct btrfs_root *root) 1201static int __add_reloc_root(struct btrfs_root *root)
@@ -901,12 +1255,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
901 return 0; 1255 return 0;
902} 1256}
903 1257
904/* 1258static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
905 * create reloc tree for a given fs tree. reloc tree is just a 1259 struct btrfs_root *root, u64 objectid)
906 * snapshot of the fs tree with special root objectid.
907 */
908int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
909 struct btrfs_root *root)
910{ 1260{
911 struct btrfs_root *reloc_root; 1261 struct btrfs_root *reloc_root;
912 struct extent_buffer *eb; 1262 struct extent_buffer *eb;
@@ -914,36 +1264,45 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
914 struct btrfs_key root_key; 1264 struct btrfs_key root_key;
915 int ret; 1265 int ret;
916 1266
917 if (root->reloc_root) {
918 reloc_root = root->reloc_root;
919 reloc_root->last_trans = trans->transid;
920 return 0;
921 }
922
923 if (!root->fs_info->reloc_ctl ||
924 !root->fs_info->reloc_ctl->create_reloc_root ||
925 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
926 return 0;
927
928 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 1267 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
929 BUG_ON(!root_item); 1268 BUG_ON(!root_item);
930 1269
931 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 1270 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
932 root_key.type = BTRFS_ROOT_ITEM_KEY; 1271 root_key.type = BTRFS_ROOT_ITEM_KEY;
933 root_key.offset = root->root_key.objectid; 1272 root_key.offset = objectid;
934 1273
935 ret = btrfs_copy_root(trans, root, root->commit_root, &eb, 1274 if (root->root_key.objectid == objectid) {
936 BTRFS_TREE_RELOC_OBJECTID); 1275 /* called by btrfs_init_reloc_root */
937 BUG_ON(ret); 1276 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1277 BTRFS_TREE_RELOC_OBJECTID);
1278 BUG_ON(ret);
1279
1280 btrfs_set_root_last_snapshot(&root->root_item,
1281 trans->transid - 1);
1282 } else {
1283 /*
1284 * called by btrfs_reloc_post_snapshot_hook.
1285 * the source tree is a reloc tree; all tree blocks
1286 * modified after it was created have the RELOC flag
1287 * set in their headers, so it's OK not to update
1288 * the 'last_snapshot'.
1289 */
1290 ret = btrfs_copy_root(trans, root, root->node, &eb,
1291 BTRFS_TREE_RELOC_OBJECTID);
1292 BUG_ON(ret);
1293 }
938 1294
939 btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
940 memcpy(root_item, &root->root_item, sizeof(*root_item)); 1295 memcpy(root_item, &root->root_item, sizeof(*root_item));
941 btrfs_set_root_refs(root_item, 1);
942 btrfs_set_root_bytenr(root_item, eb->start); 1296 btrfs_set_root_bytenr(root_item, eb->start);
943 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 1297 btrfs_set_root_level(root_item, btrfs_header_level(eb));
944 btrfs_set_root_generation(root_item, trans->transid); 1298 btrfs_set_root_generation(root_item, trans->transid);
945 memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); 1299
946 root_item->drop_level = 0; 1300 if (root->root_key.objectid == objectid) {
1301 btrfs_set_root_refs(root_item, 0);
1302 memset(&root_item->drop_progress, 0,
1303 sizeof(struct btrfs_disk_key));
1304 root_item->drop_level = 0;
1305 }
947 1306
948 btrfs_tree_unlock(eb); 1307 btrfs_tree_unlock(eb);
949 free_extent_buffer(eb); 1308 free_extent_buffer(eb);
@@ -957,6 +1316,37 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
957 &root_key); 1316 &root_key);
958 BUG_ON(IS_ERR(reloc_root)); 1317 BUG_ON(IS_ERR(reloc_root));
959 reloc_root->last_trans = trans->transid; 1318 reloc_root->last_trans = trans->transid;
1319 return reloc_root;
1320}
1321
1322/*
1323 * create reloc tree for a given fs tree. a reloc tree is just a
1324 * snapshot of the fs tree with a special root objectid.
1325 */
1326int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1327 struct btrfs_root *root)
1328{
1329 struct btrfs_root *reloc_root;
1330 struct reloc_control *rc = root->fs_info->reloc_ctl;
1331 int clear_rsv = 0;
1332
1333 if (root->reloc_root) {
1334 reloc_root = root->reloc_root;
1335 reloc_root->last_trans = trans->transid;
1336 return 0;
1337 }
1338
1339 if (!rc || !rc->create_reloc_tree ||
1340 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1341 return 0;
1342
1343 if (!trans->block_rsv) {
1344 trans->block_rsv = rc->block_rsv;
1345 clear_rsv = 1;
1346 }
1347 reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1348 if (clear_rsv)
1349 trans->block_rsv = NULL;
960 1350
961 __add_reloc_root(reloc_root); 1351 __add_reloc_root(reloc_root);
962 root->reloc_root = reloc_root; 1352 root->reloc_root = reloc_root;
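The clear_rsv flag above lets btrfs_init_reloc_root() borrow rc->block_rsv only when the caller attached no reservation to the handle, and restores the handle before returning. A user-space sketch of that borrow-and-restore shape; the struct names are hypothetical stand-ins, not the btrfs API.

#include <stdio.h>

struct rsv { const char *name; };
struct handle { struct rsv *block_rsv; };

static void create_snapshot(struct handle *trans, struct rsv *rc_rsv)
{
	int clear_rsv = 0;

	if (!trans->block_rsv) {		/* borrow only if unset */
		trans->block_rsv = rc_rsv;
		clear_rsv = 1;
	}
	printf("allocating from %s\n", trans->block_rsv->name);
	if (clear_rsv)				/* restore caller's state */
		trans->block_rsv = NULL;
}

int main(void)
{
	struct rsv rc_rsv = { "rc->block_rsv" };
	struct handle trans = { 0 };

	create_snapshot(&trans, &rc_rsv);
	return 0;
}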
@@ -980,7 +1370,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
980 reloc_root = root->reloc_root; 1370 reloc_root = root->reloc_root;
981 root_item = &reloc_root->root_item; 1371 root_item = &reloc_root->root_item;
982 1372
983 if (btrfs_root_refs(root_item) == 0) { 1373 if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1374 btrfs_root_refs(root_item) == 0) {
984 root->reloc_root = NULL; 1375 root->reloc_root = NULL;
985 del = 1; 1376 del = 1;
986 } 1377 }
@@ -1102,8 +1493,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1102 goto out; 1493 goto out;
1103 } 1494 }
1104 1495
1105 if (new_bytenr) 1496 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1106 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1107 ret = 0; 1497 ret = 0;
1108out: 1498out:
1109 btrfs_free_path(path); 1499 btrfs_free_path(path);
@@ -1114,19 +1504,18 @@ out:
1114 * update file extent items in the tree leaf to point to 1504 * update file extent items in the tree leaf to point to
1115 * the new locations. 1505 * the new locations.
1116 */ 1506 */
1117static int replace_file_extents(struct btrfs_trans_handle *trans, 1507static noinline_for_stack
1118 struct reloc_control *rc, 1508int replace_file_extents(struct btrfs_trans_handle *trans,
1119 struct btrfs_root *root, 1509 struct reloc_control *rc,
1120 struct extent_buffer *leaf, 1510 struct btrfs_root *root,
1121 struct list_head *inode_list) 1511 struct extent_buffer *leaf)
1122{ 1512{
1123 struct btrfs_key key; 1513 struct btrfs_key key;
1124 struct btrfs_file_extent_item *fi; 1514 struct btrfs_file_extent_item *fi;
1125 struct inode *inode = NULL; 1515 struct inode *inode = NULL;
1126 struct inodevec *ivec = NULL;
1127 u64 parent; 1516 u64 parent;
1128 u64 bytenr; 1517 u64 bytenr;
1129 u64 new_bytenr; 1518 u64 new_bytenr = 0;
1130 u64 num_bytes; 1519 u64 num_bytes;
1131 u64 end; 1520 u64 end;
1132 u32 nritems; 1521 u32 nritems;
@@ -1166,21 +1555,12 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1166 * to complete and drop the extent cache 1555 * to complete and drop the extent cache
1167 */ 1556 */
1168 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 1557 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1169 if (!ivec || ivec->nr == INODEVEC_SIZE) {
1170 ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1171 BUG_ON(!ivec);
1172 ivec->nr = 0;
1173 list_add_tail(&ivec->list, inode_list);
1174 }
1175 if (first) { 1558 if (first) {
1176 inode = find_next_inode(root, key.objectid); 1559 inode = find_next_inode(root, key.objectid);
1177 if (inode)
1178 ivec->inode[ivec->nr++] = inode;
1179 first = 0; 1560 first = 0;
1180 } else if (inode && inode->i_ino < key.objectid) { 1561 } else if (inode && inode->i_ino < key.objectid) {
1562 btrfs_add_delayed_iput(inode);
1181 inode = find_next_inode(root, key.objectid); 1563 inode = find_next_inode(root, key.objectid);
1182 if (inode)
1183 ivec->inode[ivec->nr++] = inode;
1184 } 1564 }
1185 if (inode && inode->i_ino == key.objectid) { 1565 if (inode && inode->i_ino == key.objectid) {
1186 end = key.offset + 1566 end = key.offset +
@@ -1204,8 +1584,10 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1204 1584
1205 ret = get_new_location(rc->data_inode, &new_bytenr, 1585 ret = get_new_location(rc->data_inode, &new_bytenr,
1206 bytenr, num_bytes); 1586 bytenr, num_bytes);
1207 if (ret > 0) 1587 if (ret > 0) {
1588 WARN_ON(1);
1208 continue; 1589 continue;
1590 }
1209 BUG_ON(ret < 0); 1591 BUG_ON(ret < 0);
1210 1592
1211 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); 1593 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
@@ -1225,6 +1607,8 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1225 } 1607 }
1226 if (dirty) 1608 if (dirty)
1227 btrfs_mark_buffer_dirty(leaf); 1609 btrfs_mark_buffer_dirty(leaf);
1610 if (inode)
1611 btrfs_add_delayed_iput(inode);
1228 return 0; 1612 return 0;
1229} 1613}
1230 1614
@@ -1248,11 +1632,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
1248 * if no block got replaced, 0 is returned. if there are other 1632 * if no block got replaced, 0 is returned. if there are other
1249 * errors, a negative error number is returned. 1633 * errors, a negative error number is returned.
1250 */ 1634 */
1251static int replace_path(struct btrfs_trans_handle *trans, 1635static noinline_for_stack
1252 struct btrfs_root *dest, struct btrfs_root *src, 1636int replace_path(struct btrfs_trans_handle *trans,
1253 struct btrfs_path *path, struct btrfs_key *next_key, 1637 struct btrfs_root *dest, struct btrfs_root *src,
1254 struct extent_buffer **leaf, 1638 struct btrfs_path *path, struct btrfs_key *next_key,
1255 int lowest_level, int max_level) 1639 int lowest_level, int max_level)
1256{ 1640{
1257 struct extent_buffer *eb; 1641 struct extent_buffer *eb;
1258 struct extent_buffer *parent; 1642 struct extent_buffer *parent;
@@ -1263,16 +1647,16 @@ static int replace_path(struct btrfs_trans_handle *trans,
1263 u64 new_ptr_gen; 1647 u64 new_ptr_gen;
1264 u64 last_snapshot; 1648 u64 last_snapshot;
1265 u32 blocksize; 1649 u32 blocksize;
1650 int cow = 0;
1266 int level; 1651 int level;
1267 int ret; 1652 int ret;
1268 int slot; 1653 int slot;
1269 1654
1270 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 1655 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1271 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); 1656 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1272 BUG_ON(lowest_level > 1 && leaf);
1273 1657
1274 last_snapshot = btrfs_root_last_snapshot(&src->root_item); 1658 last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1275 1659again:
1276 slot = path->slots[lowest_level]; 1660 slot = path->slots[lowest_level];
1277 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); 1661 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1278 1662
@@ -1286,8 +1670,10 @@ static int replace_path(struct btrfs_trans_handle *trans,
1286 return 0; 1670 return 0;
1287 } 1671 }
1288 1672
1289 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); 1673 if (cow) {
1290 BUG_ON(ret); 1674 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1675 BUG_ON(ret);
1676 }
1291 btrfs_set_lock_blocking(eb); 1677 btrfs_set_lock_blocking(eb);
1292 1678
1293 if (next_key) { 1679 if (next_key) {
@@ -1331,7 +1717,7 @@ static int replace_path(struct btrfs_trans_handle *trans,
1331 1717
1332 if (new_bytenr == 0 || old_ptr_gen > last_snapshot || 1718 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1333 memcmp_node_keys(parent, slot, path, level)) { 1719 memcmp_node_keys(parent, slot, path, level)) {
1334 if (level <= lowest_level && !leaf) { 1720 if (level <= lowest_level) {
1335 ret = 0; 1721 ret = 0;
1336 break; 1722 break;
1337 } 1723 }
@@ -1339,16 +1725,12 @@ static int replace_path(struct btrfs_trans_handle *trans,
1339 eb = read_tree_block(dest, old_bytenr, blocksize, 1725 eb = read_tree_block(dest, old_bytenr, blocksize,
1340 old_ptr_gen); 1726 old_ptr_gen);
1341 btrfs_tree_lock(eb); 1727 btrfs_tree_lock(eb);
1342 ret = btrfs_cow_block(trans, dest, eb, parent, 1728 if (cow) {
1343 slot, &eb); 1729 ret = btrfs_cow_block(trans, dest, eb, parent,
1344 BUG_ON(ret); 1730 slot, &eb);
1345 btrfs_set_lock_blocking(eb); 1731 BUG_ON(ret);
1346
1347 if (level <= lowest_level) {
1348 *leaf = eb;
1349 ret = 0;
1350 break;
1351 } 1732 }
1733 btrfs_set_lock_blocking(eb);
1352 1734
1353 btrfs_tree_unlock(parent); 1735 btrfs_tree_unlock(parent);
1354 free_extent_buffer(parent); 1736 free_extent_buffer(parent);
@@ -1357,6 +1739,13 @@ static int replace_path(struct btrfs_trans_handle *trans,
1357 continue; 1739 continue;
1358 } 1740 }
1359 1741
1742 if (!cow) {
1743 btrfs_tree_unlock(parent);
1744 free_extent_buffer(parent);
1745 cow = 1;
1746 goto again;
1747 }
1748
1360 btrfs_node_key_to_cpu(path->nodes[level], &key, 1749 btrfs_node_key_to_cpu(path->nodes[level], &key,
1361 path->slots[level]); 1750 path->slots[level]);
1362 btrfs_release_path(src, path); 1751 btrfs_release_path(src, path);
@@ -1562,20 +1951,6 @@ static int invalidate_extent_cache(struct btrfs_root *root,
1562 return 0; 1951 return 0;
1563} 1952}
1564 1953
1565static void put_inodes(struct list_head *list)
1566{
1567 struct inodevec *ivec;
1568 while (!list_empty(list)) {
1569 ivec = list_entry(list->next, struct inodevec, list);
1570 list_del(&ivec->list);
1571 while (ivec->nr > 0) {
1572 ivec->nr--;
1573 iput(ivec->inode[ivec->nr]);
1574 }
1575 kfree(ivec);
1576 }
1577}
1578
1579static int find_next_key(struct btrfs_path *path, int level, 1954static int find_next_key(struct btrfs_path *path, int level,
1580 struct btrfs_key *key) 1955 struct btrfs_key *key)
1581 1956
@@ -1608,13 +1983,14 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1608 struct btrfs_root *reloc_root; 1983 struct btrfs_root *reloc_root;
1609 struct btrfs_root_item *root_item; 1984 struct btrfs_root_item *root_item;
1610 struct btrfs_path *path; 1985 struct btrfs_path *path;
1611 struct extent_buffer *leaf = NULL; 1986 struct extent_buffer *leaf;
1612 unsigned long nr; 1987 unsigned long nr;
1613 int level; 1988 int level;
1614 int max_level; 1989 int max_level;
1615 int replaced = 0; 1990 int replaced = 0;
1616 int ret; 1991 int ret;
1617 int err = 0; 1992 int err = 0;
1993 u32 min_reserved;
1618 1994
1619 path = btrfs_alloc_path(); 1995 path = btrfs_alloc_path();
1620 if (!path) 1996 if (!path)
@@ -1648,34 +2024,23 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1648 btrfs_unlock_up_safe(path, 0); 2024 btrfs_unlock_up_safe(path, 0);
1649 } 2025 }
1650 2026
1651 if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { 2027 min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
1652 trans = btrfs_start_transaction(root, 1); 2028 memset(&next_key, 0, sizeof(next_key));
1653 2029
1654 leaf = path->nodes[0]; 2030 while (1) {
1655 btrfs_item_key_to_cpu(leaf, &key, 0); 2031 trans = btrfs_start_transaction(root, 0);
1656 btrfs_release_path(reloc_root, path); 2032 trans->block_rsv = rc->block_rsv;
1657 2033
1658 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2034 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
1659 if (ret < 0) { 2035 min_reserved, 0);
1660 err = ret; 2036 if (ret) {
1661 goto out; 2037 BUG_ON(ret != -EAGAIN);
2038 ret = btrfs_commit_transaction(trans, root);
2039 BUG_ON(ret);
2040 continue;
1662 } 2041 }
1663 2042
1664 leaf = path->nodes[0];
1665 btrfs_unlock_up_safe(path, 1);
1666 ret = replace_file_extents(trans, rc, root, leaf,
1667 &inode_list);
1668 if (ret < 0)
1669 err = ret;
1670 goto out;
1671 }
1672
1673 memset(&next_key, 0, sizeof(next_key));
1674
1675 while (1) {
1676 leaf = NULL;
1677 replaced = 0; 2043 replaced = 0;
1678 trans = btrfs_start_transaction(root, 1);
1679 max_level = level; 2044 max_level = level;
1680 2045
1681 ret = walk_down_reloc_tree(reloc_root, path, &level); 2046 ret = walk_down_reloc_tree(reloc_root, path, &level);
@@ -1689,14 +2054,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1689 if (!find_next_key(path, level, &key) && 2054 if (!find_next_key(path, level, &key) &&
1690 btrfs_comp_cpu_keys(&next_key, &key) >= 0) { 2055 btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1691 ret = 0; 2056 ret = 0;
1692 } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1693 ret = replace_path(trans, root, reloc_root,
1694 path, &next_key, &leaf,
1695 level, max_level);
1696 } else { 2057 } else {
1697 ret = replace_path(trans, root, reloc_root, 2058 ret = replace_path(trans, root, reloc_root, path,
1698 path, &next_key, NULL, 2059 &next_key, level, max_level);
1699 level, max_level);
1700 } 2060 }
1701 if (ret < 0) { 2061 if (ret < 0) {
1702 err = ret; 2062 err = ret;
@@ -1708,16 +2068,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1708 btrfs_node_key_to_cpu(path->nodes[level], &key, 2068 btrfs_node_key_to_cpu(path->nodes[level], &key,
1709 path->slots[level]); 2069 path->slots[level]);
1710 replaced = 1; 2070 replaced = 1;
1711 } else if (leaf) {
1712 /*
1713 * no block got replaced, try replacing file extents
1714 */
1715 btrfs_item_key_to_cpu(leaf, &key, 0);
1716 ret = replace_file_extents(trans, rc, root, leaf,
1717 &inode_list);
1718 btrfs_tree_unlock(leaf);
1719 free_extent_buffer(leaf);
1720 BUG_ON(ret < 0);
1721 } 2071 }
1722 2072
1723 ret = walk_up_reloc_tree(reloc_root, path, &level); 2073 ret = walk_up_reloc_tree(reloc_root, path, &level);
@@ -1734,15 +2084,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1734 root_item->drop_level = level; 2084 root_item->drop_level = level;
1735 2085
1736 nr = trans->blocks_used; 2086 nr = trans->blocks_used;
1737 btrfs_end_transaction(trans, root); 2087 btrfs_end_transaction_throttle(trans, root);
1738 2088
1739 btrfs_btree_balance_dirty(root, nr); 2089 btrfs_btree_balance_dirty(root, nr);
1740 2090
1741 /*
1742 * put inodes outside transaction, otherwise we may deadlock.
1743 */
1744 put_inodes(&inode_list);
1745
1746 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2091 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1747 invalidate_extent_cache(root, &key, &next_key); 2092 invalidate_extent_cache(root, &key, &next_key);
1748 } 2093 }
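The transaction loop above treats -EAGAIN from btrfs_block_rsv_check() as "commit to free space, then retry" rather than as a failure. A toy model of that retry shape; the space accounting below is invented purely for illustration.

#include <stdio.h>

#define EAGAIN 11

static int space;				/* pretend free metadata space */

static int rsv_check(int need)			/* stand-in for the rsv check */
{
	return space >= need ? 0 : -EAGAIN;
}

static void commit_transaction(void)		/* committing frees space */
{
	space += 4;
}

int main(void)
{
	int min_reserved = 10;

	while (1) {
		/* start a transaction, then make sure the rsv is full */
		if (rsv_check(min_reserved)) {
			commit_transaction();	/* -EAGAIN: commit and retry */
			continue;
		}
		puts("reservation ok, merge one tree path");
		break;
	}
	return 0;
}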
@@ -1765,87 +2110,125 @@ out:
1765 sizeof(root_item->drop_progress)); 2110 sizeof(root_item->drop_progress));
1766 root_item->drop_level = 0; 2111 root_item->drop_level = 0;
1767 btrfs_set_root_refs(root_item, 0); 2112 btrfs_set_root_refs(root_item, 0);
2113 btrfs_update_reloc_root(trans, root);
1768 } 2114 }
1769 2115
1770 nr = trans->blocks_used; 2116 nr = trans->blocks_used;
1771 btrfs_end_transaction(trans, root); 2117 btrfs_end_transaction_throttle(trans, root);
1772 2118
1773 btrfs_btree_balance_dirty(root, nr); 2119 btrfs_btree_balance_dirty(root, nr);
1774 2120
1775 put_inodes(&inode_list);
1776
1777 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2121 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1778 invalidate_extent_cache(root, &key, &next_key); 2122 invalidate_extent_cache(root, &key, &next_key);
1779 2123
1780 return err; 2124 return err;
1781} 2125}
1782 2126
1783/* 2127static noinline_for_stack
1784 * callback for the work threads. 2128int prepare_to_merge(struct reloc_control *rc, int err)
1785 * this function merges reloc tree with corresponding fs tree,
1786 * and then drops the reloc tree.
1787 */
1788static void merge_func(struct btrfs_work *work)
1789{ 2129{
1790 struct btrfs_trans_handle *trans; 2130 struct btrfs_root *root = rc->extent_root;
1791 struct btrfs_root *root;
1792 struct btrfs_root *reloc_root; 2131 struct btrfs_root *reloc_root;
1793 struct async_merge *async; 2132 struct btrfs_trans_handle *trans;
2133 LIST_HEAD(reloc_roots);
2134 u64 num_bytes = 0;
2135 int ret;
2136 int retries = 0;
2137
2138 mutex_lock(&root->fs_info->trans_mutex);
2139 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2140 rc->merging_rsv_size += rc->nodes_relocated * 2;
2141 mutex_unlock(&root->fs_info->trans_mutex);
2142again:
2143 if (!err) {
2144 num_bytes = rc->merging_rsv_size;
2145 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
2146 num_bytes, &retries);
2147 if (ret)
2148 err = ret;
2149 }
2150
2151 trans = btrfs_join_transaction(rc->extent_root, 1);
2152
2153 if (!err) {
2154 if (num_bytes != rc->merging_rsv_size) {
2155 btrfs_end_transaction(trans, rc->extent_root);
2156 btrfs_block_rsv_release(rc->extent_root,
2157 rc->block_rsv, num_bytes);
2158 retries = 0;
2159 goto again;
2160 }
2161 }
1794 2162
1795 async = container_of(work, struct async_merge, work); 2163 rc->merge_reloc_tree = 1;
1796 reloc_root = async->root; 2164
2165 while (!list_empty(&rc->reloc_roots)) {
2166 reloc_root = list_entry(rc->reloc_roots.next,
2167 struct btrfs_root, root_list);
2168 list_del_init(&reloc_root->root_list);
1797 2169
1798 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1799 root = read_fs_root(reloc_root->fs_info, 2170 root = read_fs_root(reloc_root->fs_info,
1800 reloc_root->root_key.offset); 2171 reloc_root->root_key.offset);
1801 BUG_ON(IS_ERR(root)); 2172 BUG_ON(IS_ERR(root));
1802 BUG_ON(root->reloc_root != reloc_root); 2173 BUG_ON(root->reloc_root != reloc_root);
1803 2174
1804 merge_reloc_root(async->rc, root); 2175 /*
1805 2176 * set reference count to 1, so btrfs_recover_relocation
1806 trans = btrfs_start_transaction(root, 1); 2177 * knows it should resume merging
2178 */
2179 if (!err)
2180 btrfs_set_root_refs(&reloc_root->root_item, 1);
1807 btrfs_update_reloc_root(trans, root); 2181 btrfs_update_reloc_root(trans, root);
1808 btrfs_end_transaction(trans, root);
1809 }
1810 2182
1811 btrfs_drop_snapshot(reloc_root, 0); 2183 list_add(&reloc_root->root_list, &reloc_roots);
2184 }
1812 2185
1813 if (atomic_dec_and_test(async->num_pending)) 2186 list_splice(&reloc_roots, &rc->reloc_roots);
1814 complete(async->done);
1815 2187
1816 kfree(async); 2188 if (!err)
2189 btrfs_commit_transaction(trans, rc->extent_root);
2190 else
2191 btrfs_end_transaction(trans, rc->extent_root);
2192 return err;
1817} 2193}
1818 2194
1819static int merge_reloc_roots(struct reloc_control *rc) 2195static noinline_for_stack
2196int merge_reloc_roots(struct reloc_control *rc)
1820{ 2197{
1821 struct async_merge *async;
1822 struct btrfs_root *root; 2198 struct btrfs_root *root;
1823 struct completion done; 2199 struct btrfs_root *reloc_root;
1824 atomic_t num_pending; 2200 LIST_HEAD(reloc_roots);
2201 int found = 0;
2202 int ret;
2203again:
2204 root = rc->extent_root;
2205 mutex_lock(&root->fs_info->trans_mutex);
2206 list_splice_init(&rc->reloc_roots, &reloc_roots);
2207 mutex_unlock(&root->fs_info->trans_mutex);
1825 2208
1826 init_completion(&done); 2209 while (!list_empty(&reloc_roots)) {
1827 atomic_set(&num_pending, 1); 2210 found = 1;
2211 reloc_root = list_entry(reloc_roots.next,
2212 struct btrfs_root, root_list);
1828 2213
1829 while (!list_empty(&rc->reloc_roots)) { 2214 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1830 root = list_entry(rc->reloc_roots.next, 2215 root = read_fs_root(reloc_root->fs_info,
1831 struct btrfs_root, root_list); 2216 reloc_root->root_key.offset);
1832 list_del_init(&root->root_list); 2217 BUG_ON(IS_ERR(root));
2218 BUG_ON(root->reloc_root != reloc_root);
1833 2219
1834 async = kmalloc(sizeof(*async), GFP_NOFS); 2220 ret = merge_reloc_root(rc, root);
1835 BUG_ON(!async); 2221 BUG_ON(ret);
1836 async->work.func = merge_func; 2222 } else {
1837 async->work.flags = 0; 2223 list_del_init(&reloc_root->root_list);
1838 async->rc = rc; 2224 }
1839 async->root = root; 2225 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
1840 async->done = &done;
1841 async->num_pending = &num_pending;
1842 atomic_inc(&num_pending);
1843 btrfs_queue_worker(&rc->workers, &async->work);
1844 } 2226 }
1845 2227
1846 if (!atomic_dec_and_test(&num_pending)) 2228 if (found) {
1847 wait_for_completion(&done); 2229 found = 0;
1848 2230 goto again;
2231 }
1849 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2232 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1850 return 0; 2233 return 0;
1851} 2234}
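merge_reloc_roots() drains rc->reloc_roots by splicing the whole list out under trans_mutex, merging each root, then looping in case more roots were queued meanwhile. A compact user-space model of that splice-and-retry drain; the counter stands in for the shared list.

#include <stdio.h>

static int pending = 3;			/* roots queued for merging */

static int splice_init(void)		/* take everything off the shared list */
{
	int n = pending;
	pending = 0;
	return n;
}

int main(void)
{
	int found;
again:
	found = 0;
	for (int n = splice_init(); n > 0; n--) {
		found = 1;
		puts("merge one reloc root");
	}
	if (found)			/* merging may have re-queued roots */
		goto again;
	return 0;
}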
@@ -1876,119 +2259,169 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1876 return btrfs_record_root_in_trans(trans, root); 2259 return btrfs_record_root_in_trans(trans, root);
1877} 2260}
1878 2261
1879/* 2262static noinline_for_stack
1880 * select one tree from trees that reference the block. 2263struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1881 * for blocks in reference counted trees, we prefer the reloc tree. 2264 struct reloc_control *rc,
1882 * if no reloc tree found and reloc_only is true, NULL is returned. 2265 struct backref_node *node,
1883 */ 2266 struct backref_edge *edges[], int *nr)
1884static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1885 struct backref_node *node,
1886 struct backref_edge *edges[],
1887 int *nr, int reloc_only)
1888{ 2267{
1889 struct backref_node *next; 2268 struct backref_node *next;
1890 struct btrfs_root *root; 2269 struct btrfs_root *root;
1891 int index; 2270 int index = 0;
1892 int loop = 0; 2271
1893again:
1894 index = 0;
1895 next = node; 2272 next = node;
1896 while (1) { 2273 while (1) {
1897 cond_resched(); 2274 cond_resched();
1898 next = walk_up_backref(next, edges, &index); 2275 next = walk_up_backref(next, edges, &index);
1899 root = next->root; 2276 root = next->root;
1900 if (!root) { 2277 BUG_ON(!root);
1901 BUG_ON(!node->old_root); 2278 BUG_ON(!root->ref_cows);
1902 goto skip;
1903 }
1904
1905 /* no other choice for non-reference counted tree */
1906 if (!root->ref_cows) {
1907 BUG_ON(reloc_only);
1908 break;
1909 }
1910 2279
1911 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2280 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1912 record_reloc_root_in_trans(trans, root); 2281 record_reloc_root_in_trans(trans, root);
1913 break; 2282 break;
1914 } 2283 }
1915 2284
1916 if (loop) { 2285 btrfs_record_root_in_trans(trans, root);
1917 btrfs_record_root_in_trans(trans, root); 2286 root = root->reloc_root;
2287
2288 if (next->new_bytenr != root->node->start) {
2289 BUG_ON(next->new_bytenr);
2290 BUG_ON(!list_empty(&next->list));
2291 next->new_bytenr = root->node->start;
2292 next->root = root;
2293 list_add_tail(&next->list,
2294 &rc->backref_cache.changed);
2295 __mark_block_processed(rc, next);
1918 break; 2296 break;
1919 } 2297 }
1920 2298
1921 if (reloc_only || next != node) { 2299 WARN_ON(1);
1922 if (!root->reloc_root)
1923 btrfs_record_root_in_trans(trans, root);
1924 root = root->reloc_root;
1925 /*
1926 * if the reloc tree was created in current
1927 * transaction, there is no node in the backref tree
1928 * that corresponds to the root of the reloc tree.
1929 */
1930 if (btrfs_root_last_snapshot(&root->root_item) ==
1931 trans->transid - 1)
1932 break;
1933 }
1934skip:
1935 root = NULL; 2300 root = NULL;
1936 next = walk_down_backref(edges, &index); 2301 next = walk_down_backref(edges, &index);
1937 if (!next || next->level <= node->level) 2302 if (!next || next->level <= node->level)
1938 break; 2303 break;
1939 } 2304 }
2305 if (!root)
2306 return NULL;
1940 2307
1941 if (!root && !loop && !reloc_only) { 2308 *nr = index;
1942 loop = 1; 2309 next = node;
1943 goto again; 2310 /* setup backref node path for btrfs_reloc_cow_block */
2311 while (1) {
2312 rc->backref_cache.path[next->level] = next;
2313 if (--index < 0)
2314 break;
2315 next = edges[index]->node[UPPER];
1944 } 2316 }
1945
1946 if (root)
1947 *nr = index;
1948 else
1949 *nr = 0;
1950
1951 return root; 2317 return root;
1952} 2318}
1953 2319
2320/*
2321 * select a tree root for relocation. return NULL if the block
2322 * is reference counted. we should use do_relocation() in this
2323 * case. return a tree root pointer if the block isn't reference
2324 * counted. return -ENOENT if the block is the root of a reloc tree.
2325 */
1954static noinline_for_stack 2326static noinline_for_stack
1955struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, 2327struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1956 struct backref_node *node) 2328 struct backref_node *node)
1957{ 2329{
2330 struct backref_node *next;
2331 struct btrfs_root *root;
2332 struct btrfs_root *fs_root = NULL;
1958 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2333 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1959 int nr; 2334 int index = 0;
1960 return __select_one_root(trans, node, edges, &nr, 0); 2335
2336 next = node;
2337 while (1) {
2338 cond_resched();
2339 next = walk_up_backref(next, edges, &index);
2340 root = next->root;
2341 BUG_ON(!root);
2342
2343 /* no other choice for non-reference counted tree */
2344 if (!root->ref_cows)
2345 return root;
2346
2347 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2348 fs_root = root;
2349
2350 if (next != node)
2351 return NULL;
2352
2353 next = walk_down_backref(edges, &index);
2354 if (!next || next->level <= node->level)
2355 break;
2356 }
2357
2358 if (!fs_root)
2359 return ERR_PTR(-ENOENT);
2360 return fs_root;
1961} 2361}
1962 2362
1963static noinline_for_stack 2363static noinline_for_stack
1964struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2364u64 calcu_metadata_size(struct reloc_control *rc,
1965 struct backref_node *node, 2365 struct backref_node *node, int reserve)
1966 struct backref_edge *edges[], int *nr)
1967{ 2366{
1968 return __select_one_root(trans, node, edges, nr, 1); 2367 struct backref_node *next = node;
2368 struct backref_edge *edge;
2369 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2370 u64 num_bytes = 0;
2371 int index = 0;
2372
2373 BUG_ON(reserve && node->processed);
2374
2375 while (next) {
2376 cond_resched();
2377 while (1) {
2378 if (next->processed && (reserve || next != node))
2379 break;
2380
2381 num_bytes += btrfs_level_size(rc->extent_root,
2382 next->level);
2383
2384 if (list_empty(&next->upper))
2385 break;
2386
2387 edge = list_entry(next->upper.next,
2388 struct backref_edge, list[LOWER]);
2389 edges[index++] = edge;
2390 next = edge->node[UPPER];
2391 }
2392 next = walk_down_backref(edges, &index);
2393 }
2394 return num_bytes;
1969} 2395}
1970 2396
1971static void grab_path_buffers(struct btrfs_path *path, 2397static int reserve_metadata_space(struct btrfs_trans_handle *trans,
1972 struct backref_node *node, 2398 struct reloc_control *rc,
1973 struct backref_edge *edges[], int nr) 2399 struct backref_node *node)
1974{ 2400{
1975 int i = 0; 2401 struct btrfs_root *root = rc->extent_root;
1976 while (1) { 2402 u64 num_bytes;
1977 drop_node_buffer(node); 2403 int ret;
1978 node->eb = path->nodes[node->level]; 2404
1979 BUG_ON(!node->eb); 2405 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
1980 if (path->locks[node->level])
1981 node->locked = 1;
1982 path->nodes[node->level] = NULL;
1983 path->locks[node->level] = 0;
1984
1985 if (i >= nr)
1986 break;
1987 2406
1988 edges[i]->blockptr = node->eb->start; 2407 trans->block_rsv = rc->block_rsv;
1989 node = edges[i]->node[UPPER]; 2408 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
1990 i++; 2409 &rc->block_rsv_retries);
2410 if (ret) {
2411 if (ret == -EAGAIN)
2412 rc->commit_transaction = 1;
2413 return ret;
1991 } 2414 }
2415
2416 rc->block_rsv_retries = 0;
2417 return 0;
2418}
2419
2420static void release_metadata_space(struct reloc_control *rc,
2421 struct backref_node *node)
2422{
2423 u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
2424 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
1992} 2425}
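calcu_metadata_size() sums the block size of every node on the backref path, and reserve_metadata_space() doubles the result because each block may be COWed in both the fs tree and the reloc tree. A back-of-envelope version of that sizing; the 4K nodesize is an assumption for illustration, not read from any filesystem.

#include <stdio.h>

int main(void)
{
	unsigned long nodesize = 4096;	/* assumed metadata block size */
	int path_blocks = 3;		/* blocks on the backref path */
	unsigned long rsv = nodesize * path_blocks * 2;

	printf("reserve %lu bytes\n", rsv);	/* 24576 */
	return 0;
}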
1993 2426
1994/* 2427/*
@@ -1999,6 +2432,7 @@ static void grab_path_buffers(struct btrfs_path *path,
1999 * in that case this function just updates pointers. 2432 * in that case this function just updates pointers.
2000 */ 2433 */
2001static int do_relocation(struct btrfs_trans_handle *trans, 2434static int do_relocation(struct btrfs_trans_handle *trans,
2435 struct reloc_control *rc,
2002 struct backref_node *node, 2436 struct backref_node *node,
2003 struct btrfs_key *key, 2437 struct btrfs_key *key,
2004 struct btrfs_path *path, int lowest) 2438 struct btrfs_path *path, int lowest)
@@ -2019,18 +2453,25 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2019 BUG_ON(lowest && node->eb); 2453 BUG_ON(lowest && node->eb);
2020 2454
2021 path->lowest_level = node->level + 1; 2455 path->lowest_level = node->level + 1;
2456 rc->backref_cache.path[node->level] = node;
2022 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2457 list_for_each_entry(edge, &node->upper, list[LOWER]) {
2023 cond_resched(); 2458 cond_resched();
2024 if (node->eb && node->eb->start == edge->blockptr)
2025 continue;
2026 2459
2027 upper = edge->node[UPPER]; 2460 upper = edge->node[UPPER];
2028 root = select_reloc_root(trans, upper, edges, &nr); 2461 root = select_reloc_root(trans, rc, upper, edges, &nr);
2029 if (!root) 2462 BUG_ON(!root);
2030 continue; 2463
2031 2464 if (upper->eb && !upper->locked) {
2032 if (upper->eb && !upper->locked) 2465 if (!lowest) {
2466 ret = btrfs_bin_search(upper->eb, key,
2467 upper->level, &slot);
2468 BUG_ON(ret);
2469 bytenr = btrfs_node_blockptr(upper->eb, slot);
2470 if (node->eb->start == bytenr)
2471 goto next;
2472 }
2033 drop_node_buffer(upper); 2473 drop_node_buffer(upper);
2474 }
2034 2475
2035 if (!upper->eb) { 2476 if (!upper->eb) {
2036 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2477 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
@@ -2040,11 +2481,17 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2040 } 2481 }
2041 BUG_ON(ret > 0); 2482 BUG_ON(ret > 0);
2042 2483
2043 slot = path->slots[upper->level]; 2484 if (!upper->eb) {
2485 upper->eb = path->nodes[upper->level];
2486 path->nodes[upper->level] = NULL;
2487 } else {
2488 BUG_ON(upper->eb != path->nodes[upper->level]);
2489 }
2044 2490
2045 btrfs_unlock_up_safe(path, upper->level + 1); 2491 upper->locked = 1;
2046 grab_path_buffers(path, upper, edges, nr); 2492 path->locks[upper->level] = 0;
2047 2493
2494 slot = path->slots[upper->level];
2048 btrfs_release_path(NULL, path); 2495 btrfs_release_path(NULL, path);
2049 } else { 2496 } else {
2050 ret = btrfs_bin_search(upper->eb, key, upper->level, 2497 ret = btrfs_bin_search(upper->eb, key, upper->level,
@@ -2053,14 +2500,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2053 } 2500 }
2054 2501
2055 bytenr = btrfs_node_blockptr(upper->eb, slot); 2502 bytenr = btrfs_node_blockptr(upper->eb, slot);
2056 if (!lowest) { 2503 if (lowest) {
2057 if (node->eb->start == bytenr) { 2504 BUG_ON(bytenr != node->bytenr);
2058 btrfs_tree_unlock(upper->eb);
2059 upper->locked = 0;
2060 continue;
2061 }
2062 } else { 2505 } else {
2063 BUG_ON(node->bytenr != bytenr); 2506 if (node->eb->start == bytenr)
2507 goto next;
2064 } 2508 }
2065 2509
2066 blocksize = btrfs_level_size(root, node->level); 2510 blocksize = btrfs_level_size(root, node->level);
@@ -2072,13 +2516,13 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2072 if (!node->eb) { 2516 if (!node->eb) {
2073 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2517 ret = btrfs_cow_block(trans, root, eb, upper->eb,
2074 slot, &eb); 2518 slot, &eb);
2519 btrfs_tree_unlock(eb);
2520 free_extent_buffer(eb);
2075 if (ret < 0) { 2521 if (ret < 0) {
2076 err = ret; 2522 err = ret;
2077 break; 2523 goto next;
2078 } 2524 }
2079 btrfs_set_lock_blocking(eb); 2525 BUG_ON(node->eb != eb);
2080 node->eb = eb;
2081 node->locked = 1;
2082 } else { 2526 } else {
2083 btrfs_set_node_blockptr(upper->eb, slot, 2527 btrfs_set_node_blockptr(upper->eb, slot,
2084 node->eb->start); 2528 node->eb->start);
@@ -2096,67 +2540,80 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2096 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2540 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2097 BUG_ON(ret); 2541 BUG_ON(ret);
2098 } 2542 }
2099 if (!lowest) { 2543next:
2100 btrfs_tree_unlock(upper->eb); 2544 if (!upper->pending)
2101 upper->locked = 0; 2545 drop_node_buffer(upper);
2102 } 2546 else
2547 unlock_node_buffer(upper);
2548 if (err)
2549 break;
2103 } 2550 }
2551
2552 if (!err && node->pending) {
2553 drop_node_buffer(node);
2554 list_move_tail(&node->list, &rc->backref_cache.changed);
2555 node->pending = 0;
2556 }
2557
2104 path->lowest_level = 0; 2558 path->lowest_level = 0;
2559 BUG_ON(err == -ENOSPC);
2105 return err; 2560 return err;
2106} 2561}
2107 2562
2108static int link_to_upper(struct btrfs_trans_handle *trans, 2563static int link_to_upper(struct btrfs_trans_handle *trans,
2564 struct reloc_control *rc,
2109 struct backref_node *node, 2565 struct backref_node *node,
2110 struct btrfs_path *path) 2566 struct btrfs_path *path)
2111{ 2567{
2112 struct btrfs_key key; 2568 struct btrfs_key key;
2113 if (!node->eb || list_empty(&node->upper))
2114 return 0;
2115 2569
2116 btrfs_node_key_to_cpu(node->eb, &key, 0); 2570 btrfs_node_key_to_cpu(node->eb, &key, 0);
2117 return do_relocation(trans, node, &key, path, 0); 2571 return do_relocation(trans, rc, node, &key, path, 0);
2118} 2572}
2119 2573
2120static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2574static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2121 struct backref_cache *cache, 2575 struct reloc_control *rc,
2122 struct btrfs_path *path) 2576 struct btrfs_path *path, int err)
2123{ 2577{
2578 LIST_HEAD(list);
2579 struct backref_cache *cache = &rc->backref_cache;
2124 struct backref_node *node; 2580 struct backref_node *node;
2125 int level; 2581 int level;
2126 int ret; 2582 int ret;
2127 int err = 0;
2128 2583
2129 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2584 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2130 while (!list_empty(&cache->pending[level])) { 2585 while (!list_empty(&cache->pending[level])) {
2131 node = list_entry(cache->pending[level].next, 2586 node = list_entry(cache->pending[level].next,
2132 struct backref_node, lower); 2587 struct backref_node, list);
2133 BUG_ON(node->level != level); 2588 list_move_tail(&node->list, &list);
2589 BUG_ON(!node->pending);
2134 2590
2135 ret = link_to_upper(trans, node, path); 2591 if (!err) {
2136 if (ret < 0) 2592 ret = link_to_upper(trans, rc, node, path);
2137 err = ret; 2593 if (ret < 0)
2138 /* 2594 err = ret;
2139 * this remove the node from the pending list and 2595 }
2140 * may add some other nodes to the level + 1
2141 * pending list
2142 */
2143 remove_backref_node(cache, node);
2144 } 2596 }
2597 list_splice_init(&list, &cache->pending[level]);
2145 } 2598 }
2146 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2147 return err; 2599 return err;
2148} 2600}
2149 2601
2150static void mark_block_processed(struct reloc_control *rc, 2602static void mark_block_processed(struct reloc_control *rc,
2151 struct backref_node *node) 2603 u64 bytenr, u32 blocksize)
2604{
2605 set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2606 EXTENT_DIRTY, GFP_NOFS);
2607}
2608
2609static void __mark_block_processed(struct reloc_control *rc,
2610 struct backref_node *node)
2152{ 2611{
2153 u32 blocksize; 2612 u32 blocksize;
2154 if (node->level == 0 || 2613 if (node->level == 0 ||
2155 in_block_group(node->bytenr, rc->block_group)) { 2614 in_block_group(node->bytenr, rc->block_group)) {
2156 blocksize = btrfs_level_size(rc->extent_root, node->level); 2615 blocksize = btrfs_level_size(rc->extent_root, node->level);
2157 set_extent_bits(&rc->processed_blocks, node->bytenr, 2616 mark_block_processed(rc, node->bytenr, blocksize);
2158 node->bytenr + blocksize - 1, EXTENT_DIRTY,
2159 GFP_NOFS);
2160 } 2617 }
2161 node->processed = 1; 2618 node->processed = 1;
2162} 2619}
@@ -2179,7 +2636,7 @@ static void update_processed_blocks(struct reloc_control *rc,
2179 if (next->processed) 2636 if (next->processed)
2180 break; 2637 break;
2181 2638
2182 mark_block_processed(rc, next); 2639 __mark_block_processed(rc, next);
2183 2640
2184 if (list_empty(&next->upper)) 2641 if (list_empty(&next->upper))
2185 break; 2642 break;
@@ -2202,138 +2659,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
2202 return 0; 2659 return 0;
2203} 2660}
2204 2661
2205/*
2206 * check if there are any file extent pointers in the leaf that point
2207 * to data requiring processing
2208 */
2209static int check_file_extents(struct reloc_control *rc,
2210 u64 bytenr, u32 blocksize, u64 ptr_gen)
2211{
2212 struct btrfs_key found_key;
2213 struct btrfs_file_extent_item *fi;
2214 struct extent_buffer *leaf;
2215 u32 nritems;
2216 int i;
2217 int ret = 0;
2218
2219 leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2220
2221 nritems = btrfs_header_nritems(leaf);
2222 for (i = 0; i < nritems; i++) {
2223 cond_resched();
2224 btrfs_item_key_to_cpu(leaf, &found_key, i);
2225 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2226 continue;
2227 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2228 if (btrfs_file_extent_type(leaf, fi) ==
2229 BTRFS_FILE_EXTENT_INLINE)
2230 continue;
2231 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2232 if (bytenr == 0)
2233 continue;
2234 if (in_block_group(bytenr, rc->block_group)) {
2235 ret = 1;
2236 break;
2237 }
2238 }
2239 free_extent_buffer(leaf);
2240 return ret;
2241}
2242
2243/*
2244 * scan child blocks of a given block to find blocks that require processing
2245 */
2246static int add_child_blocks(struct btrfs_trans_handle *trans,
2247 struct reloc_control *rc,
2248 struct backref_node *node,
2249 struct rb_root *blocks)
2250{
2251 struct tree_block *block;
2252 struct rb_node *rb_node;
2253 u64 bytenr;
2254 u64 ptr_gen;
2255 u32 blocksize;
2256 u32 nritems;
2257 int i;
2258 int err = 0;
2259
2260 nritems = btrfs_header_nritems(node->eb);
2261 blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2262 for (i = 0; i < nritems; i++) {
2263 cond_resched();
2264 bytenr = btrfs_node_blockptr(node->eb, i);
2265 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2266 if (ptr_gen == trans->transid)
2267 continue;
2268 if (!in_block_group(bytenr, rc->block_group) &&
2269 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2270 continue;
2271 if (tree_block_processed(bytenr, blocksize, rc))
2272 continue;
2273
2274 readahead_tree_block(rc->extent_root,
2275 bytenr, blocksize, ptr_gen);
2276 }
2277
2278 for (i = 0; i < nritems; i++) {
2279 cond_resched();
2280 bytenr = btrfs_node_blockptr(node->eb, i);
2281 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2282 if (ptr_gen == trans->transid)
2283 continue;
2284 if (!in_block_group(bytenr, rc->block_group) &&
2285 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2286 continue;
2287 if (tree_block_processed(bytenr, blocksize, rc))
2288 continue;
2289 if (!in_block_group(bytenr, rc->block_group) &&
2290 !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2291 continue;
2292
2293 block = kmalloc(sizeof(*block), GFP_NOFS);
2294 if (!block) {
2295 err = -ENOMEM;
2296 break;
2297 }
2298 block->bytenr = bytenr;
2299 btrfs_node_key_to_cpu(node->eb, &block->key, i);
2300 block->level = node->level - 1;
2301 block->key_ready = 1;
2302 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2303 BUG_ON(rb_node);
2304 }
2305 if (err)
2306 free_block_list(blocks);
2307 return err;
2308}
2309
2310/*
2311 * find adjacent blocks that require processing
2312 */
2313static noinline_for_stack
2314int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2315 struct reloc_control *rc,
2316 struct backref_cache *cache,
2317 struct rb_root *blocks, int level,
2318 struct backref_node **upper)
2319{
2320 struct backref_node *node;
2321 int ret = 0;
2322
2323 WARN_ON(!list_empty(&cache->pending[level]));
2324
2325 if (list_empty(&cache->pending[level + 1]))
2326 return 1;
2327
2328 node = list_entry(cache->pending[level + 1].next,
2329 struct backref_node, lower);
2330 if (node->eb)
2331 ret = add_child_blocks(trans, rc, node, blocks);
2332
2333 *upper = node;
2334 return ret;
2335}
2336
2337static int get_tree_block_key(struct reloc_control *rc, 2662static int get_tree_block_key(struct reloc_control *rc,
2338 struct tree_block *block) 2663 struct tree_block *block)
2339{ 2664{
@@ -2371,40 +2696,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
2371 struct btrfs_path *path) 2696 struct btrfs_path *path)
2372{ 2697{
2373 struct btrfs_root *root; 2698 struct btrfs_root *root;
2374 int ret; 2699 int release = 0;
2700 int ret = 0;
2375 2701
2702 if (!node)
2703 return 0;
2704
2705 BUG_ON(node->processed);
2376 root = select_one_root(trans, node); 2706 root = select_one_root(trans, node);
2377 if (unlikely(!root)) { 2707 if (root == ERR_PTR(-ENOENT)) {
2378 rc->found_old_snapshot = 1;
2379 update_processed_blocks(rc, node); 2708 update_processed_blocks(rc, node);
2380 return 0; 2709 goto out;
2381 } 2710 }
2382 2711
2383 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2712 if (!root || root->ref_cows) {
2384 ret = do_relocation(trans, node, key, path, 1); 2713 ret = reserve_metadata_space(trans, rc, node);
2385 if (ret < 0) 2714 if (ret)
2386 goto out;
2387 if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2388 ret = replace_file_extents(trans, rc, root,
2389 node->eb, NULL);
2390 if (ret < 0)
2391 goto out;
2392 }
2393 drop_node_buffer(node);
2394 } else if (!root->ref_cows) {
2395 path->lowest_level = node->level;
2396 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2397 btrfs_release_path(root, path);
2398 if (ret < 0)
2399 goto out; 2715 goto out;
2400 } else if (root != node->root) { 2716 release = 1;
2401 WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2402 } 2717 }
2403 2718
2404 update_processed_blocks(rc, node); 2719 if (root) {
2405 ret = 0; 2720 if (root->ref_cows) {
2721 BUG_ON(node->new_bytenr);
2722 BUG_ON(!list_empty(&node->list));
2723 btrfs_record_root_in_trans(trans, root);
2724 root = root->reloc_root;
2725 node->new_bytenr = root->node->start;
2726 node->root = root;
2727 list_add_tail(&node->list, &rc->backref_cache.changed);
2728 } else {
2729 path->lowest_level = node->level;
2730 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2731 btrfs_release_path(root, path);
2732 if (ret > 0)
2733 ret = 0;
2734 }
2735 if (!ret)
2736 update_processed_blocks(rc, node);
2737 } else {
2738 ret = do_relocation(trans, rc, node, key, path, 1);
2739 }
2406out: 2740out:
2407 drop_node_buffer(node); 2741 if (ret || node->level == 0 || node->cowonly) {
2742 if (release)
2743 release_metadata_space(rc, node);
2744 remove_backref_node(&rc->backref_cache, node);
2745 }
2408 return ret; 2746 return ret;
2409} 2747}
2410 2748
@@ -2415,12 +2753,10 @@ static noinline_for_stack
2415int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2753int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2416 struct reloc_control *rc, struct rb_root *blocks) 2754 struct reloc_control *rc, struct rb_root *blocks)
2417{ 2755{
2418 struct backref_cache *cache;
2419 struct backref_node *node; 2756 struct backref_node *node;
2420 struct btrfs_path *path; 2757 struct btrfs_path *path;
2421 struct tree_block *block; 2758 struct tree_block *block;
2422 struct rb_node *rb_node; 2759 struct rb_node *rb_node;
2423 int level = -1;
2424 int ret; 2760 int ret;
2425 int err = 0; 2761 int err = 0;
2426 2762
@@ -2428,21 +2764,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2428 if (!path) 2764 if (!path)
2429 return -ENOMEM; 2765 return -ENOMEM;
2430 2766
2431 cache = kmalloc(sizeof(*cache), GFP_NOFS);
2432 if (!cache) {
2433 btrfs_free_path(path);
2434 return -ENOMEM;
2435 }
2436
2437 backref_cache_init(cache);
2438
2439 rb_node = rb_first(blocks); 2767 rb_node = rb_first(blocks);
2440 while (rb_node) { 2768 while (rb_node) {
2441 block = rb_entry(rb_node, struct tree_block, rb_node); 2769 block = rb_entry(rb_node, struct tree_block, rb_node);
2442 if (level == -1)
2443 level = block->level;
2444 else
2445 BUG_ON(level != block->level);
2446 if (!block->key_ready) 2770 if (!block->key_ready)
2447 reada_tree_block(rc, block); 2771 reada_tree_block(rc, block);
2448 rb_node = rb_next(rb_node); 2772 rb_node = rb_next(rb_node);
@@ -2460,7 +2784,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2460 while (rb_node) { 2784 while (rb_node) {
2461 block = rb_entry(rb_node, struct tree_block, rb_node); 2785 block = rb_entry(rb_node, struct tree_block, rb_node);
2462 2786
2463 node = build_backref_tree(rc, cache, &block->key, 2787 node = build_backref_tree(rc, &block->key,
2464 block->level, block->bytenr); 2788 block->level, block->bytenr);
2465 if (IS_ERR(node)) { 2789 if (IS_ERR(node)) {
2466 err = PTR_ERR(node); 2790 err = PTR_ERR(node);
@@ -2470,79 +2794,62 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2470 ret = relocate_tree_block(trans, rc, node, &block->key, 2794 ret = relocate_tree_block(trans, rc, node, &block->key,
2471 path); 2795 path);
2472 if (ret < 0) { 2796 if (ret < 0) {
2473 err = ret; 2797 if (ret != -EAGAIN || rb_node == rb_first(blocks))
2798 err = ret;
2474 goto out; 2799 goto out;
2475 } 2800 }
2476 remove_backref_node(cache, node);
2477 rb_node = rb_next(rb_node); 2801 rb_node = rb_next(rb_node);
2478 } 2802 }
2479 2803out:
2480 if (level > 0)
2481 goto out;
2482
2483 free_block_list(blocks); 2804 free_block_list(blocks);
2805 err = finish_pending_nodes(trans, rc, path, err);
2484 2806
2485 /* 2807 btrfs_free_path(path);
2486 * now backrefs of some upper level tree blocks have been cached, 2808 return err;
2487 * try relocating blocks referenced by these upper level blocks. 2809}
2488 */
2489 while (1) {
2490 struct backref_node *upper = NULL;
2491 if (trans->transaction->in_commit ||
2492 trans->transaction->delayed_refs.flushing)
2493 break;
2494 2810
2495 ret = add_adjacent_blocks(trans, rc, cache, blocks, level, 2811static noinline_for_stack
2496 &upper); 2812int prealloc_file_extent_cluster(struct inode *inode,
2497 if (ret < 0) 2813 struct file_extent_cluster *cluster)
2498 err = ret; 2814{
2499 if (ret != 0) 2815 u64 alloc_hint = 0;
2500 break; 2816 u64 start;
2817 u64 end;
2818 u64 offset = BTRFS_I(inode)->index_cnt;
2819 u64 num_bytes;
2820 int nr = 0;
2821 int ret = 0;
2501 2822
2502 rb_node = rb_first(blocks); 2823 BUG_ON(cluster->start != cluster->boundary[0]);
2503 while (rb_node) { 2824 mutex_lock(&inode->i_mutex);
2504 block = rb_entry(rb_node, struct tree_block, rb_node);
2505 if (trans->transaction->in_commit ||
2506 trans->transaction->delayed_refs.flushing)
2507 goto out;
2508 BUG_ON(!block->key_ready);
2509 node = build_backref_tree(rc, cache, &block->key,
2510 level, block->bytenr);
2511 if (IS_ERR(node)) {
2512 err = PTR_ERR(node);
2513 goto out;
2514 }
2515 2825
2516 ret = relocate_tree_block(trans, rc, node, 2826 ret = btrfs_check_data_free_space(inode, cluster->end +
2517 &block->key, path); 2827 1 - cluster->start);
2518 if (ret < 0) { 2828 if (ret)
2519 err = ret; 2829 goto out;
2520 goto out;
2521 }
2522 remove_backref_node(cache, node);
2523 rb_node = rb_next(rb_node);
2524 }
2525 free_block_list(blocks);
2526 2830
2527 if (upper) { 2831 while (nr < cluster->nr) {
2528 ret = link_to_upper(trans, upper, path); 2832 start = cluster->boundary[nr] - offset;
2529 if (ret < 0) { 2833 if (nr + 1 < cluster->nr)
2530 err = ret; 2834 end = cluster->boundary[nr + 1] - 1 - offset;
2531 break; 2835 else
2532 } 2836 end = cluster->end - offset;
2533 remove_backref_node(cache, upper); 2837
2534 } 2838 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2839 num_bytes = end + 1 - start;
2840 ret = btrfs_prealloc_file_range(inode, 0, start,
2841 num_bytes, num_bytes,
2842 end + 1, &alloc_hint);
2843 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2844 if (ret)
2845 break;
2846 nr++;
2535 } 2847 }
2848 btrfs_free_reserved_data_space(inode, cluster->end +
2849 1 - cluster->start);
2536out: 2850out:
2537 free_block_list(blocks); 2851 mutex_unlock(&inode->i_mutex);
2538 2852 return ret;
2539 ret = finish_pending_nodes(trans, cache, path);
2540 if (ret < 0)
2541 err = ret;
2542
2543 kfree(cache);
2544 btrfs_free_path(path);
2545 return err;
2546} 2853}
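The boundary arithmetic in prealloc_file_extent_cluster() maps each extent boundary back into file offsets by subtracting the inode's index_cnt. A standalone sketch with made-up cluster numbers shows the ranges it would preallocate.

#include <stdio.h>

int main(void)
{
	unsigned long offset = 1000;	/* inode's index_cnt */
	unsigned long boundary[] = { 1000, 5096 };
	unsigned long cl_end = 9191;	/* cluster->end */
	int nr, n = 2;

	for (nr = 0; nr < n; nr++) {
		unsigned long start = boundary[nr] - offset;
		unsigned long end = (nr + 1 < n) ?
			boundary[nr + 1] - 1 - offset : cl_end - offset;
		printf("prealloc file range [%lu, %lu]\n", start, end);
	}
	return 0;
}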
2547 2854
2548static noinline_for_stack 2855static noinline_for_stack
@@ -2588,7 +2895,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
2588 u64 offset = BTRFS_I(inode)->index_cnt; 2895 u64 offset = BTRFS_I(inode)->index_cnt;
2589 unsigned long index; 2896 unsigned long index;
2590 unsigned long last_index; 2897 unsigned long last_index;
2591 unsigned int dirty_page = 0;
2592 struct page *page; 2898 struct page *page;
2593 struct file_ra_state *ra; 2899 struct file_ra_state *ra;
2594 int nr = 0; 2900 int nr = 0;
@@ -2601,21 +2907,24 @@ static int relocate_file_extent_cluster(struct inode *inode,
2601 if (!ra) 2907 if (!ra)
2602 return -ENOMEM; 2908 return -ENOMEM;
2603 2909
2604 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2910 ret = prealloc_file_extent_cluster(inode, cluster);
2605 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2911 if (ret)
2912 goto out;
2606 2913
2607 mutex_lock(&inode->i_mutex); 2914 file_ra_state_init(ra, inode->i_mapping);
2608 2915
2609 i_size_write(inode, cluster->end + 1 - offset);
2610 ret = setup_extent_mapping(inode, cluster->start - offset, 2916 ret = setup_extent_mapping(inode, cluster->start - offset,
2611 cluster->end - offset, cluster->start); 2917 cluster->end - offset, cluster->start);
2612 if (ret) 2918 if (ret)
2613 goto out_unlock; 2919 goto out;
2614
2615 file_ra_state_init(ra, inode->i_mapping);
2616 2920
2617 WARN_ON(cluster->start != cluster->boundary[0]); 2921 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2922 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2618 while (index <= last_index) { 2923 while (index <= last_index) {
2924 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2925 if (ret)
2926 goto out;
2927
2619 page = find_lock_page(inode->i_mapping, index); 2928 page = find_lock_page(inode->i_mapping, index);
2620 if (!page) { 2929 if (!page) {
2621 page_cache_sync_readahead(inode->i_mapping, 2930 page_cache_sync_readahead(inode->i_mapping,
@@ -2623,8 +2932,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
2623 last_index + 1 - index); 2932 last_index + 1 - index);
2624 page = grab_cache_page(inode->i_mapping, index); 2933 page = grab_cache_page(inode->i_mapping, index);
2625 if (!page) { 2934 if (!page) {
2935 btrfs_delalloc_release_metadata(inode,
2936 PAGE_CACHE_SIZE);
2626 ret = -ENOMEM; 2937 ret = -ENOMEM;
2627 goto out_unlock; 2938 goto out;
2628 } 2939 }
2629 } 2940 }
2630 2941
@@ -2640,8 +2951,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
2640 if (!PageUptodate(page)) { 2951 if (!PageUptodate(page)) {
2641 unlock_page(page); 2952 unlock_page(page);
2642 page_cache_release(page); 2953 page_cache_release(page);
2954 btrfs_delalloc_release_metadata(inode,
2955 PAGE_CACHE_SIZE);
2643 ret = -EIO; 2956 ret = -EIO;
2644 goto out_unlock; 2957 goto out;
2645 } 2958 }
2646 } 2959 }
2647 2960
@@ -2660,10 +2973,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
2660 EXTENT_BOUNDARY, GFP_NOFS); 2973 EXTENT_BOUNDARY, GFP_NOFS);
2661 nr++; 2974 nr++;
2662 } 2975 }
2663 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2664 2976
2977 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2665 set_page_dirty(page); 2978 set_page_dirty(page);
2666 dirty_page++;
2667 2979
2668 unlock_extent(&BTRFS_I(inode)->io_tree, 2980 unlock_extent(&BTRFS_I(inode)->io_tree,
2669 page_start, page_end, GFP_NOFS); 2981 page_start, page_end, GFP_NOFS);
@@ -2671,20 +2983,11 @@ static int relocate_file_extent_cluster(struct inode *inode,
2671 page_cache_release(page); 2983 page_cache_release(page);
2672 2984
2673 index++; 2985 index++;
2674 if (nr < cluster->nr && 2986 balance_dirty_pages_ratelimited(inode->i_mapping);
2675 page_end + 1 + offset == cluster->boundary[nr]) { 2987 btrfs_throttle(BTRFS_I(inode)->root);
2676 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2677 dirty_page);
2678 dirty_page = 0;
2679 }
2680 }
2681 if (dirty_page) {
2682 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2683 dirty_page);
2684 } 2988 }
2685 WARN_ON(nr != cluster->nr); 2989 WARN_ON(nr != cluster->nr);
2686out_unlock: 2990out:
2687 mutex_unlock(&inode->i_mutex);
2688 kfree(ra); 2991 kfree(ra);
2689 return ret; 2992 return ret;
2690} 2993}
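
The reworked cluster loop above pairs btrfs_delalloc_reserve_metadata() with a matching release on every early-exit path, instead of batching dirty-page accounting by hand. A minimal stand-alone sketch of that reserve/act/release-on-failure shape, with hypothetical stubs standing in for the btrfs helpers:

#include <errno.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* hypothetical stand-ins for btrfs_delalloc_{reserve,release}_metadata() */
static int reserve_metadata(unsigned long bytes) { (void)bytes; return 0; }
static void release_metadata(unsigned long bytes) { (void)bytes; }
static int process_page(unsigned long index) { return index == 3 ? -EIO : 0; }

static int relocate_pages(unsigned long index, unsigned long last_index)
{
	int ret = 0;

	while (index <= last_index) {
		/* reserve before touching the page, as the patch does */
		ret = reserve_metadata(PAGE_SIZE);
		if (ret)
			break;
		ret = process_page(index);
		if (ret) {
			/* the failed page never went delalloc: give it back */
			release_metadata(PAGE_SIZE);
			break;
		}
		/* on success the reservation stays with the dirty page */
		index++;
	}
	return ret;
}

int main(void)
{
	printf("ret = %d\n", relocate_pages(0, 7)); /* -EIO at page 3 */
	return 0;
}
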
@@ -2870,9 +3173,6 @@ out:
2870static int block_use_full_backref(struct reloc_control *rc, 3173static int block_use_full_backref(struct reloc_control *rc,
2871 struct extent_buffer *eb) 3174 struct extent_buffer *eb)
2872{ 3175{
2873 struct btrfs_path *path;
2874 struct btrfs_extent_item *ei;
2875 struct btrfs_key key;
2876 u64 flags; 3176 u64 flags;
2877 int ret; 3177 int ret;
2878 3178
@@ -2880,28 +3180,14 @@ static int block_use_full_backref(struct reloc_control *rc,
2880 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) 3180 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2881 return 1; 3181 return 1;
2882 3182
2883 path = btrfs_alloc_path(); 3183 ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
2884 BUG_ON(!path); 3184 eb->start, eb->len, NULL, &flags);
2885
2886 key.objectid = eb->start;
2887 key.type = BTRFS_EXTENT_ITEM_KEY;
2888 key.offset = eb->len;
2889
2890 path->search_commit_root = 1;
2891 path->skip_locking = 1;
2892 ret = btrfs_search_slot(NULL, rc->extent_root,
2893 &key, path, 0, 0);
2894 BUG_ON(ret); 3185 BUG_ON(ret);
2895 3186
2896 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2897 struct btrfs_extent_item);
2898 flags = btrfs_extent_flags(path->nodes[0], ei);
2899 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2900 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) 3187 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2901 ret = 1; 3188 ret = 1;
2902 else 3189 else
2903 ret = 0; 3190 ret = 0;
2904 btrfs_free_path(path);
2905 return ret; 3191 return ret;
2906} 3192}
2907 3193
@@ -3074,22 +3360,10 @@ int add_data_references(struct reloc_control *rc,
3074 struct btrfs_extent_inline_ref *iref; 3360 struct btrfs_extent_inline_ref *iref;
3075 unsigned long ptr; 3361 unsigned long ptr;
3076 unsigned long end; 3362 unsigned long end;
3077 u32 blocksize; 3363 u32 blocksize = btrfs_level_size(rc->extent_root, 0);
3078 int ret; 3364 int ret;
3079 int err = 0; 3365 int err = 0;
3080 3366
3081 ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3082 extent_key->offset);
3083 BUG_ON(ret < 0);
3084 if (ret > 0) {
3085 /* the relocated data is fragmented */
3086 rc->extents_skipped++;
3087 btrfs_release_path(rc->extent_root, path);
3088 return 0;
3089 }
3090
3091 blocksize = btrfs_level_size(rc->extent_root, 0);
3092
3093 eb = path->nodes[0]; 3367 eb = path->nodes[0];
3094 ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 3368 ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3095 end = ptr + btrfs_item_size_nr(eb, path->slots[0]); 3369 end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
@@ -3170,7 +3444,8 @@ int add_data_references(struct reloc_control *rc,
3170 */ 3444 */
3171static noinline_for_stack 3445static noinline_for_stack
3172int find_next_extent(struct btrfs_trans_handle *trans, 3446int find_next_extent(struct btrfs_trans_handle *trans,
3173 struct reloc_control *rc, struct btrfs_path *path) 3447 struct reloc_control *rc, struct btrfs_path *path,
3448 struct btrfs_key *extent_key)
3174{ 3449{
3175 struct btrfs_key key; 3450 struct btrfs_key key;
3176 struct extent_buffer *leaf; 3451 struct extent_buffer *leaf;
@@ -3225,6 +3500,7 @@ next:
3225 rc->search_start = end + 1; 3500 rc->search_start = end + 1;
3226 } else { 3501 } else {
3227 rc->search_start = key.objectid + key.offset; 3502 rc->search_start = key.objectid + key.offset;
3503 memcpy(extent_key, &key, sizeof(key));
3228 return 0; 3504 return 0;
3229 } 3505 }
3230 } 3506 }
@@ -3262,12 +3538,49 @@ static int check_extent_flags(u64 flags)
3262 return 0; 3538 return 0;
3263} 3539}
3264 3540
3541static noinline_for_stack
3542int prepare_to_relocate(struct reloc_control *rc)
3543{
3544 struct btrfs_trans_handle *trans;
3545 int ret;
3546
3547 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
3548 if (!rc->block_rsv)
3549 return -ENOMEM;
3550
3551 /*
3552 * reserve some space for creating reloc trees.
3553 * btrfs_init_reloc_root will use this space when there
3554 * is no reservation in the transaction handle.
3555 */
3556 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
3557 rc->extent_root->nodesize * 256,
3558 &rc->block_rsv_retries);
3559 if (ret)
3560 return ret;
3561
3562 rc->block_rsv->refill_used = 1;
3563 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3564
3565 memset(&rc->cluster, 0, sizeof(rc->cluster));
3566 rc->search_start = rc->block_group->key.objectid;
3567 rc->extents_found = 0;
3568 rc->nodes_relocated = 0;
3569 rc->merging_rsv_size = 0;
3570 rc->block_rsv_retries = 0;
3571
3572 rc->create_reloc_tree = 1;
3573 set_reloc_control(rc);
3574
3575 trans = btrfs_join_transaction(rc->extent_root, 1);
3576 btrfs_commit_transaction(trans, rc->extent_root);
3577 return 0;
3578}
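
prepare_to_relocate() sizes its up-front reservation at nodesize * 256 and threads a retry counter through btrfs_block_rsv_add(). A rough, stand-alone sketch of that bounded-retry reservation; rsv_add() below is a made-up stand-in, not the real API:

#include <errno.h>
#include <stdio.h>

/* made-up stand-in: succeeds once some space has been "flushed" */
static int rsv_add(unsigned long long bytes, int *attempts)
{
	(void)bytes;
	return (*attempts)++ < 2 ? -EAGAIN : 0;
}

int main(void)
{
	unsigned long long nodesize = 4096;
	unsigned long long want = nodesize * 256;	/* as in the patch */
	int attempts = 0;
	int ret;

	do {
		ret = rsv_add(want, &attempts);	/* -EAGAIN: flush and retry */
	} while (ret == -EAGAIN);

	printf("reserved %llu bytes, %d attempts, ret=%d\n",
	       want, attempts, ret);
	return ret;
}
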
3265 3579
3266static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3580static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3267{ 3581{
3268 struct rb_root blocks = RB_ROOT; 3582 struct rb_root blocks = RB_ROOT;
3269 struct btrfs_key key; 3583 struct btrfs_key key;
3270 struct file_extent_cluster *cluster;
3271 struct btrfs_trans_handle *trans = NULL; 3584 struct btrfs_trans_handle *trans = NULL;
3272 struct btrfs_path *path; 3585 struct btrfs_path *path;
3273 struct btrfs_extent_item *ei; 3586 struct btrfs_extent_item *ei;
@@ -3277,33 +3590,25 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3277 int ret; 3590 int ret;
3278 int err = 0; 3591 int err = 0;
3279 3592
3280 cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
3281 if (!cluster)
3282 return -ENOMEM;
3283
3284 path = btrfs_alloc_path(); 3593 path = btrfs_alloc_path();
3285 if (!path) { 3594 if (!path)
3286 kfree(cluster);
3287 return -ENOMEM; 3595 return -ENOMEM;
3288 }
3289
3290 rc->extents_found = 0;
3291 rc->extents_skipped = 0;
3292
3293 rc->search_start = rc->block_group->key.objectid;
3294 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3295 GFP_NOFS);
3296
3297 rc->create_reloc_root = 1;
3298 set_reloc_control(rc);
3299 3596
3300 trans = btrfs_start_transaction(rc->extent_root, 1); 3597 ret = prepare_to_relocate(rc);
3301 btrfs_commit_transaction(trans, rc->extent_root); 3598 if (ret) {
3599 err = ret;
3600 goto out_free;
3601 }
3302 3602
3303 while (1) { 3603 while (1) {
3304 trans = btrfs_start_transaction(rc->extent_root, 1); 3604 trans = btrfs_start_transaction(rc->extent_root, 0);
3605
3606 if (update_backref_cache(trans, &rc->backref_cache)) {
3607 btrfs_end_transaction(trans, rc->extent_root);
3608 continue;
3609 }
3305 3610
3306 ret = find_next_extent(trans, rc, path); 3611 ret = find_next_extent(trans, rc, path, &key);
3307 if (ret < 0) 3612 if (ret < 0)
3308 err = ret; 3613 err = ret;
3309 if (ret != 0) 3614 if (ret != 0)
@@ -3313,9 +3618,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3313 3618
3314 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3619 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3315 struct btrfs_extent_item); 3620 struct btrfs_extent_item);
3316 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 3621 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
3317 item_size = btrfs_item_size_nr(path->nodes[0],
3318 path->slots[0]);
3319 if (item_size >= sizeof(*ei)) { 3622 if (item_size >= sizeof(*ei)) {
3320 flags = btrfs_extent_flags(path->nodes[0], ei); 3623 flags = btrfs_extent_flags(path->nodes[0], ei);
3321 ret = check_extent_flags(flags); 3624 ret = check_extent_flags(flags);
@@ -3356,73 +3659,100 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3356 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3659 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3357 ret = add_tree_block(rc, &key, path, &blocks); 3660 ret = add_tree_block(rc, &key, path, &blocks);
3358 } else if (rc->stage == UPDATE_DATA_PTRS && 3661 } else if (rc->stage == UPDATE_DATA_PTRS &&
3359 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3662 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3360 ret = add_data_references(rc, &key, path, &blocks); 3663 ret = add_data_references(rc, &key, path, &blocks);
3361 } else { 3664 } else {
3362 btrfs_release_path(rc->extent_root, path); 3665 btrfs_release_path(rc->extent_root, path);
3363 ret = 0; 3666 ret = 0;
3364 } 3667 }
3365 if (ret < 0) { 3668 if (ret < 0) {
3366 err = 0; 3669 err = ret;
3367 break; 3670 break;
3368 } 3671 }
3369 3672
3370 if (!RB_EMPTY_ROOT(&blocks)) { 3673 if (!RB_EMPTY_ROOT(&blocks)) {
3371 ret = relocate_tree_blocks(trans, rc, &blocks); 3674 ret = relocate_tree_blocks(trans, rc, &blocks);
3372 if (ret < 0) { 3675 if (ret < 0) {
3676 if (ret != -EAGAIN) {
3677 err = ret;
3678 break;
3679 }
3680 rc->extents_found--;
3681 rc->search_start = key.objectid;
3682 }
3683 }
3684
3685 ret = btrfs_block_rsv_check(trans, rc->extent_root,
3686 rc->block_rsv, 0, 5);
3687 if (ret < 0) {
3688 if (ret != -EAGAIN) {
3373 err = ret; 3689 err = ret;
3690 WARN_ON(1);
3374 break; 3691 break;
3375 } 3692 }
3693 rc->commit_transaction = 1;
3376 } 3694 }
3377 3695
3378 nr = trans->blocks_used; 3696 if (rc->commit_transaction) {
3379 btrfs_end_transaction(trans, rc->extent_root); 3697 rc->commit_transaction = 0;
3698 ret = btrfs_commit_transaction(trans, rc->extent_root);
3699 BUG_ON(ret);
3700 } else {
3701 nr = trans->blocks_used;
3702 btrfs_end_transaction_throttle(trans, rc->extent_root);
3703 btrfs_btree_balance_dirty(rc->extent_root, nr);
3704 }
3380 trans = NULL; 3705 trans = NULL;
3381 btrfs_btree_balance_dirty(rc->extent_root, nr);
3382 3706
3383 if (rc->stage == MOVE_DATA_EXTENTS && 3707 if (rc->stage == MOVE_DATA_EXTENTS &&
3384 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3708 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3385 rc->found_file_extent = 1; 3709 rc->found_file_extent = 1;
3386 ret = relocate_data_extent(rc->data_inode, 3710 ret = relocate_data_extent(rc->data_inode,
3387 &key, cluster); 3711 &key, &rc->cluster);
3388 if (ret < 0) { 3712 if (ret < 0) {
3389 err = ret; 3713 err = ret;
3390 break; 3714 break;
3391 } 3715 }
3392 } 3716 }
3393 } 3717 }
3394 btrfs_free_path(path); 3718
3719 btrfs_release_path(rc->extent_root, path);
3720 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3721 GFP_NOFS);
3395 3722
3396 if (trans) { 3723 if (trans) {
3397 nr = trans->blocks_used; 3724 nr = trans->blocks_used;
3398 btrfs_end_transaction(trans, rc->extent_root); 3725 btrfs_end_transaction_throttle(trans, rc->extent_root);
3399 btrfs_btree_balance_dirty(rc->extent_root, nr); 3726 btrfs_btree_balance_dirty(rc->extent_root, nr);
3400 } 3727 }
3401 3728
3402 if (!err) { 3729 if (!err) {
3403 ret = relocate_file_extent_cluster(rc->data_inode, cluster); 3730 ret = relocate_file_extent_cluster(rc->data_inode,
3731 &rc->cluster);
3404 if (ret < 0) 3732 if (ret < 0)
3405 err = ret; 3733 err = ret;
3406 } 3734 }
3407 3735
3408 kfree(cluster); 3736 rc->create_reloc_tree = 0;
3737 set_reloc_control(rc);
3409 3738
3410 rc->create_reloc_root = 0; 3739 backref_cache_cleanup(&rc->backref_cache);
3411 smp_mb(); 3740 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3412 3741
3413 if (rc->extents_found > 0) { 3742 err = prepare_to_merge(rc, err);
3414 trans = btrfs_start_transaction(rc->extent_root, 1);
3415 btrfs_commit_transaction(trans, rc->extent_root);
3416 }
3417 3743
3418 merge_reloc_roots(rc); 3744 merge_reloc_roots(rc);
3419 3745
3746 rc->merge_reloc_tree = 0;
3420 unset_reloc_control(rc); 3747 unset_reloc_control(rc);
3748 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3421 3749
3422 /* get rid of pinned extents */ 3750 /* get rid of pinned extents */
3423 trans = btrfs_start_transaction(rc->extent_root, 1); 3751 trans = btrfs_join_transaction(rc->extent_root, 1);
3424 btrfs_commit_transaction(trans, rc->extent_root); 3752 btrfs_commit_transaction(trans, rc->extent_root);
3425 3753out_free:
3754 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3755 btrfs_free_path(path);
3426 return err; 3756 return err;
3427} 3757}
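
Two -EAGAIN sources get distinct treatment in the loop above: a failed relocate_tree_blocks() rewinds the search to retry the same extent, while reservation pressure from btrfs_block_rsv_check() merely flags a commit for the next iteration. A compact control-flow sketch with hypothetical stubs:

#include <errno.h>
#include <stdio.h>

static int relocate_one(int i)  { return i == 2 ? -EAGAIN : 0; }
static int rsv_check(int i)     { return i == 4 ? -EAGAIN : 0; }

int main(void)
{
	int commit_wanted = 0;
	int found = 0;
	int i, ret;

	for (i = 0; i < 6; i++) {
		found++;
		ret = relocate_one(i);
		if (ret == -EAGAIN) {
			found--;		/* retry this extent later */
			printf("rewind at %d\n", i);
		} else if (ret < 0) {
			return ret;		/* fatal: bail out */
		}

		if (rsv_check(i) == -EAGAIN)
			commit_wanted = 1;	/* commit instead of throttling */

		if (commit_wanted) {
			commit_wanted = 0;
			printf("commit after %d\n", i);
		}
	}
	printf("found=%d\n", found);
	return 0;
}
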
3428 3758
@@ -3448,7 +3778,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3448 btrfs_set_inode_generation(leaf, item, 1); 3778 btrfs_set_inode_generation(leaf, item, 1);
3449 btrfs_set_inode_size(leaf, item, 0); 3779 btrfs_set_inode_size(leaf, item, 0);
3450 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3780 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3451 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); 3781 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3782 BTRFS_INODE_PREALLOC);
3452 btrfs_mark_buffer_dirty(leaf); 3783 btrfs_mark_buffer_dirty(leaf);
3453 btrfs_release_path(root, path); 3784 btrfs_release_path(root, path);
3454out: 3785out:
@@ -3460,8 +3791,9 @@ out:
3460 * helper to create inode for data relocation. 3791 * helper to create inode for data relocation.
3461 * the inode is in data relocation tree and its link count is 0 3792 * the inode is in data relocation tree and its link count is 0
3462 */ 3793 */
3463static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 3794static noinline_for_stack
3464 struct btrfs_block_group_cache *group) 3795struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3796 struct btrfs_block_group_cache *group)
3465{ 3797{
3466 struct inode *inode = NULL; 3798 struct inode *inode = NULL;
3467 struct btrfs_trans_handle *trans; 3799 struct btrfs_trans_handle *trans;
@@ -3475,8 +3807,9 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3475 if (IS_ERR(root)) 3807 if (IS_ERR(root))
3476 return ERR_CAST(root); 3808 return ERR_CAST(root);
3477 3809
3478 trans = btrfs_start_transaction(root, 1); 3810 trans = btrfs_start_transaction(root, 6);
3479 BUG_ON(!trans); 3811 if (IS_ERR(trans))
3812 return ERR_CAST(trans);
3480 3813
3481 err = btrfs_find_free_objectid(trans, root, objectid, &objectid); 3814 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3482 if (err) 3815 if (err)
@@ -3496,7 +3829,6 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3496out: 3829out:
3497 nr = trans->blocks_used; 3830 nr = trans->blocks_used;
3498 btrfs_end_transaction(trans, root); 3831 btrfs_end_transaction(trans, root);
3499
3500 btrfs_btree_balance_dirty(root, nr); 3832 btrfs_btree_balance_dirty(root, nr);
3501 if (err) { 3833 if (err) {
3502 if (inode) 3834 if (inode)
@@ -3506,6 +3838,21 @@ out:
3506 return inode; 3838 return inode;
3507} 3839}
3508 3840
3841static struct reloc_control *alloc_reloc_control(void)
3842{
3843 struct reloc_control *rc;
3844
3845 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3846 if (!rc)
3847 return NULL;
3848
3849 INIT_LIST_HEAD(&rc->reloc_roots);
3850 backref_cache_init(&rc->backref_cache);
3851 mapping_tree_init(&rc->reloc_root_tree);
3852 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3853 return rc;
3854}
3855
3509/* 3856/*
3510 * function to relocate all extents in a block group. 3857 * function to relocate all extents in a block group.
3511 */ 3858 */
@@ -3514,24 +3861,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3514 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3861 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3515 struct reloc_control *rc; 3862 struct reloc_control *rc;
3516 int ret; 3863 int ret;
3864 int rw = 0;
3517 int err = 0; 3865 int err = 0;
3518 3866
3519 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3867 rc = alloc_reloc_control();
3520 if (!rc) 3868 if (!rc)
3521 return -ENOMEM; 3869 return -ENOMEM;
3522 3870
3523 mapping_tree_init(&rc->reloc_root_tree); 3871 rc->extent_root = extent_root;
3524 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3525 INIT_LIST_HEAD(&rc->reloc_roots);
3526 3872
3527 rc->block_group = btrfs_lookup_block_group(fs_info, group_start); 3873 rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3528 BUG_ON(!rc->block_group); 3874 BUG_ON(!rc->block_group);
3529 3875
3530 btrfs_init_workers(&rc->workers, "relocate", 3876 if (!rc->block_group->ro) {
3531 fs_info->thread_pool_size, NULL); 3877 ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
3532 3878 if (ret) {
3533 rc->extent_root = extent_root; 3879 err = ret;
3534 btrfs_prepare_block_group_relocation(extent_root, rc->block_group); 3880 goto out;
3881 }
3882 rw = 1;
3883 }
3535 3884
3536 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 3885 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3537 if (IS_ERR(rc->data_inode)) { 3886 if (IS_ERR(rc->data_inode)) {
@@ -3548,9 +3897,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3548 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); 3897 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
3549 3898
3550 while (1) { 3899 while (1) {
3551 rc->extents_found = 0;
3552 rc->extents_skipped = 0;
3553
3554 mutex_lock(&fs_info->cleaner_mutex); 3900 mutex_lock(&fs_info->cleaner_mutex);
3555 3901
3556 btrfs_clean_old_snapshots(fs_info->tree_root); 3902 btrfs_clean_old_snapshots(fs_info->tree_root);
@@ -3559,7 +3905,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3559 mutex_unlock(&fs_info->cleaner_mutex); 3905 mutex_unlock(&fs_info->cleaner_mutex);
3560 if (ret < 0) { 3906 if (ret < 0) {
3561 err = ret; 3907 err = ret;
3562 break; 3908 goto out;
3563 } 3909 }
3564 3910
3565 if (rc->extents_found == 0) 3911 if (rc->extents_found == 0)
@@ -3573,18 +3919,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3573 invalidate_mapping_pages(rc->data_inode->i_mapping, 3919 invalidate_mapping_pages(rc->data_inode->i_mapping,
3574 0, -1); 3920 0, -1);
3575 rc->stage = UPDATE_DATA_PTRS; 3921 rc->stage = UPDATE_DATA_PTRS;
3576 } else if (rc->stage == UPDATE_DATA_PTRS &&
3577 rc->extents_skipped >= rc->extents_found) {
3578 iput(rc->data_inode);
3579 rc->data_inode = create_reloc_inode(fs_info,
3580 rc->block_group);
3581 if (IS_ERR(rc->data_inode)) {
3582 err = PTR_ERR(rc->data_inode);
3583 rc->data_inode = NULL;
3584 break;
3585 }
3586 rc->stage = MOVE_DATA_EXTENTS;
3587 rc->found_file_extent = 0;
3588 } 3922 }
3589 } 3923 }
3590 3924
@@ -3597,8 +3931,9 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3597 WARN_ON(rc->block_group->reserved > 0); 3931 WARN_ON(rc->block_group->reserved > 0);
3598 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); 3932 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3599out: 3933out:
3934 if (err && rw)
3935 btrfs_set_block_group_rw(extent_root, rc->block_group);
3600 iput(rc->data_inode); 3936 iput(rc->data_inode);
3601 btrfs_stop_workers(&rc->workers);
3602 btrfs_put_block_group(rc->block_group); 3937 btrfs_put_block_group(rc->block_group);
3603 kfree(rc); 3938 kfree(rc);
3604 return err; 3939 return err;
@@ -3609,7 +3944,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3609 struct btrfs_trans_handle *trans; 3944 struct btrfs_trans_handle *trans;
3610 int ret; 3945 int ret;
3611 3946
3612 trans = btrfs_start_transaction(root->fs_info->tree_root, 1); 3947 trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
3613 3948
3614 memset(&root->root_item.drop_progress, 0, 3949 memset(&root->root_item.drop_progress, 0,
3615 sizeof(root->root_item.drop_progress)); 3950 sizeof(root->root_item.drop_progress));
@@ -3702,20 +4037,20 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3702 if (list_empty(&reloc_roots)) 4037 if (list_empty(&reloc_roots))
3703 goto out; 4038 goto out;
3704 4039
3705 rc = kzalloc(sizeof(*rc), GFP_NOFS); 4040 rc = alloc_reloc_control();
3706 if (!rc) { 4041 if (!rc) {
3707 err = -ENOMEM; 4042 err = -ENOMEM;
3708 goto out; 4043 goto out;
3709 } 4044 }
3710 4045
3711 mapping_tree_init(&rc->reloc_root_tree);
3712 INIT_LIST_HEAD(&rc->reloc_roots);
3713 btrfs_init_workers(&rc->workers, "relocate",
3714 root->fs_info->thread_pool_size, NULL);
3715 rc->extent_root = root->fs_info->extent_root; 4046 rc->extent_root = root->fs_info->extent_root;
3716 4047
3717 set_reloc_control(rc); 4048 set_reloc_control(rc);
3718 4049
4050 trans = btrfs_join_transaction(rc->extent_root, 1);
4051
4052 rc->merge_reloc_tree = 1;
4053
3719 while (!list_empty(&reloc_roots)) { 4054 while (!list_empty(&reloc_roots)) {
3720 reloc_root = list_entry(reloc_roots.next, 4055 reloc_root = list_entry(reloc_roots.next,
3721 struct btrfs_root, root_list); 4056 struct btrfs_root, root_list);
@@ -3735,20 +4070,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3735 fs_root->reloc_root = reloc_root; 4070 fs_root->reloc_root = reloc_root;
3736 } 4071 }
3737 4072
3738 trans = btrfs_start_transaction(rc->extent_root, 1);
3739 btrfs_commit_transaction(trans, rc->extent_root); 4073 btrfs_commit_transaction(trans, rc->extent_root);
3740 4074
3741 merge_reloc_roots(rc); 4075 merge_reloc_roots(rc);
3742 4076
3743 unset_reloc_control(rc); 4077 unset_reloc_control(rc);
3744 4078
3745 trans = btrfs_start_transaction(rc->extent_root, 1); 4079 trans = btrfs_join_transaction(rc->extent_root, 1);
3746 btrfs_commit_transaction(trans, rc->extent_root); 4080 btrfs_commit_transaction(trans, rc->extent_root);
3747out: 4081out:
3748 if (rc) { 4082 kfree(rc);
3749 btrfs_stop_workers(&rc->workers);
3750 kfree(rc);
3751 }
3752 while (!list_empty(&reloc_roots)) { 4083 while (!list_empty(&reloc_roots)) {
3753 reloc_root = list_entry(reloc_roots.next, 4084 reloc_root = list_entry(reloc_roots.next,
3754 struct btrfs_root, root_list); 4085 struct btrfs_root, root_list);
@@ -3814,3 +4145,130 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3814 btrfs_put_ordered_extent(ordered); 4145 btrfs_put_ordered_extent(ordered);
3815 return 0; 4146 return 0;
3816} 4147}
4148
4149void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4150 struct btrfs_root *root, struct extent_buffer *buf,
4151 struct extent_buffer *cow)
4152{
4153 struct reloc_control *rc;
4154 struct backref_node *node;
4155 int first_cow = 0;
4156 int level;
4157 int ret;
4158
4159 rc = root->fs_info->reloc_ctl;
4160 if (!rc)
4161 return;
4162
4163 BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4164 root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4165
4166 level = btrfs_header_level(buf);
4167 if (btrfs_header_generation(buf) <=
4168 btrfs_root_last_snapshot(&root->root_item))
4169 first_cow = 1;
4170
4171 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4172 rc->create_reloc_tree) {
4173 WARN_ON(!first_cow && level == 0);
4174
4175 node = rc->backref_cache.path[level];
4176 BUG_ON(node->bytenr != buf->start &&
4177 node->new_bytenr != buf->start);
4178
4179 drop_node_buffer(node);
4180 extent_buffer_get(cow);
4181 node->eb = cow;
4182 node->new_bytenr = cow->start;
4183
4184 if (!node->pending) {
4185 list_move_tail(&node->list,
4186 &rc->backref_cache.pending[level]);
4187 node->pending = 1;
4188 }
4189
4190 if (first_cow)
4191 __mark_block_processed(rc, node);
4192
4193 if (first_cow && level > 0)
4194 rc->nodes_relocated += buf->len;
4195 }
4196
4197 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
4198 ret = replace_file_extents(trans, rc, root, cow);
4199 BUG_ON(ret);
4200 }
4201}
4202
4203/*
4204 * called before creating snapshot. it calculates metadata reservation
4205 * required for relocating tree blocks in the snapshot
4206 */
4207void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
4208 struct btrfs_pending_snapshot *pending,
4209 u64 *bytes_to_reserve)
4210{
4211 struct btrfs_root *root;
4212 struct reloc_control *rc;
4213
4214 root = pending->root;
4215 if (!root->reloc_root)
4216 return;
4217
4218 rc = root->fs_info->reloc_ctl;
4219 if (!rc->merge_reloc_tree)
4220 return;
4221
4222 root = root->reloc_root;
4223 BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4224 /*
4225 * relocation is in the stage of merging trees. the space
4226 * used by merging a reloc tree is twice the size of
4227 * relocated tree nodes in the worst case. half for cowing
4228 * the reloc tree, half for cowing the fs tree. the space
4229 * used by cowing the reloc tree will be freed after the
4230 * tree is dropped. if we create snapshot, cowing the fs
4231 * tree may use more space than it frees. so we need
4232 * to reserve extra space.
4233 */
4234 *bytes_to_reserve += rc->nodes_relocated;
4235}
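
Concretely, the comment's worst case works out to exactly rc->nodes_relocated extra bytes: merging costs about twice the relocated node bytes, the reloc-tree half comes back when that tree is dropped, and a snapshot can consume the fs-tree half without freeing it. A worked one-liner (the sizes are made up):

#include <stdio.h>

int main(void)
{
	unsigned long long nodes_relocated = 8ULL << 20;	/* say, 8 MiB */
	unsigned long long merge_cost  = 2 * nodes_relocated;	/* both halves */
	unsigned long long freed_later = nodes_relocated;	/* reloc-tree half */
	unsigned long long extra = merge_cost - freed_later;	/* fs-tree half */

	/* matches: *bytes_to_reserve += rc->nodes_relocated; */
	printf("extra reservation: %llu bytes\n", extra);
	return 0;
}
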
4236
4237/*
4238 * called after snapshot is created. migrate block reservation
4239 * and create reloc root for the newly created snapshot
4240 */
4241void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4242 struct btrfs_pending_snapshot *pending)
4243{
4244 struct btrfs_root *root = pending->root;
4245 struct btrfs_root *reloc_root;
4246 struct btrfs_root *new_root;
4247 struct reloc_control *rc;
4248 int ret;
4249
4250 if (!root->reloc_root)
4251 return;
4252
4253 rc = root->fs_info->reloc_ctl;
4254 rc->merging_rsv_size += rc->nodes_relocated;
4255
4256 if (rc->merge_reloc_tree) {
4257 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4258 rc->block_rsv,
4259 rc->nodes_relocated);
4260 BUG_ON(ret);
4261 }
4262
4263 new_root = pending->snap;
4264 reloc_root = create_reloc_root(trans, root->reloc_root,
4265 new_root->root_key.objectid);
4266
4267 __add_reloc_root(reloc_root);
4268 new_root->reloc_root = reloc_root;
4269
4270 if (rc->create_reloc_tree) {
4271 ret = clone_backref_node(trans, rc, root, reloc_root);
4272 BUG_ON(ret);
4273 }
4274}
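
btrfs_reloc_post_snapshot() then moves those same bytes from the snapshot's reservation into the relocation reservation. A toy sketch under the simplifying assumption that a reservation is a bare byte counter (the real btrfs_block_rsv carries much more state):

#include <errno.h>
#include <stdio.h>

struct rsv { unsigned long long reserved; };

static int rsv_migrate(struct rsv *src, struct rsv *dst,
		       unsigned long long bytes)
{
	if (src->reserved < bytes)
		return -ENOSPC;		/* nothing silently partial */
	src->reserved -= bytes;
	dst->reserved += bytes;
	return 0;
}

int main(void)
{
	struct rsv pending = { .reserved = 16 << 20 };
	struct rsv reloc   = { .reserved = 0 };
	unsigned long long nodes_relocated = 8 << 20;

	int ret = rsv_migrate(&pending, &reloc, nodes_relocated);
	printf("ret=%d pending=%llu reloc=%llu\n",
	       ret, pending.reserved, reloc.reserved);
	return 0;
}
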
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 67fa2d29d663..2d958be761c8 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -259,6 +259,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
259 struct extent_buffer *leaf; 259 struct extent_buffer *leaf;
260 struct btrfs_path *path; 260 struct btrfs_path *path;
261 struct btrfs_key key; 261 struct btrfs_key key;
262 struct btrfs_key root_key;
263 struct btrfs_root *root;
262 int err = 0; 264 int err = 0;
263 int ret; 265 int ret;
264 266
@@ -270,6 +272,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
270 key.type = BTRFS_ORPHAN_ITEM_KEY; 272 key.type = BTRFS_ORPHAN_ITEM_KEY;
271 key.offset = 0; 273 key.offset = 0;
272 274
275 root_key.type = BTRFS_ROOT_ITEM_KEY;
276 root_key.offset = (u64)-1;
277
273 while (1) { 278 while (1) {
274 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); 279 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
275 if (ret < 0) { 280 if (ret < 0) {
@@ -294,13 +299,25 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
294 key.type != BTRFS_ORPHAN_ITEM_KEY) 299 key.type != BTRFS_ORPHAN_ITEM_KEY)
295 break; 300 break;
296 301
297 ret = btrfs_find_dead_roots(tree_root, key.offset); 302 root_key.objectid = key.offset;
298 if (ret) { 303 key.offset++;
304
305 root = btrfs_read_fs_root_no_name(tree_root->fs_info,
306 &root_key);
307 if (!IS_ERR(root))
308 continue;
309
310 ret = PTR_ERR(root);
311 if (ret != -ENOENT) {
299 err = ret; 312 err = ret;
300 break; 313 break;
301 } 314 }
302 315
303 key.offset++; 316 ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
317 if (ret) {
318 err = ret;
319 break;
320 }
304 } 321 }
305 322
306 btrfs_free_path(path); 323 btrfs_free_path(path);
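
The rewritten loop above no longer assumes every orphan item names a dead root: it first tries to read the root, skips roots that still exist, treats -ENOENT as genuinely dead, and propagates any other error. A stand-alone sketch of that classification, with read_root() as a hypothetical stand-in:

#include <errno.h>
#include <stdio.h>

/* hypothetical stand-in: returns 0 if the root still exists */
static int read_root(unsigned long long id)
{
	if (id == 7)
		return -ENOENT;		/* genuinely dead */
	if (id == 9)
		return -EIO;		/* real failure */
	return 0;			/* alive: orphan item is stale */
}

int main(void)
{
	unsigned long long ids[] = { 5, 7, 9 };
	int err = 0;

	for (int i = 0; i < 3; i++) {
		int ret = read_root(ids[i]);
		if (ret == 0)
			continue;		/* still referenced, skip */
		if (ret != -ENOENT) {
			err = ret;		/* unexpected: stop */
			break;
		}
		printf("cleaning dead root %llu\n", ids[i]);
	}
	return err;
}
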
@@ -313,7 +330,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
313{ 330{
314 struct btrfs_path *path; 331 struct btrfs_path *path;
315 int ret; 332 int ret;
316 u32 refs;
317 struct btrfs_root_item *ri; 333 struct btrfs_root_item *ri;
318 struct extent_buffer *leaf; 334 struct extent_buffer *leaf;
319 335
@@ -327,8 +343,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
327 leaf = path->nodes[0]; 343 leaf = path->nodes[0];
328 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); 344 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
329 345
330 refs = btrfs_disk_root_refs(leaf, ri);
331 BUG_ON(refs != 0);
332 ret = btrfs_del_item(trans, root, path); 346 ret = btrfs_del_item(trans, root, path);
333out: 347out:
334 btrfs_free_path(path); 348 btrfs_free_path(path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2909a03e5230..f2393b390318 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -360,6 +360,8 @@ static struct dentry *get_default_root(struct super_block *sb,
360 */ 360 */
361 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 361 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
362 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 362 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
363 if (IS_ERR(di))
364 return ERR_CAST(di);
363 if (!di) { 365 if (!di) {
364 /* 366 /*
365 * Ok the default dir item isn't there. This is weird since 367 * Ok the default dir item isn't there. This is weird since
@@ -390,8 +392,8 @@ setup_root:
390 location.offset = 0; 392 location.offset = 0;
391 393
392 inode = btrfs_iget(sb, &location, new_root, &new); 394 inode = btrfs_iget(sb, &location, new_root, &new);
393 if (!inode) 395 if (IS_ERR(inode))
394 return ERR_PTR(-ENOMEM); 396 return ERR_CAST(inode);
395 397
396 /* 398 /*
397 * If we're just mounting the root most subvol put the inode and return 399 * If we're just mounting the root most subvol put the inode and return
@@ -498,7 +500,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
498 btrfs_start_delalloc_inodes(root, 0); 500 btrfs_start_delalloc_inodes(root, 0);
499 btrfs_wait_ordered_extents(root, 0, 0); 501 btrfs_wait_ordered_extents(root, 0, 0);
500 502
501 trans = btrfs_start_transaction(root, 1); 503 trans = btrfs_start_transaction(root, 0);
502 ret = btrfs_commit_transaction(trans, root); 504 ret = btrfs_commit_transaction(trans, root);
503 return ret; 505 return ret;
504} 506}
@@ -694,11 +696,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
694 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 696 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
695 return -EINVAL; 697 return -EINVAL;
696 698
697 /* recover relocation */ 699 ret = btrfs_cleanup_fs_roots(root->fs_info);
698 ret = btrfs_recover_relocation(root);
699 WARN_ON(ret); 700 WARN_ON(ret);
700 701
701 ret = btrfs_cleanup_fs_roots(root->fs_info); 702 /* recover relocation */
703 ret = btrfs_recover_relocation(root);
702 WARN_ON(ret); 704 WARN_ON(ret);
703 705
704 sb->s_flags &= ~MS_RDONLY; 706 sb->s_flags &= ~MS_RDONLY;
@@ -714,34 +716,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
714 struct list_head *head = &root->fs_info->space_info; 716 struct list_head *head = &root->fs_info->space_info;
715 struct btrfs_space_info *found; 717 struct btrfs_space_info *found;
716 u64 total_used = 0; 718 u64 total_used = 0;
717 u64 data_used = 0;
718 int bits = dentry->d_sb->s_blocksize_bits; 719 int bits = dentry->d_sb->s_blocksize_bits;
719 __be32 *fsid = (__be32 *)root->fs_info->fsid; 720 __be32 *fsid = (__be32 *)root->fs_info->fsid;
720 721
721 rcu_read_lock(); 722 rcu_read_lock();
722 list_for_each_entry_rcu(found, head, list) { 723 list_for_each_entry_rcu(found, head, list)
723 if (found->flags & (BTRFS_BLOCK_GROUP_DUP| 724 total_used += found->disk_used;
724 BTRFS_BLOCK_GROUP_RAID10|
725 BTRFS_BLOCK_GROUP_RAID1)) {
726 total_used += found->bytes_used;
727 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
728 data_used += found->bytes_used;
729 else
730 data_used += found->total_bytes;
731 }
732
733 total_used += found->bytes_used;
734 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
735 data_used += found->bytes_used;
736 else
737 data_used += found->total_bytes;
738 }
739 rcu_read_unlock(); 725 rcu_read_unlock();
740 726
741 buf->f_namelen = BTRFS_NAME_LEN; 727 buf->f_namelen = BTRFS_NAME_LEN;
742 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 728 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
743 buf->f_bfree = buf->f_blocks - (total_used >> bits); 729 buf->f_bfree = buf->f_blocks - (total_used >> bits);
744 buf->f_bavail = buf->f_blocks - (data_used >> bits); 730 buf->f_bavail = buf->f_bfree;
745 buf->f_bsize = dentry->d_sb->s_blocksize; 731 buf->f_bsize = dentry->d_sb->s_blocksize;
746 buf->f_type = BTRFS_SUPER_MAGIC; 732 buf->f_type = BTRFS_SUPER_MAGIC;
747 733
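
With per-profile accounting folded into each space_info's disk_used, statfs reduces to a single sum, and free and available become the same number. A minimal model of the arithmetic (device size and usage figures are invented):

#include <stdio.h>

int main(void)
{
	unsigned long long total_bytes = 100ULL << 30;	/* 100 GiB device */
	unsigned long long disk_used[] = { 30ULL << 30, 2ULL << 30 };
	int bits = 12;					/* 4 KiB blocks */

	unsigned long long total_used = 0;
	for (int i = 0; i < 2; i++)
		total_used += disk_used[i];

	unsigned long long f_blocks = total_bytes >> bits;
	unsigned long long f_bfree  = f_blocks - (total_used >> bits);
	unsigned long long f_bavail = f_bfree;	/* as in the patch */

	printf("blocks=%llu bfree=%llu bavail=%llu\n",
	       f_blocks, f_bfree, f_bavail);
	return 0;
}
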
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2cb116099b90..66e4c66cc63b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -165,54 +165,89 @@ enum btrfs_trans_type {
165 TRANS_USERSPACE, 165 TRANS_USERSPACE,
166}; 166};
167 167
168static int may_wait_transaction(struct btrfs_root *root, int type)
169{
170 if (!root->fs_info->log_root_recovering &&
171 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
172 type == TRANS_USERSPACE))
173 return 1;
174 return 0;
175}
176
168static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 177static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
169 int num_blocks, int type) 178 u64 num_items, int type)
170{ 179{
171 struct btrfs_trans_handle *h = 180 struct btrfs_trans_handle *h;
172 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 181 struct btrfs_transaction *cur_trans;
182 int retries = 0;
173 int ret; 183 int ret;
184again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h)
187 return ERR_PTR(-ENOMEM);
174 188
175 mutex_lock(&root->fs_info->trans_mutex); 189 mutex_lock(&root->fs_info->trans_mutex);
176 if (!root->fs_info->log_root_recovering && 190 if (may_wait_transaction(root, type))
177 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
178 type == TRANS_USERSPACE))
179 wait_current_trans(root); 191 wait_current_trans(root);
192
180 ret = join_transaction(root); 193 ret = join_transaction(root);
181 BUG_ON(ret); 194 BUG_ON(ret);
182 195
183 h->transid = root->fs_info->running_transaction->transid; 196 cur_trans = root->fs_info->running_transaction;
184 h->transaction = root->fs_info->running_transaction; 197 cur_trans->use_count++;
185 h->blocks_reserved = num_blocks; 198 mutex_unlock(&root->fs_info->trans_mutex);
199
200 h->transid = cur_trans->transid;
201 h->transaction = cur_trans;
186 h->blocks_used = 0; 202 h->blocks_used = 0;
187 h->block_group = 0; 203 h->block_group = 0;
188 h->alloc_exclude_nr = 0; 204 h->bytes_reserved = 0;
189 h->alloc_exclude_start = 0;
190 h->delayed_ref_updates = 0; 205 h->delayed_ref_updates = 0;
206 h->block_rsv = NULL;
191 207
192 if (!current->journal_info && type != TRANS_USERSPACE) 208 smp_mb();
193 current->journal_info = h; 209 if (cur_trans->blocked && may_wait_transaction(root, type)) {
210 btrfs_commit_transaction(h, root);
211 goto again;
212 }
213
214 if (num_items > 0) {
215 ret = btrfs_trans_reserve_metadata(h, root, num_items,
216 &retries);
217 if (ret == -EAGAIN) {
218 btrfs_commit_transaction(h, root);
219 goto again;
220 }
221 if (ret < 0) {
222 btrfs_end_transaction(h, root);
223 return ERR_PTR(ret);
224 }
225 }
194 226
195 root->fs_info->running_transaction->use_count++; 227 mutex_lock(&root->fs_info->trans_mutex);
196 record_root_in_trans(h, root); 228 record_root_in_trans(h, root);
197 mutex_unlock(&root->fs_info->trans_mutex); 229 mutex_unlock(&root->fs_info->trans_mutex);
230
231 if (!current->journal_info && type != TRANS_USERSPACE)
232 current->journal_info = h;
198 return h; 233 return h;
199} 234}
200 235
201struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 236struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
202 int num_blocks) 237 int num_items)
203{ 238{
204 return start_transaction(root, num_blocks, TRANS_START); 239 return start_transaction(root, num_items, TRANS_START);
205} 240}
206struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 241struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
207 int num_blocks) 242 int num_blocks)
208{ 243{
209 return start_transaction(root, num_blocks, TRANS_JOIN); 244 return start_transaction(root, 0, TRANS_JOIN);
210} 245}
211 246
212struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 247struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
213 int num_blocks) 248 int num_blocks)
214{ 249{
215 return start_transaction(r, num_blocks, TRANS_USERSPACE); 250 return start_transaction(r, 0, TRANS_USERSPACE);
216} 251}
217 252
218/* wait for a transaction commit to be fully complete */ 253/* wait for a transaction commit to be fully complete */
@@ -286,10 +321,36 @@ void btrfs_throttle(struct btrfs_root *root)
286 mutex_unlock(&root->fs_info->trans_mutex); 321 mutex_unlock(&root->fs_info->trans_mutex);
287} 322}
288 323
324static int should_end_transaction(struct btrfs_trans_handle *trans,
325 struct btrfs_root *root)
326{
327 int ret;
328 ret = btrfs_block_rsv_check(trans, root,
329 &root->fs_info->global_block_rsv, 0, 5);
330 return ret ? 1 : 0;
331}
332
333int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
334 struct btrfs_root *root)
335{
336 struct btrfs_transaction *cur_trans = trans->transaction;
337 int updates;
338
339 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
340 return 1;
341
342 updates = trans->delayed_ref_updates;
343 trans->delayed_ref_updates = 0;
344 if (updates)
345 btrfs_run_delayed_refs(trans, root, updates);
346
347 return should_end_transaction(trans, root);
348}
349
289static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 350static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
290 struct btrfs_root *root, int throttle) 351 struct btrfs_root *root, int throttle)
291{ 352{
292 struct btrfs_transaction *cur_trans; 353 struct btrfs_transaction *cur_trans = trans->transaction;
293 struct btrfs_fs_info *info = root->fs_info; 354 struct btrfs_fs_info *info = root->fs_info;
294 int count = 0; 355 int count = 0;
295 356
@@ -313,9 +374,21 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
313 count++; 374 count++;
314 } 375 }
315 376
377 btrfs_trans_release_metadata(trans, root);
378
379 if (!root->fs_info->open_ioctl_trans &&
380 should_end_transaction(trans, root))
381 trans->transaction->blocked = 1;
382
383 if (cur_trans->blocked && !cur_trans->in_commit) {
384 if (throttle)
385 return btrfs_commit_transaction(trans, root);
386 else
387 wake_up_process(info->transaction_kthread);
388 }
389
316 mutex_lock(&info->trans_mutex); 390 mutex_lock(&info->trans_mutex);
317 cur_trans = info->running_transaction; 391 WARN_ON(cur_trans != info->running_transaction);
318 WARN_ON(cur_trans != trans->transaction);
319 WARN_ON(cur_trans->num_writers < 1); 392 WARN_ON(cur_trans->num_writers < 1);
320 cur_trans->num_writers--; 393 cur_trans->num_writers--;
321 394
@@ -603,6 +676,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
603 676
604 btrfs_free_log(trans, root); 677 btrfs_free_log(trans, root);
605 btrfs_update_reloc_root(trans, root); 678 btrfs_update_reloc_root(trans, root);
679 btrfs_orphan_commit_root(trans, root);
606 680
607 if (root->commit_root != root->node) { 681 if (root->commit_root != root->node) {
608 switch_commit_root(root); 682 switch_commit_root(root);
@@ -627,30 +701,30 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
627int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) 701int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
628{ 702{
629 struct btrfs_fs_info *info = root->fs_info; 703 struct btrfs_fs_info *info = root->fs_info;
630 int ret;
631 struct btrfs_trans_handle *trans; 704 struct btrfs_trans_handle *trans;
705 int ret;
632 unsigned long nr; 706 unsigned long nr;
633 707
634 smp_mb(); 708 if (xchg(&root->defrag_running, 1))
635 if (root->defrag_running)
636 return 0; 709 return 0;
637 trans = btrfs_start_transaction(root, 1); 710
638 while (1) { 711 while (1) {
639 root->defrag_running = 1; 712 trans = btrfs_start_transaction(root, 0);
713 if (IS_ERR(trans))
714 return PTR_ERR(trans);
715
640 ret = btrfs_defrag_leaves(trans, root, cacheonly); 716 ret = btrfs_defrag_leaves(trans, root, cacheonly);
717
641 nr = trans->blocks_used; 718 nr = trans->blocks_used;
642 btrfs_end_transaction(trans, root); 719 btrfs_end_transaction(trans, root);
643 btrfs_btree_balance_dirty(info->tree_root, nr); 720 btrfs_btree_balance_dirty(info->tree_root, nr);
644 cond_resched(); 721 cond_resched();
645 722
646 trans = btrfs_start_transaction(root, 1);
647 if (root->fs_info->closing || ret != -EAGAIN) 723 if (root->fs_info->closing || ret != -EAGAIN)
648 break; 724 break;
649 } 725 }
650 root->defrag_running = 0; 726 root->defrag_running = 0;
651 smp_mb(); 727 return ret;
652 btrfs_end_transaction(trans, root);
653 return 0;
654} 728}
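
The xchg() above turns the old racy check-then-set on defrag_running into an atomic test-and-set, so at most one defragmenter enters. A user-space sketch of the same guard, using C11 atomics as a stand-in for the kernel's xchg():

#include <stdatomic.h>
#include <stdio.h>

static atomic_int defrag_running;

static int defrag(void)
{
	/* atomically set the flag; bail if it was already set */
	if (atomic_exchange(&defrag_running, 1))
		return 0;

	puts("defragging...");	/* the loop of short transactions goes here */

	atomic_store(&defrag_running, 0);
	return 0;
}

int main(void)
{
	defrag();			/* runs */
	atomic_store(&defrag_running, 1);
	defrag();			/* skipped: another defrag is in flight */
	return 0;
}
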
655 729
656#if 0 730#if 0
@@ -758,47 +832,63 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
758 struct btrfs_root *root = pending->root; 832 struct btrfs_root *root = pending->root;
759 struct btrfs_root *parent_root; 833 struct btrfs_root *parent_root;
760 struct inode *parent_inode; 834 struct inode *parent_inode;
835 struct dentry *dentry;
761 struct extent_buffer *tmp; 836 struct extent_buffer *tmp;
762 struct extent_buffer *old; 837 struct extent_buffer *old;
763 int ret; 838 int ret;
764 u64 objectid; 839 int retries = 0;
765 int namelen; 840 u64 to_reserve = 0;
766 u64 index = 0; 841 u64 index = 0;
767 842 u64 objectid;
768 parent_inode = pending->dentry->d_parent->d_inode;
769 parent_root = BTRFS_I(parent_inode)->root;
770 843
771 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 844 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
772 if (!new_root_item) { 845 if (!new_root_item) {
773 ret = -ENOMEM; 846 pending->error = -ENOMEM;
774 goto fail; 847 goto fail;
775 } 848 }
849
776 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); 850 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
777 if (ret) 851 if (ret) {
852 pending->error = ret;
778 goto fail; 853 goto fail;
854 }
855
856 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
857 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
858
859 if (to_reserve > 0) {
860 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
861 to_reserve, &retries);
862 if (ret) {
863 pending->error = ret;
864 goto fail;
865 }
866 }
779 867
780 key.objectid = objectid; 868 key.objectid = objectid;
781 /* record when the snapshot was created in key.offset */ 869 key.offset = (u64)-1;
782 key.offset = trans->transid; 870 key.type = BTRFS_ROOT_ITEM_KEY;
783 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
784 871
785 memcpy(&pending->root_key, &key, sizeof(key)); 872 trans->block_rsv = &pending->block_rsv;
786 pending->root_key.offset = (u64)-1;
787 873
874 dentry = pending->dentry;
875 parent_inode = dentry->d_parent->d_inode;
876 parent_root = BTRFS_I(parent_inode)->root;
788 record_root_in_trans(trans, parent_root); 877 record_root_in_trans(trans, parent_root);
878
789 /* 879 /*
790 * insert the directory item 880 * insert the directory item
791 */ 881 */
792 namelen = strlen(pending->name);
793 ret = btrfs_set_inode_index(parent_inode, &index); 882 ret = btrfs_set_inode_index(parent_inode, &index);
794 BUG_ON(ret); 883 BUG_ON(ret);
795 ret = btrfs_insert_dir_item(trans, parent_root, 884 ret = btrfs_insert_dir_item(trans, parent_root,
796 pending->name, namelen, 885 dentry->d_name.name, dentry->d_name.len,
797 parent_inode->i_ino, 886 parent_inode->i_ino, &key,
798 &pending->root_key, BTRFS_FT_DIR, index); 887 BTRFS_FT_DIR, index);
799 BUG_ON(ret); 888 BUG_ON(ret);
800 889
801 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); 890 btrfs_i_size_write(parent_inode, parent_inode->i_size +
891 dentry->d_name.len * 2);
802 ret = btrfs_update_inode(trans, parent_root, parent_inode); 892 ret = btrfs_update_inode(trans, parent_root, parent_inode);
803 BUG_ON(ret); 893 BUG_ON(ret);
804 894
@@ -815,22 +905,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
815 free_extent_buffer(old); 905 free_extent_buffer(old);
816 906
817 btrfs_set_root_node(new_root_item, tmp); 907 btrfs_set_root_node(new_root_item, tmp);
818 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 908 /* record when the snapshot was created in key.offset */
819 new_root_item); 909 key.offset = trans->transid;
820 BUG_ON(ret); 910 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
821 btrfs_tree_unlock(tmp); 911 btrfs_tree_unlock(tmp);
822 free_extent_buffer(tmp); 912 free_extent_buffer(tmp);
913 BUG_ON(ret);
823 914
824 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, 915 /*
825 pending->root_key.objectid, 916 * insert root back/forward references
917 */
918 ret = btrfs_add_root_ref(trans, tree_root, objectid,
826 parent_root->root_key.objectid, 919 parent_root->root_key.objectid,
827 parent_inode->i_ino, index, pending->name, 920 parent_inode->i_ino, index,
828 namelen); 921 dentry->d_name.name, dentry->d_name.len);
829 BUG_ON(ret); 922 BUG_ON(ret);
830 923
924 key.offset = (u64)-1;
925 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
926 BUG_ON(IS_ERR(pending->snap));
927
928 btrfs_reloc_post_snapshot(trans, pending);
929 btrfs_orphan_post_snapshot(trans, pending);
831fail: 930fail:
832 kfree(new_root_item); 931 kfree(new_root_item);
833 return ret; 932 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
933 return 0;
834} 934}
835 935
836/* 936/*
@@ -878,6 +978,16 @@ int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
878 return ret; 978 return ret;
879} 979}
880 980
981int btrfs_transaction_blocked(struct btrfs_fs_info *info)
982{
983 int ret = 0;
984 spin_lock(&info->new_trans_lock);
985 if (info->running_transaction)
986 ret = info->running_transaction->blocked;
987 spin_unlock(&info->new_trans_lock);
988 return ret;
989}
990
881int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 991int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
882 struct btrfs_root *root) 992 struct btrfs_root *root)
883{ 993{
@@ -899,6 +1009,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
899 ret = btrfs_run_delayed_refs(trans, root, 0); 1009 ret = btrfs_run_delayed_refs(trans, root, 0);
900 BUG_ON(ret); 1010 BUG_ON(ret);
901 1011
1012 btrfs_trans_release_metadata(trans, root);
1013
902 cur_trans = trans->transaction; 1014 cur_trans = trans->transaction;
903 /* 1015 /*
904 * set the flushing flag so procs in this transaction have to 1016 * set the flushing flag so procs in this transaction have to
@@ -951,9 +1063,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
951 snap_pending = 1; 1063 snap_pending = 1;
952 1064
953 WARN_ON(cur_trans != trans->transaction); 1065 WARN_ON(cur_trans != trans->transaction);
954 prepare_to_wait(&cur_trans->writer_wait, &wait,
955 TASK_UNINTERRUPTIBLE);
956
957 if (cur_trans->num_writers > 1) 1066 if (cur_trans->num_writers > 1)
958 timeout = MAX_SCHEDULE_TIMEOUT; 1067 timeout = MAX_SCHEDULE_TIMEOUT;
959 else if (should_grow) 1068 else if (should_grow)
@@ -976,6 +1085,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
976 */ 1085 */
977 btrfs_run_ordered_operations(root, 1); 1086 btrfs_run_ordered_operations(root, 1);
978 1087
1088 prepare_to_wait(&cur_trans->writer_wait, &wait,
1089 TASK_UNINTERRUPTIBLE);
1090
979 smp_mb(); 1091 smp_mb();
980 if (cur_trans->num_writers > 1 || should_grow) 1092 if (cur_trans->num_writers > 1 || should_grow)
981 schedule_timeout(timeout); 1093 schedule_timeout(timeout);
@@ -1103,9 +1215,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1103 1215
1104 if (btrfs_header_backref_rev(root->node) < 1216 if (btrfs_header_backref_rev(root->node) <
1105 BTRFS_MIXED_BACKREF_REV) 1217 BTRFS_MIXED_BACKREF_REV)
1106 btrfs_drop_snapshot(root, 0); 1218 btrfs_drop_snapshot(root, NULL, 0);
1107 else 1219 else
1108 btrfs_drop_snapshot(root, 1); 1220 btrfs_drop_snapshot(root, NULL, 1);
1109 } 1221 }
1110 return 0; 1222 return 0;
1111} 1223}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 93c7ccb33118..e104986d0bfd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -45,20 +45,23 @@ struct btrfs_transaction {
45 45
46struct btrfs_trans_handle { 46struct btrfs_trans_handle {
47 u64 transid; 47 u64 transid;
48 u64 block_group;
49 u64 bytes_reserved;
48 unsigned long blocks_reserved; 50 unsigned long blocks_reserved;
49 unsigned long blocks_used; 51 unsigned long blocks_used;
50 struct btrfs_transaction *transaction;
51 u64 block_group;
52 u64 alloc_exclude_start;
53 u64 alloc_exclude_nr;
54 unsigned long delayed_ref_updates; 52 unsigned long delayed_ref_updates;
53 struct btrfs_transaction *transaction;
54 struct btrfs_block_rsv *block_rsv;
55}; 55};
56 56
57struct btrfs_pending_snapshot { 57struct btrfs_pending_snapshot {
58 struct dentry *dentry; 58 struct dentry *dentry;
59 struct btrfs_root *root; 59 struct btrfs_root *root;
60 char *name; 60 struct btrfs_root *snap;
61 struct btrfs_key root_key; 61 /* block reservation for the operation */
62 struct btrfs_block_rsv block_rsv;
63 /* extra metadata reservation for relocation */
64 int error;
62 struct list_head list; 65 struct list_head list;
63}; 66};
64 67
@@ -85,11 +88,11 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
85int btrfs_end_transaction(struct btrfs_trans_handle *trans, 88int btrfs_end_transaction(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root); 89 struct btrfs_root *root);
87struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 90struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
88 int num_blocks); 91 int num_items);
89struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 92struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
90 int num_blocks); 93 int num_blocks);
91struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 94struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
92 int num_blocks); 95 int num_blocks);
93int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 96int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
94 struct btrfs_root *root); 97 struct btrfs_root *root);
95int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 98int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -103,6 +106,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root); 106 struct btrfs_root *root);
104int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 107int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
105 struct btrfs_root *root); 108 struct btrfs_root *root);
109int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
110 struct btrfs_root *root);
106void btrfs_throttle(struct btrfs_root *root); 111void btrfs_throttle(struct btrfs_root *root);
107int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 112int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
108 struct btrfs_root *root); 113 struct btrfs_root *root);
@@ -112,5 +117,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
112 struct extent_io_tree *dirty_pages, int mark); 117 struct extent_io_tree *dirty_pages, int mark);
113int btrfs_wait_marked_extents(struct btrfs_root *root, 118int btrfs_wait_marked_extents(struct btrfs_root *root,
114 struct extent_io_tree *dirty_pages, int mark); 119 struct extent_io_tree *dirty_pages, int mark);
120int btrfs_transaction_blocked(struct btrfs_fs_info *info);
115int btrfs_transaction_in_commit(struct btrfs_fs_info *info); 121int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
116#endif 122#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b10eacdb1620..f7ac8e013ed7 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -117,13 +117,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
117 path->nodes[1], 0, 117 path->nodes[1], 0,
118 cache_only, &last_ret, 118 cache_only, &last_ret,
119 &root->defrag_progress); 119 &root->defrag_progress);
120 WARN_ON(ret && ret != -EAGAIN); 120 if (ret) {
121 WARN_ON(ret == -EAGAIN);
122 goto out;
123 }
121 if (next_key_ret == 0) { 124 if (next_key_ret == 0) {
122 memcpy(&root->defrag_progress, &key, sizeof(key)); 125 memcpy(&root->defrag_progress, &key, sizeof(key));
123 ret = -EAGAIN; 126 ret = -EAGAIN;
124 } 127 }
125
126 btrfs_release_path(root, path);
127out: 128out:
128 if (path) 129 if (path)
129 btrfs_free_path(path); 130 btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index af57dd2b43d4..fb102a9aee9c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -135,6 +135,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
135 struct btrfs_root *root) 135 struct btrfs_root *root)
136{ 136{
137 int ret; 137 int ret;
138 int err = 0;
138 139
139 mutex_lock(&root->log_mutex); 140 mutex_lock(&root->log_mutex);
140 if (root->log_root) { 141 if (root->log_root) {
@@ -155,17 +156,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
155 mutex_lock(&root->fs_info->tree_log_mutex); 156 mutex_lock(&root->fs_info->tree_log_mutex);
156 if (!root->fs_info->log_root_tree) { 157 if (!root->fs_info->log_root_tree) {
157 ret = btrfs_init_log_root_tree(trans, root->fs_info); 158 ret = btrfs_init_log_root_tree(trans, root->fs_info);
158 BUG_ON(ret); 159 if (ret)
160 err = ret;
159 } 161 }
160 if (!root->log_root) { 162 if (err == 0 && !root->log_root) {
161 ret = btrfs_add_log_tree(trans, root); 163 ret = btrfs_add_log_tree(trans, root);
162 BUG_ON(ret); 164 if (ret)
165 err = ret;
163 } 166 }
164 mutex_unlock(&root->fs_info->tree_log_mutex); 167 mutex_unlock(&root->fs_info->tree_log_mutex);
165 root->log_batch++; 168 root->log_batch++;
166 atomic_inc(&root->log_writers); 169 atomic_inc(&root->log_writers);
167 mutex_unlock(&root->log_mutex); 170 mutex_unlock(&root->log_mutex);
168 return 0; 171 return err;
169} 172}
170 173
171/* 174/*
@@ -376,7 +379,7 @@ insert:
376 BUG_ON(ret); 379 BUG_ON(ret);
377 } 380 }
378 } else if (ret) { 381 } else if (ret) {
379 BUG(); 382 return ret;
380 } 383 }
381 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 384 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
382 path->slots[0]); 385 path->slots[0]);
@@ -1699,9 +1702,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1699 1702
1700 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1703 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1701 1704
1702 wc->process_func(root, next, wc, ptr_gen);
1703
1704 if (*level == 1) { 1705 if (*level == 1) {
1706 wc->process_func(root, next, wc, ptr_gen);
1707
1705 path->slots[*level]++; 1708 path->slots[*level]++;
1706 if (wc->free) { 1709 if (wc->free) {
1707 btrfs_read_buffer(next, ptr_gen); 1710 btrfs_read_buffer(next, ptr_gen);
@@ -1734,35 +1737,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1734 WARN_ON(*level < 0); 1737 WARN_ON(*level < 0);
1735 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1738 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1736 1739
1737 if (path->nodes[*level] == root->node) 1740 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
1738 parent = path->nodes[*level];
1739 else
1740 parent = path->nodes[*level + 1];
1741
1742 bytenr = path->nodes[*level]->start;
1743
1744 blocksize = btrfs_level_size(root, *level);
1745 root_owner = btrfs_header_owner(parent);
1746 root_gen = btrfs_header_generation(parent);
1747
1748 wc->process_func(root, path->nodes[*level], wc,
1749 btrfs_header_generation(path->nodes[*level]));
1750
1751 if (wc->free) {
1752 next = path->nodes[*level];
1753 btrfs_tree_lock(next);
1754 clean_tree_block(trans, root, next);
1755 btrfs_set_lock_blocking(next);
1756 btrfs_wait_tree_block_writeback(next);
1757 btrfs_tree_unlock(next);
1758
1759 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1760 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1761 BUG_ON(ret);
1762 }
1763 free_extent_buffer(path->nodes[*level]);
1764 path->nodes[*level] = NULL;
1765 *level += 1;
1766 1741
1767 cond_resched(); 1742 cond_resched();
1768 return 0; 1743 return 0;
@@ -1781,7 +1756,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1781 1756
1782 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1757 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1783 slot = path->slots[i]; 1758 slot = path->slots[i];
1784 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 1759 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
1785 struct extent_buffer *node; 1760 struct extent_buffer *node;
1786 node = path->nodes[i]; 1761 node = path->nodes[i];
1787 path->slots[i]++; 1762 path->slots[i]++;
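
The small-looking change in walk_up_log_tree() guards an unsigned underflow: btrfs_header_nritems() yields a u32, so for an empty node the old test "slot < nritems - 1" compares against 0 - 1 wrapped to 0xffffffff and is effectively always true, while "slot + 1 < nritems" stays on the safe side. A standalone demonstration of the two forms:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int slot = 0;
	uint32_t nritems = 0;	/* an empty node */

	/* old form: nritems - 1 wraps to UINT32_MAX, test wrongly passes */
	if (slot < nritems - 1)
		printf("old test: claims another slot exists\n");

	/* new form: 1 < 0 is false, as intended */
	if (slot + 1 < nritems)
		printf("new test: claims another slot exists\n");
	else
		printf("new test: node exhausted\n");
	return 0;
}
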
@@ -2047,7 +2022,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2047 mutex_unlock(&log_root_tree->log_mutex); 2022 mutex_unlock(&log_root_tree->log_mutex);
2048 2023
2049 ret = update_log_root(trans, log); 2024 ret = update_log_root(trans, log);
2050 BUG_ON(ret);
2051 2025
2052 mutex_lock(&log_root_tree->log_mutex); 2026 mutex_lock(&log_root_tree->log_mutex);
2053 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2027 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
@@ -2056,6 +2030,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2056 wake_up(&log_root_tree->log_writer_wait); 2030 wake_up(&log_root_tree->log_writer_wait);
2057 } 2031 }
2058 2032
2033 if (ret) {
2034 BUG_ON(ret != -ENOSPC);
2035 root->fs_info->last_trans_log_full_commit = trans->transid;
2036 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2037 mutex_unlock(&log_root_tree->log_mutex);
2038 ret = -EAGAIN;
2039 goto out;
2040 }
2041
2059 index2 = log_root_tree->log_transid % 2; 2042 index2 = log_root_tree->log_transid % 2;
2060 if (atomic_read(&log_root_tree->log_commit[index2])) { 2043 if (atomic_read(&log_root_tree->log_commit[index2])) {
2061 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2044 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
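
The error block added to btrfs_sync_log() is the recovery strategy behind most of these conversions: an -ENOSPC on the log path is not fatal, it merely disqualifies the tree log for the rest of this transaction. Writing trans->transid into last_trans_log_full_commit is the signal; fast-path code is expected to notice it and degrade to a full transaction commit. A hedged sketch of a consumer of that flag, not the literal fsync code:

	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
		/* the log is off limits this transaction; take the
		 * slower but always-correct path */
		ret = btrfs_commit_transaction(trans, root);
	} else {
		ret = btrfs_sync_log(trans, root);
	}
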
@@ -2129,15 +2112,10 @@ out:
2129 return 0; 2112 return 0;
2130} 2113}
2131 2114
2132/* 2115static void free_log_tree(struct btrfs_trans_handle *trans,
2133 * free all the extents used by the tree log. This should be called 2116 struct btrfs_root *log)
2134 * at commit time of the full transaction
2135 */
2136int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2137{ 2117{
2138 int ret; 2118 int ret;
2139 struct btrfs_root *log;
2140 struct key;
2141 u64 start; 2119 u64 start;
2142 u64 end; 2120 u64 end;
2143 struct walk_control wc = { 2121 struct walk_control wc = {
@@ -2145,10 +2123,6 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2145 .process_func = process_one_buffer 2123 .process_func = process_one_buffer
2146 }; 2124 };
2147 2125
2148 if (!root->log_root || root->fs_info->log_root_recovering)
2149 return 0;
2150
2151 log = root->log_root;
2152 ret = walk_log_tree(trans, log, &wc); 2126 ret = walk_log_tree(trans, log, &wc);
2153 BUG_ON(ret); 2127 BUG_ON(ret);
2154 2128
@@ -2162,14 +2136,30 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2162 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2136 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2163 } 2137 }
2164 2138
2165 if (log->log_transid > 0) {
2166 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2167 &log->root_key);
2168 BUG_ON(ret);
2169 }
2170 root->log_root = NULL;
2171 free_extent_buffer(log->node); 2139 free_extent_buffer(log->node);
2172 kfree(log); 2140 kfree(log);
2141}
2142
2143/*
2144 * free all the extents used by the tree log. This should be called
2145 * at commit time of the full transaction
2146 */
2147int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2148{
2149 if (root->log_root) {
2150 free_log_tree(trans, root->log_root);
2151 root->log_root = NULL;
2152 }
2153 return 0;
2154}
2155
2156int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2157 struct btrfs_fs_info *fs_info)
2158{
2159 if (fs_info->log_root_tree) {
2160 free_log_tree(trans, fs_info->log_root_tree);
2161 fs_info->log_root_tree = NULL;
2162 }
2173 return 0; 2163 return 0;
2174} 2164}
2175 2165
@@ -2203,6 +2193,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2203 struct btrfs_dir_item *di; 2193 struct btrfs_dir_item *di;
2204 struct btrfs_path *path; 2194 struct btrfs_path *path;
2205 int ret; 2195 int ret;
2196 int err = 0;
2206 int bytes_del = 0; 2197 int bytes_del = 0;
2207 2198
2208 if (BTRFS_I(dir)->logged_trans < trans->transid) 2199 if (BTRFS_I(dir)->logged_trans < trans->transid)
@@ -2218,7 +2209,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2218 path = btrfs_alloc_path(); 2209 path = btrfs_alloc_path();
2219 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2210 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2220 name, name_len, -1); 2211 name, name_len, -1);
2221 if (di && !IS_ERR(di)) { 2212 if (IS_ERR(di)) {
2213 err = PTR_ERR(di);
2214 goto fail;
2215 }
2216 if (di) {
2222 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2217 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2223 bytes_del += name_len; 2218 bytes_del += name_len;
2224 BUG_ON(ret); 2219 BUG_ON(ret);
@@ -2226,7 +2221,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2226 btrfs_release_path(log, path); 2221 btrfs_release_path(log, path);
2227 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2222 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2228 index, name, name_len, -1); 2223 index, name, name_len, -1);
2229 if (di && !IS_ERR(di)) { 2224 if (IS_ERR(di)) {
2225 err = PTR_ERR(di);
2226 goto fail;
2227 }
2228 if (di) {
2230 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2229 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2231 bytes_del += name_len; 2230 bytes_del += name_len;
2232 BUG_ON(ret); 2231 BUG_ON(ret);
@@ -2244,6 +2243,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2244 btrfs_release_path(log, path); 2243 btrfs_release_path(log, path);
2245 2244
2246 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2245 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2246 if (ret < 0) {
2247 err = ret;
2248 goto fail;
2249 }
2247 if (ret == 0) { 2250 if (ret == 0) {
2248 struct btrfs_inode_item *item; 2251 struct btrfs_inode_item *item;
2249 u64 i_size; 2252 u64 i_size;
@@ -2261,9 +2264,13 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2261 ret = 0; 2264 ret = 0;
2262 btrfs_release_path(log, path); 2265 btrfs_release_path(log, path);
2263 } 2266 }
2264 2267fail:
2265 btrfs_free_path(path); 2268 btrfs_free_path(path);
2266 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2269 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2270 if (ret == -ENOSPC) {
2271 root->fs_info->last_trans_log_full_commit = trans->transid;
2272 ret = 0;
2273 }
2267 btrfs_end_log_trans(root); 2274 btrfs_end_log_trans(root);
2268 2275
2269 return 0; 2276 return 0;
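
Both lookups in btrfs_del_dir_entries_in_log() move from the easy-to-misread "if (di && !IS_ERR(di))", which silently swallowed real errors, to an explicit three-way split: error pointer, NULL meaning "not found" (fine here), or a usable item. A standalone re-creation of the <linux/err.h> helpers to show that contract:

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

/* userspace re-creations of the <linux/err.h> helpers */
static void *ERR_PTR(long error) { return (void *)error; }
static long PTR_ERR(const void *ptr) { return (long)ptr; }
static int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static int item;	/* stand-in for a found directory item */

/* hypothetical lookup: error pointer on failure, NULL when absent */
static void *lookup_dir_item(int key)
{
	if (key < 0)
		return ERR_PTR(-EIO);
	return key ? (void *)&item : NULL;
}

int main(void)
{
	void *di = lookup_dir_item(-1);

	if (IS_ERR(di)) {		/* hard failure: propagate it */
		printf("error %ld\n", PTR_ERR(di));
		return 1;
	}
	if (di)				/* only act on a real item */
		printf("found, deleting\n");
	else				/* absent: nothing to delete */
		printf("not found, nothing to do\n");
	return 0;
}
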
@@ -2291,6 +2298,10 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2291 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2298 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2292 dirid, &index); 2299 dirid, &index);
2293 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2300 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2301 if (ret == -ENOSPC) {
2302 root->fs_info->last_trans_log_full_commit = trans->transid;
2303 ret = 0;
2304 }
2294 btrfs_end_log_trans(root); 2305 btrfs_end_log_trans(root);
2295 2306
2296 return ret; 2307 return ret;
@@ -2318,7 +2329,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2318 else 2329 else
2319 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2330 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2320 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2331 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2321 BUG_ON(ret); 2332 if (ret)
2333 return ret;
2322 2334
2323 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2335 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2324 struct btrfs_dir_log_item); 2336 struct btrfs_dir_log_item);
@@ -2343,6 +2355,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2343 struct btrfs_key max_key; 2355 struct btrfs_key max_key;
2344 struct btrfs_root *log = root->log_root; 2356 struct btrfs_root *log = root->log_root;
2345 struct extent_buffer *src; 2357 struct extent_buffer *src;
2358 int err = 0;
2346 int ret; 2359 int ret;
2347 int i; 2360 int i;
2348 int nritems; 2361 int nritems;
@@ -2405,6 +2418,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2405 ret = overwrite_item(trans, log, dst_path, 2418 ret = overwrite_item(trans, log, dst_path,
2406 path->nodes[0], path->slots[0], 2419 path->nodes[0], path->slots[0],
2407 &tmp); 2420 &tmp);
2421 if (ret) {
2422 err = ret;
2423 goto done;
2424 }
2408 } 2425 }
2409 } 2426 }
2410 btrfs_release_path(root, path); 2427 btrfs_release_path(root, path);
@@ -2432,7 +2449,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2432 goto done; 2449 goto done;
2433 ret = overwrite_item(trans, log, dst_path, src, i, 2450 ret = overwrite_item(trans, log, dst_path, src, i,
2434 &min_key); 2451 &min_key);
2435 BUG_ON(ret); 2452 if (ret) {
2453 err = ret;
2454 goto done;
2455 }
2436 } 2456 }
2437 path->slots[0] = nritems; 2457 path->slots[0] = nritems;
2438 2458
@@ -2454,22 +2474,30 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2454 ret = overwrite_item(trans, log, dst_path, 2474 ret = overwrite_item(trans, log, dst_path,
2455 path->nodes[0], path->slots[0], 2475 path->nodes[0], path->slots[0],
2456 &tmp); 2476 &tmp);
2457 2477 if (ret)
2458 BUG_ON(ret); 2478 err = ret;
2459 last_offset = tmp.offset; 2479 else
2480 last_offset = tmp.offset;
2460 goto done; 2481 goto done;
2461 } 2482 }
2462 } 2483 }
2463done: 2484done:
2464 *last_offset_ret = last_offset;
2465 btrfs_release_path(root, path); 2485 btrfs_release_path(root, path);
2466 btrfs_release_path(log, dst_path); 2486 btrfs_release_path(log, dst_path);
2467 2487
2468 /* insert the log range keys to indicate where the log is valid */ 2488 if (err == 0) {
2469 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, 2489 *last_offset_ret = last_offset;
2470 first_offset, last_offset); 2490 /*
2471 BUG_ON(ret); 2491 * insert the log range keys to indicate where the log
2472 return 0; 2492 * is valid
2493 */
2494 ret = insert_dir_log_key(trans, log, path, key_type,
2495 inode->i_ino, first_offset,
2496 last_offset);
2497 if (ret)
2498 err = ret;
2499 }
2500 return err;
2473} 2501}
2474 2502
2475/* 2503/*
@@ -2501,7 +2529,8 @@ again:
2501 ret = log_dir_items(trans, root, inode, path, 2529 ret = log_dir_items(trans, root, inode, path,
2502 dst_path, key_type, min_key, 2530 dst_path, key_type, min_key,
2503 &max_key); 2531 &max_key);
2504 BUG_ON(ret); 2532 if (ret)
2533 return ret;
2505 if (max_key == (u64)-1) 2534 if (max_key == (u64)-1)
2506 break; 2535 break;
2507 min_key = max_key + 1; 2536 min_key = max_key + 1;
@@ -2535,8 +2564,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2535 2564
2536 while (1) { 2565 while (1) {
2537 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2566 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2538 2567 BUG_ON(ret == 0);
2539 if (ret != 1) 2568 if (ret < 0)
2540 break; 2569 break;
2541 2570
2542 if (path->slots[0] == 0) 2571 if (path->slots[0] == 0)
@@ -2554,7 +2583,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2554 btrfs_release_path(log, path); 2583 btrfs_release_path(log, path);
2555 } 2584 }
2556 btrfs_release_path(log, path); 2585 btrfs_release_path(log, path);
2557 return 0; 2586 return ret;
2558} 2587}
2559 2588
2560static noinline int copy_items(struct btrfs_trans_handle *trans, 2589static noinline int copy_items(struct btrfs_trans_handle *trans,
@@ -2587,7 +2616,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2587 } 2616 }
2588 ret = btrfs_insert_empty_items(trans, log, dst_path, 2617 ret = btrfs_insert_empty_items(trans, log, dst_path,
2589 ins_keys, ins_sizes, nr); 2618 ins_keys, ins_sizes, nr);
2590 BUG_ON(ret); 2619 if (ret) {
2620 kfree(ins_data);
2621 return ret;
2622 }
2591 2623
2592 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 2624 for (i = 0; i < nr; i++, dst_path->slots[0]++) {
2593 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2625 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
@@ -2660,16 +2692,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2660 * we have to do this after the loop above to avoid changing the 2692 * we have to do this after the loop above to avoid changing the
2661 * log tree while trying to change the log tree. 2693 * log tree while trying to change the log tree.
2662 */ 2694 */
2695 ret = 0;
2663 while (!list_empty(&ordered_sums)) { 2696 while (!list_empty(&ordered_sums)) {
2664 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2697 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2665 struct btrfs_ordered_sum, 2698 struct btrfs_ordered_sum,
2666 list); 2699 list);
2667 ret = btrfs_csum_file_blocks(trans, log, sums); 2700 if (!ret)
2668 BUG_ON(ret); 2701 ret = btrfs_csum_file_blocks(trans, log, sums);
2669 list_del(&sums->list); 2702 list_del(&sums->list);
2670 kfree(sums); 2703 kfree(sums);
2671 } 2704 }
2672 return 0; 2705 return ret;
2673} 2706}
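
The ordered_sums loop in copy_items() keeps walking the list after a failure: the "if (!ret)" guard stops issuing btrfs_csum_file_blocks() calls once one fails, but every queued entry is still unlinked and freed, so an early error cannot leak the remainder. The same drain-on-error pattern standalone:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct sum {
	struct sum *next;
	int value;
};

static int process(struct sum *s)
{
	return s->value == 2 ? -EIO : 0;	/* pretend entry 2 fails */
}

int main(void)
{
	struct sum *head = NULL, *s;
	int ret = 0;
	int i;

	for (i = 3; i >= 1; i--) {		/* build list 1 -> 2 -> 3 */
		s = malloc(sizeof(*s));
		s->value = i;
		s->next = head;
		head = s;
	}

	while (head) {				/* drain unconditionally */
		s = head;
		head = s->next;
		if (!ret)			/* stop work after the   */
			ret = process(s);	/* first failure ...     */
		free(s);			/* ... but free them all */
	}
	printf("ret = %d\n", ret);		/* first error survives */
	return 0;
}
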
2674 2707
2675/* log a single inode in the tree log. 2708/* log a single inode in the tree log.
@@ -2697,6 +2730,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2697 struct btrfs_root *log = root->log_root; 2730 struct btrfs_root *log = root->log_root;
2698 struct extent_buffer *src = NULL; 2731 struct extent_buffer *src = NULL;
2699 u32 size; 2732 u32 size;
2733 int err = 0;
2700 int ret; 2734 int ret;
2701 int nritems; 2735 int nritems;
2702 int ins_start_slot = 0; 2736 int ins_start_slot = 0;
@@ -2739,7 +2773,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2739 } else { 2773 } else {
2740 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2774 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2741 } 2775 }
2742 BUG_ON(ret); 2776 if (ret) {
2777 err = ret;
2778 goto out_unlock;
2779 }
2743 path->keep_locks = 1; 2780 path->keep_locks = 1;
2744 2781
2745 while (1) { 2782 while (1) {
@@ -2768,7 +2805,10 @@ again:
2768 2805
2769 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2806 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2770 ins_nr, inode_only); 2807 ins_nr, inode_only);
2771 BUG_ON(ret); 2808 if (ret) {
2809 err = ret;
2810 goto out_unlock;
2811 }
2772 ins_nr = 1; 2812 ins_nr = 1;
2773 ins_start_slot = path->slots[0]; 2813 ins_start_slot = path->slots[0];
2774next_slot: 2814next_slot:
@@ -2784,7 +2824,10 @@ next_slot:
2784 ret = copy_items(trans, log, dst_path, src, 2824 ret = copy_items(trans, log, dst_path, src,
2785 ins_start_slot, 2825 ins_start_slot,
2786 ins_nr, inode_only); 2826 ins_nr, inode_only);
2787 BUG_ON(ret); 2827 if (ret) {
2828 err = ret;
2829 goto out_unlock;
2830 }
2788 ins_nr = 0; 2831 ins_nr = 0;
2789 } 2832 }
2790 btrfs_release_path(root, path); 2833 btrfs_release_path(root, path);
@@ -2802,7 +2845,10 @@ next_slot:
2802 ret = copy_items(trans, log, dst_path, src, 2845 ret = copy_items(trans, log, dst_path, src,
2803 ins_start_slot, 2846 ins_start_slot,
2804 ins_nr, inode_only); 2847 ins_nr, inode_only);
2805 BUG_ON(ret); 2848 if (ret) {
2849 err = ret;
2850 goto out_unlock;
2851 }
2806 ins_nr = 0; 2852 ins_nr = 0;
2807 } 2853 }
2808 WARN_ON(ins_nr); 2854 WARN_ON(ins_nr);
@@ -2810,14 +2856,18 @@ next_slot:
2810 btrfs_release_path(root, path); 2856 btrfs_release_path(root, path);
2811 btrfs_release_path(log, dst_path); 2857 btrfs_release_path(log, dst_path);
2812 ret = log_directory_changes(trans, root, inode, path, dst_path); 2858 ret = log_directory_changes(trans, root, inode, path, dst_path);
2813 BUG_ON(ret); 2859 if (ret) {
2860 err = ret;
2861 goto out_unlock;
2862 }
2814 } 2863 }
2815 BTRFS_I(inode)->logged_trans = trans->transid; 2864 BTRFS_I(inode)->logged_trans = trans->transid;
2865out_unlock:
2816 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2866 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2817 2867
2818 btrfs_free_path(path); 2868 btrfs_free_path(path);
2819 btrfs_free_path(dst_path); 2869 btrfs_free_path(dst_path);
2820 return 0; 2870 return err;
2821} 2871}
2822 2872
2823/* 2873/*
@@ -2942,10 +2992,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2942 goto end_no_trans; 2992 goto end_no_trans;
2943 } 2993 }
2944 2994
2945 start_log_trans(trans, root); 2995 ret = start_log_trans(trans, root);
2996 if (ret)
2997 goto end_trans;
2946 2998
2947 ret = btrfs_log_inode(trans, root, inode, inode_only); 2999 ret = btrfs_log_inode(trans, root, inode, inode_only);
2948 BUG_ON(ret); 3000 if (ret)
3001 goto end_trans;
2949 3002
2950 /* 3003 /*
2951 * for regular files, if its inode is already on disk, we don't 3004 * for regular files, if its inode is already on disk, we don't
@@ -2955,8 +3008,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2955 */ 3008 */
2956 if (S_ISREG(inode->i_mode) && 3009 if (S_ISREG(inode->i_mode) &&
2957 BTRFS_I(inode)->generation <= last_committed && 3010 BTRFS_I(inode)->generation <= last_committed &&
2958 BTRFS_I(inode)->last_unlink_trans <= last_committed) 3011 BTRFS_I(inode)->last_unlink_trans <= last_committed) {
2959 goto no_parent; 3012 ret = 0;
3013 goto end_trans;
3014 }
2960 3015
2961 inode_only = LOG_INODE_EXISTS; 3016 inode_only = LOG_INODE_EXISTS;
2962 while (1) { 3017 while (1) {
@@ -2970,15 +3025,21 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2970 if (BTRFS_I(inode)->generation > 3025 if (BTRFS_I(inode)->generation >
2971 root->fs_info->last_trans_committed) { 3026 root->fs_info->last_trans_committed) {
2972 ret = btrfs_log_inode(trans, root, inode, inode_only); 3027 ret = btrfs_log_inode(trans, root, inode, inode_only);
2973 BUG_ON(ret); 3028 if (ret)
3029 goto end_trans;
2974 } 3030 }
2975 if (IS_ROOT(parent)) 3031 if (IS_ROOT(parent))
2976 break; 3032 break;
2977 3033
2978 parent = parent->d_parent; 3034 parent = parent->d_parent;
2979 } 3035 }
2980no_parent:
2981 ret = 0; 3036 ret = 0;
3037end_trans:
3038 if (ret < 0) {
3039 BUG_ON(ret != -ENOSPC);
3040 root->fs_info->last_trans_log_full_commit = trans->transid;
3041 ret = 1;
3042 }
2982 btrfs_end_log_trans(root); 3043 btrfs_end_log_trans(root);
2983end_no_trans: 3044end_no_trans:
2984 return ret; 3045 return ret;
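
The new end_trans label funnels every failure in btrfs_log_inode_parent() through one policy: assert that only -ENOSPC escaped, set the full-commit flag, and return 1 rather than a negative errno. The positive value reads as a directive, the return-value twin of the flag shown earlier; a hypothetical caller would react like this:

	/* hypothetical caller; log_the_inode() is a stand-in */
	ret = log_the_inode(trans, root, inode);
	if (ret == 0)
		ret = btrfs_sync_log(trans, root);	     /* fast path */
	else if (ret > 0)
		ret = btrfs_commit_transaction(trans, root); /* degrade */
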
@@ -3020,7 +3081,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3020 path = btrfs_alloc_path(); 3081 path = btrfs_alloc_path();
3021 BUG_ON(!path); 3082 BUG_ON(!path);
3022 3083
3023 trans = btrfs_start_transaction(fs_info->tree_root, 1); 3084 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3024 3085
3025 wc.trans = trans; 3086 wc.trans = trans;
3026 wc.pin = 1; 3087 wc.pin = 1;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 0776eacb5083..3dfae84c8cc8 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -25,6 +25,8 @@
25int btrfs_sync_log(struct btrfs_trans_handle *trans, 25int btrfs_sync_log(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root); 26 struct btrfs_root *root);
27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
28int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
29 struct btrfs_fs_info *fs_info);
28int btrfs_recover_log_trees(struct btrfs_root *tree_root); 30int btrfs_recover_log_trees(struct btrfs_root *tree_root);
29int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 31int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
30 struct btrfs_root *root, struct dentry *dentry); 32 struct btrfs_root *root, struct dentry *dentry);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8db7b14bbae8..d6e3af8be95b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1097,7 +1097,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1097 if (!path) 1097 if (!path)
1098 return -ENOMEM; 1098 return -ENOMEM;
1099 1099
1100 trans = btrfs_start_transaction(root, 1); 1100 trans = btrfs_start_transaction(root, 0);
1101 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1101 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1102 key.type = BTRFS_DEV_ITEM_KEY; 1102 key.type = BTRFS_DEV_ITEM_KEY;
1103 key.offset = device->devid; 1103 key.offset = device->devid;
@@ -1486,7 +1486,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1486 goto error; 1486 goto error;
1487 } 1487 }
1488 1488
1489 trans = btrfs_start_transaction(root, 1); 1489 trans = btrfs_start_transaction(root, 0);
1490 lock_chunks(root); 1490 lock_chunks(root);
1491 1491
1492 device->barriers = 1; 1492 device->barriers = 1;
@@ -1751,9 +1751,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1751 1751
1752 /* step one, relocate all the extents inside this chunk */ 1752 /* step one, relocate all the extents inside this chunk */
1753 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1753 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1754 BUG_ON(ret); 1754 if (ret)
1755 return ret;
1755 1756
1756 trans = btrfs_start_transaction(root, 1); 1757 trans = btrfs_start_transaction(root, 0);
1757 BUG_ON(!trans); 1758 BUG_ON(!trans);
1758 1759
1759 lock_chunks(root); 1760 lock_chunks(root);
@@ -1925,7 +1926,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1925 break; 1926 break;
1926 BUG_ON(ret); 1927 BUG_ON(ret);
1927 1928
1928 trans = btrfs_start_transaction(dev_root, 1); 1929 trans = btrfs_start_transaction(dev_root, 0);
1929 BUG_ON(!trans); 1930 BUG_ON(!trans);
1930 1931
1931 ret = btrfs_grow_device(trans, device, old_size); 1932 ret = btrfs_grow_device(trans, device, old_size);
@@ -2094,11 +2095,7 @@ again:
2094 } 2095 }
2095 2096
2096 /* Shrinking succeeded, else we would be at "done". */ 2097 /* Shrinking succeeded, else we would be at "done". */
2097 trans = btrfs_start_transaction(root, 1); 2098 trans = btrfs_start_transaction(root, 0);
2098 if (!trans) {
2099 ret = -ENOMEM;
2100 goto done;
2101 }
2102 lock_chunks(root); 2099 lock_chunks(root);
2103 2100
2104 device->disk_total_bytes = new_size; 2101 device->disk_total_bytes = new_size;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 59acd3eb288a..88ecbb215878 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -154,15 +154,10 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
154 if (trans) 154 if (trans)
155 return do_setxattr(trans, inode, name, value, size, flags); 155 return do_setxattr(trans, inode, name, value, size, flags);
156 156
157 ret = btrfs_reserve_metadata_space(root, 2); 157 trans = btrfs_start_transaction(root, 2);
158 if (ret) 158 if (IS_ERR(trans))
159 return ret; 159 return PTR_ERR(trans);
160 160
161 trans = btrfs_start_transaction(root, 1);
162 if (!trans) {
163 ret = -ENOMEM;
164 goto out;
165 }
166 btrfs_set_trans_block_group(trans, inode); 161 btrfs_set_trans_block_group(trans, inode);
167 162
168 ret = do_setxattr(trans, inode, name, value, size, flags); 163 ret = do_setxattr(trans, inode, name, value, size, flags);
@@ -174,7 +169,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
174 BUG_ON(ret); 169 BUG_ON(ret);
175out: 170out:
176 btrfs_end_transaction_throttle(trans, root); 171 btrfs_end_transaction_throttle(trans, root);
177 btrfs_unreserve_metadata_space(root, 2);
178 return ret; 172 return ret;
179} 173}
180 174
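
The __btrfs_setxattr() hunk is the clearest statement of the transaction API change also running through volumes.c above: btrfs_start_transaction() now takes the number of items to reserve metadata space for (0 meaning no reservation) and reports failure as an error pointer rather than NULL, replacing the separate btrfs_reserve_metadata_space()/btrfs_unreserve_metadata_space() pair. A hedged usage sketch:

	/* reserve room for two tree items up front */
	trans = btrfs_start_transaction(root, 2);
	if (IS_ERR(trans))
		return PTR_ERR(trans);		/* commonly -ENOSPC */

	/* ... modify up to two items under this handle ... */

	btrfs_end_transaction(trans, root);	/* presumably drops the
						   reservation as well */
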
diff --git a/fs/buffer.c b/fs/buffer.c
index e8aa7081d25c..d54812b198e9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1949,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
1949} 1949}
1950 1950
1951/* 1951/*
1952 * block_write_begin takes care of the basic task of block allocation and 1952 * Filesystems implementing the new truncate sequence should use the
1953 * bringing partial write blocks uptodate first. 1953 * _newtrunc postfix variant which won't incorrectly call vmtruncate.
1954 * 1954 * The filesystem needs to handle block truncation upon failure.
1955 * If *pagep is not NULL, then block_write_begin uses the locked page
1956 * at *pagep rather than allocating its own. In this case, the page will
1957 * not be unlocked or deallocated on failure.
1958 */ 1955 */
1959int block_write_begin(struct file *file, struct address_space *mapping, 1956int block_write_begin_newtrunc(struct file *file, struct address_space *mapping,
1960 loff_t pos, unsigned len, unsigned flags, 1957 loff_t pos, unsigned len, unsigned flags,
1961 struct page **pagep, void **fsdata, 1958 struct page **pagep, void **fsdata,
1962 get_block_t *get_block) 1959 get_block_t *get_block)
@@ -1992,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping,
1992 unlock_page(page); 1989 unlock_page(page);
1993 page_cache_release(page); 1990 page_cache_release(page);
1994 *pagep = NULL; 1991 *pagep = NULL;
1995
1996 /*
1997 * prepare_write() may have instantiated a few blocks
1998 * outside i_size. Trim these off again. Don't need
1999 * i_size_read because we hold i_mutex.
2000 */
2001 if (pos + len > inode->i_size)
2002 vmtruncate(inode, inode->i_size);
2003 } 1992 }
2004 } 1993 }
2005 1994
2006out: 1995out:
2007 return status; 1996 return status;
2008} 1997}
1998EXPORT_SYMBOL(block_write_begin_newtrunc);
1999
2000/*
2001 * block_write_begin takes care of the basic task of block allocation and
2002 * bringing partial write blocks uptodate first.
2003 *
2004 * If *pagep is not NULL, then block_write_begin uses the locked page
2005 * at *pagep rather than allocating its own. In this case, the page will
2006 * not be unlocked or deallocated on failure.
2007 */
2008int block_write_begin(struct file *file, struct address_space *mapping,
2009 loff_t pos, unsigned len, unsigned flags,
2010 struct page **pagep, void **fsdata,
2011 get_block_t *get_block)
2012{
2013 int ret;
2014
2015 ret = block_write_begin_newtrunc(file, mapping, pos, len, flags,
2016 pagep, fsdata, get_block);
2017
2018 /*
2019 * prepare_write() may have instantiated a few blocks
2020 * outside i_size. Trim these off again. Don't need
2021 * i_size_read because we hold i_mutex.
2022 *
2023 * Filesystems which pass down their own page also cannot
2024 * call into vmtruncate here because it would lead to lock
2025 * inversion problems (*pagep is locked). This is a further
2026 * example of where the old truncate sequence is inadequate.
2027 */
2028 if (unlikely(ret) && *pagep == NULL) {
2029 loff_t isize = mapping->host->i_size;
2030 if (pos + len > isize)
2031 vmtruncate(mapping->host, isize);
2032 }
2033
2034 return ret;
2035}
2009EXPORT_SYMBOL(block_write_begin); 2036EXPORT_SYMBOL(block_write_begin);
2010 2037
2011int block_write_end(struct file *file, struct address_space *mapping, 2038int block_write_end(struct file *file, struct address_space *mapping,
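
Every buffer.c change follows one template: the body of a write_begin helper moves into a *_newtrunc variant that reports failure without trimming blocks past i_size, and the old name becomes a thin wrapper preserving the legacy vmtruncate()-on-failure behaviour for unconverted filesystems. A converted filesystem would call the _newtrunc variant and do its own trimming; a hypothetical sketch (the myfs_* names are invented):

static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	int ret;

	ret = block_write_begin_newtrunc(file, mapping, pos, len, flags,
					 pagep, fsdata, myfs_get_block);
	if (ret)
		/* trim blocks instantiated beyond i_size with the
		 * filesystem's own truncate path, not vmtruncate() */
		myfs_truncate_failed_write(mapping->host);
	return ret;
}
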
@@ -2324,7 +2351,7 @@ out:
2324 * For moronic filesystems that do not allow holes in file. 2351 * For moronic filesystems that do not allow holes in file.
2325 * We may have to extend the file. 2352 * We may have to extend the file.
2326 */ 2353 */
2327int cont_write_begin(struct file *file, struct address_space *mapping, 2354int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping,
2328 loff_t pos, unsigned len, unsigned flags, 2355 loff_t pos, unsigned len, unsigned flags,
2329 struct page **pagep, void **fsdata, 2356 struct page **pagep, void **fsdata,
2330 get_block_t *get_block, loff_t *bytes) 2357 get_block_t *get_block, loff_t *bytes)
@@ -2345,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
2345 } 2372 }
2346 2373
2347 *pagep = NULL; 2374 *pagep = NULL;
2348 err = block_write_begin(file, mapping, pos, len, 2375 err = block_write_begin_newtrunc(file, mapping, pos, len,
2349 flags, pagep, fsdata, get_block); 2376 flags, pagep, fsdata, get_block);
2350out: 2377out:
2351 return err; 2378 return err;
2352} 2379}
2380EXPORT_SYMBOL(cont_write_begin_newtrunc);
2381
2382int cont_write_begin(struct file *file, struct address_space *mapping,
2383 loff_t pos, unsigned len, unsigned flags,
2384 struct page **pagep, void **fsdata,
2385 get_block_t *get_block, loff_t *bytes)
2386{
2387 int ret;
2388
2389 ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
2390 pagep, fsdata, get_block, bytes);
2391 if (unlikely(ret)) {
2392 loff_t isize = mapping->host->i_size;
2393 if (pos + len > isize)
2394 vmtruncate(mapping->host, isize);
2395 }
2396
2397 return ret;
2398}
2353EXPORT_SYMBOL(cont_write_begin); 2399EXPORT_SYMBOL(cont_write_begin);
2354 2400
2355int block_prepare_write(struct page *page, unsigned from, unsigned to, 2401int block_prepare_write(struct page *page, unsigned from, unsigned to,
@@ -2381,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write);
2381 * 2427 *
2382 * We are not allowed to take the i_mutex here so we have to play games to 2428 * We are not allowed to take the i_mutex here so we have to play games to
2383 * protect against truncate races as the page could now be beyond EOF. Because 2429 * protect against truncate races as the page could now be beyond EOF. Because
2384 * vmtruncate() writes the inode size before removing pages, once we have the 2430 * truncate writes the inode size before removing pages, once we have the
2385 * page lock we can determine safely if the page is beyond EOF. If it is not 2431 * page lock we can determine safely if the page is beyond EOF. If it is not
2386 * beyond EOF, then the page is guaranteed safe against truncation until we 2432 * beyond EOF, then the page is guaranteed safe against truncation until we
2387 * unlock the page. 2433 * unlock the page.
@@ -2464,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2464} 2510}
2465 2511
2466/* 2512/*
2467 * On entry, the page is fully not uptodate. 2513 * Filesystems implementing the new truncate sequence should use the
2468 * On exit the page is fully uptodate in the areas outside (from,to) 2514 * _newtrunc postfix variant which won't incorrectly call vmtruncate.
2515 * The filesystem needs to handle block truncation upon failure.
2469 */ 2516 */
2470int nobh_write_begin(struct file *file, struct address_space *mapping, 2517int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping,
2471 loff_t pos, unsigned len, unsigned flags, 2518 loff_t pos, unsigned len, unsigned flags,
2472 struct page **pagep, void **fsdata, 2519 struct page **pagep, void **fsdata,
2473 get_block_t *get_block) 2520 get_block_t *get_block)
@@ -2500,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
2500 unlock_page(page); 2547 unlock_page(page);
2501 page_cache_release(page); 2548 page_cache_release(page);
2502 *pagep = NULL; 2549 *pagep = NULL;
2503 return block_write_begin(file, mapping, pos, len, flags, pagep, 2550 return block_write_begin_newtrunc(file, mapping, pos, len,
2504 fsdata, get_block); 2551 flags, pagep, fsdata, get_block);
2505 } 2552 }
2506 2553
2507 if (PageMappedToDisk(page)) 2554 if (PageMappedToDisk(page))
@@ -2605,8 +2652,34 @@ out_release:
2605 page_cache_release(page); 2652 page_cache_release(page);
2606 *pagep = NULL; 2653 *pagep = NULL;
2607 2654
2608 if (pos + len > inode->i_size) 2655 return ret;
2609 vmtruncate(inode, inode->i_size); 2656}
2657EXPORT_SYMBOL(nobh_write_begin_newtrunc);
2658
2659/*
2660 * On entry, the page is fully not uptodate.
2661 * On exit the page is fully uptodate in the areas outside (from,to)
2662 */
2663int nobh_write_begin(struct file *file, struct address_space *mapping,
2664 loff_t pos, unsigned len, unsigned flags,
2665 struct page **pagep, void **fsdata,
2666 get_block_t *get_block)
2667{
2668 int ret;
2669
2670 ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags,
2671 pagep, fsdata, get_block);
2672
2673 /*
2674 * prepare_write() may have instantiated a few blocks
2675 * outside i_size. Trim these off again. Don't need
2676 * i_size_read because we hold i_mutex.
2677 */
2678 if (unlikely(ret)) {
2679 loff_t isize = mapping->host->i_size;
2680 if (pos + len > isize)
2681 vmtruncate(mapping->host, isize);
2682 }
2610 2683
2611 return ret; 2684 return ret;
2612} 2685}
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index 9f46de2ba7a7..89490beaf537 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -1,7 +1,6 @@
1#include "ceph_debug.h" 1#include "ceph_debug.h"
2 2
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/slab.h>
5#include <linux/err.h> 4#include <linux/err.h>
6#include <linux/slab.h> 5#include <linux/slab.h>
7 6
@@ -217,8 +216,8 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
217 if (ac->protocol != protocol) { 216 if (ac->protocol != protocol) {
218 ret = ceph_auth_init_protocol(ac, protocol); 217 ret = ceph_auth_init_protocol(ac, protocol);
219 if (ret) { 218 if (ret) {
220 pr_err("error %d on auth method %s init\n", 219 pr_err("error %d on auth protocol %d init\n",
221 ret, ac->ops->name); 220 ret, protocol);
222 goto out; 221 goto out;
223 } 222 }
224 } 223 }
@@ -247,7 +246,7 @@ int ceph_build_auth(struct ceph_auth_client *ac,
247 if (!ac->protocol) 246 if (!ac->protocol)
248 return ceph_auth_build_hello(ac, msg_buf, msg_len); 247 return ceph_auth_build_hello(ac, msg_buf, msg_len);
249 BUG_ON(!ac->ops); 248 BUG_ON(!ac->ops);
250 if (!ac->ops->is_authenticated(ac)) 249 if (ac->ops->should_authenticate(ac))
251 return ceph_build_auth_request(ac, msg_buf, msg_len); 250 return ceph_build_auth_request(ac, msg_buf, msg_len);
252 return 0; 251 return 0;
253} 252}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
index 4429a707c021..d38a2fb4a137 100644
--- a/fs/ceph/auth.h
+++ b/fs/ceph/auth.h
@@ -24,6 +24,12 @@ struct ceph_auth_client_ops {
24 int (*is_authenticated)(struct ceph_auth_client *ac); 24 int (*is_authenticated)(struct ceph_auth_client *ac);
25 25
26 /* 26 /*
27 * true if we should (re)authenticate, e.g., when our tickets
28 * are getting old and crusty.
29 */
30 int (*should_authenticate)(struct ceph_auth_client *ac);
31
32 /*
27 * build requests and process replies during monitor 33 * build requests and process replies during monitor
28 * handshake. if handle_reply returns -EAGAIN, we build 34 * handshake. if handle_reply returns -EAGAIN, we build
29 * another request. 35 * another request.
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
index 24407c119291..ad1dc21286c7 100644
--- a/fs/ceph/auth_none.c
+++ b/fs/ceph/auth_none.c
@@ -31,6 +31,13 @@ static int is_authenticated(struct ceph_auth_client *ac)
31 return !xi->starting; 31 return !xi->starting;
32} 32}
33 33
34static int should_authenticate(struct ceph_auth_client *ac)
35{
36 struct ceph_auth_none_info *xi = ac->private;
37
38 return xi->starting;
39}
40
34/* 41/*
35 * the generic auth code decodes the global_id, and we carry no actual 42 * the generic auth code decodes the global_id, and we carry no actual
36 * authentication state, so nothing happens here. 43 * authentication state, so nothing happens here.
@@ -98,6 +105,7 @@ static const struct ceph_auth_client_ops ceph_auth_none_ops = {
98 .reset = reset, 105 .reset = reset,
99 .destroy = destroy, 106 .destroy = destroy,
100 .is_authenticated = is_authenticated, 107 .is_authenticated = is_authenticated,
108 .should_authenticate = should_authenticate,
101 .handle_reply = handle_reply, 109 .handle_reply = handle_reply,
102 .create_authorizer = ceph_auth_none_create_authorizer, 110 .create_authorizer = ceph_auth_none_create_authorizer,
103 .destroy_authorizer = ceph_auth_none_destroy_authorizer, 111 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 7b206231566d..83d4d2785ffe 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -27,6 +27,17 @@ static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
27 return (ac->want_keys & xi->have_keys) == ac->want_keys; 27 return (ac->want_keys & xi->have_keys) == ac->want_keys;
28} 28}
29 29
30static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
31{
32 struct ceph_x_info *xi = ac->private;
33 int need;
34
35 ceph_x_validate_tickets(ac, &need);
36 dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
37 ac->want_keys, need, xi->have_keys);
38 return need != 0;
39}
40
30static int ceph_x_encrypt_buflen(int ilen) 41static int ceph_x_encrypt_buflen(int ilen)
31{ 42{
32 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 + 43 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
@@ -620,6 +631,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
620static const struct ceph_auth_client_ops ceph_x_ops = { 631static const struct ceph_auth_client_ops ceph_x_ops = {
621 .name = "x", 632 .name = "x",
622 .is_authenticated = ceph_x_is_authenticated, 633 .is_authenticated = ceph_x_is_authenticated,
634 .should_authenticate = ceph_x_should_authenticate,
623 .build_request = ceph_x_build_request, 635 .build_request = ceph_x_build_request,
624 .handle_reply = ceph_x_handle_reply, 636 .handle_reply = ceph_x_handle_reply,
625 .create_authorizer = ceph_x_create_authorizer, 637 .create_authorizer = ceph_x_create_authorizer,
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 0dd0b81e64f7..619b61655ee5 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -981,6 +981,46 @@ static int send_cap_msg(struct ceph_mds_session *session,
981 return 0; 981 return 0;
982} 982}
983 983
984static void __queue_cap_release(struct ceph_mds_session *session,
985 u64 ino, u64 cap_id, u32 migrate_seq,
986 u32 issue_seq)
987{
988 struct ceph_msg *msg;
989 struct ceph_mds_cap_release *head;
990 struct ceph_mds_cap_item *item;
991
992 spin_lock(&session->s_cap_lock);
993 BUG_ON(!session->s_num_cap_releases);
994 msg = list_first_entry(&session->s_cap_releases,
995 struct ceph_msg, list_head);
996
997 dout(" adding %llx release to mds%d msg %p (%d left)\n",
998 ino, session->s_mds, msg, session->s_num_cap_releases);
999
1000 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1001 head = msg->front.iov_base;
1002 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1003 item = msg->front.iov_base + msg->front.iov_len;
1004 item->ino = cpu_to_le64(ino);
1005 item->cap_id = cpu_to_le64(cap_id);
1006 item->migrate_seq = cpu_to_le32(migrate_seq);
1007 item->seq = cpu_to_le32(issue_seq);
1008
1009 session->s_num_cap_releases--;
1010
1011 msg->front.iov_len += sizeof(*item);
1012 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1013 dout(" release msg %p full\n", msg);
1014 list_move_tail(&msg->list_head, &session->s_cap_releases_done);
1015 } else {
1016 dout(" release msg %p at %d/%d (%d)\n", msg,
1017 (int)le32_to_cpu(head->num),
1018 (int)CEPH_CAPS_PER_RELEASE,
1019 (int)msg->front.iov_len);
1020 }
1021 spin_unlock(&session->s_cap_lock);
1022}
1023
984/* 1024/*
985 * Queue cap releases when an inode is dropped from our cache. Since 1025 * Queue cap releases when an inode is dropped from our cache. Since
986 * inode is about to be destroyed, there is no need for i_lock. 1026 * inode is about to be destroyed, there is no need for i_lock.
@@ -994,41 +1034,9 @@ void ceph_queue_caps_release(struct inode *inode)
994 while (p) { 1034 while (p) {
995 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); 1035 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
996 struct ceph_mds_session *session = cap->session; 1036 struct ceph_mds_session *session = cap->session;
997 struct ceph_msg *msg;
998 struct ceph_mds_cap_release *head;
999 struct ceph_mds_cap_item *item;
1000 1037
1001 spin_lock(&session->s_cap_lock); 1038 __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
1002 BUG_ON(!session->s_num_cap_releases); 1039 cap->mseq, cap->issue_seq);
1003 msg = list_first_entry(&session->s_cap_releases,
1004 struct ceph_msg, list_head);
1005
1006 dout(" adding %p release to mds%d msg %p (%d left)\n",
1007 inode, session->s_mds, msg, session->s_num_cap_releases);
1008
1009 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1010 head = msg->front.iov_base;
1011 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1012 item = msg->front.iov_base + msg->front.iov_len;
1013 item->ino = cpu_to_le64(ceph_ino(inode));
1014 item->cap_id = cpu_to_le64(cap->cap_id);
1015 item->migrate_seq = cpu_to_le32(cap->mseq);
1016 item->seq = cpu_to_le32(cap->issue_seq);
1017
1018 session->s_num_cap_releases--;
1019
1020 msg->front.iov_len += sizeof(*item);
1021 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1022 dout(" release msg %p full\n", msg);
1023 list_move_tail(&msg->list_head,
1024 &session->s_cap_releases_done);
1025 } else {
1026 dout(" release msg %p at %d/%d (%d)\n", msg,
1027 (int)le32_to_cpu(head->num),
1028 (int)CEPH_CAPS_PER_RELEASE,
1029 (int)msg->front.iov_len);
1030 }
1031 spin_unlock(&session->s_cap_lock);
1032 p = rb_next(p); 1040 p = rb_next(p);
1033 __ceph_remove_cap(cap); 1041 __ceph_remove_cap(cap);
1034 } 1042 }
@@ -1776,9 +1784,9 @@ out:
1776 spin_unlock(&ci->i_unsafe_lock); 1784 spin_unlock(&ci->i_unsafe_lock);
1777} 1785}
1778 1786
1779int ceph_fsync(struct file *file, struct dentry *dentry, int datasync) 1787int ceph_fsync(struct file *file, int datasync)
1780{ 1788{
1781 struct inode *inode = dentry->d_inode; 1789 struct inode *inode = file->f_mapping->host;
1782 struct ceph_inode_info *ci = ceph_inode(inode); 1790 struct ceph_inode_info *ci = ceph_inode(inode);
1783 unsigned flush_tid; 1791 unsigned flush_tid;
1784 int ret; 1792 int ret;
@@ -2655,7 +2663,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2655 struct ceph_mds_caps *h; 2663 struct ceph_mds_caps *h;
2656 int mds = session->s_mds; 2664 int mds = session->s_mds;
2657 int op; 2665 int op;
2658 u32 seq; 2666 u32 seq, mseq;
2659 struct ceph_vino vino; 2667 struct ceph_vino vino;
2660 u64 cap_id; 2668 u64 cap_id;
2661 u64 size, max_size; 2669 u64 size, max_size;
@@ -2675,6 +2683,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2675 vino.snap = CEPH_NOSNAP; 2683 vino.snap = CEPH_NOSNAP;
2676 cap_id = le64_to_cpu(h->cap_id); 2684 cap_id = le64_to_cpu(h->cap_id);
2677 seq = le32_to_cpu(h->seq); 2685 seq = le32_to_cpu(h->seq);
2686 mseq = le32_to_cpu(h->migrate_seq);
2678 size = le64_to_cpu(h->size); 2687 size = le64_to_cpu(h->size);
2679 max_size = le64_to_cpu(h->max_size); 2688 max_size = le64_to_cpu(h->max_size);
2680 2689
@@ -2689,6 +2698,18 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2689 vino.snap, inode); 2698 vino.snap, inode);
2690 if (!inode) { 2699 if (!inode) {
2691 dout(" i don't have ino %llx\n", vino.ino); 2700 dout(" i don't have ino %llx\n", vino.ino);
2701
2702 if (op == CEPH_CAP_OP_IMPORT)
2703 __queue_cap_release(session, vino.ino, cap_id,
2704 mseq, seq);
2705
2706 /*
2707 * send any full release message to try to move things
2708 * along for the mds (who clearly thinks we still have this
2709 * cap).
2710 */
2711 ceph_add_cap_releases(mdsc, session, -1);
2712 ceph_send_cap_releases(mdsc, session);
2692 goto done; 2713 goto done;
2693 } 2714 }
2694 2715
@@ -2714,7 +2735,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2714 spin_lock(&inode->i_lock); 2735 spin_lock(&inode->i_lock);
2715 cap = __get_cap_for_mds(ceph_inode(inode), mds); 2736 cap = __get_cap_for_mds(ceph_inode(inode), mds);
2716 if (!cap) { 2737 if (!cap) {
2717 dout("no cap on %p ino %llx.%llx from mds%d, releasing\n", 2738 dout(" no cap on %p ino %llx.%llx from mds%d\n",
2718 inode, ceph_ino(inode), ceph_snap(inode), mds); 2739 inode, ceph_ino(inode), ceph_snap(inode), mds);
2719 spin_unlock(&inode->i_lock); 2740 spin_unlock(&inode->i_lock);
2720 goto done; 2741 goto done;
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 3b9eeed097b3..2fa992eaf7da 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -265,16 +265,17 @@ extern const char *ceph_mds_state_name(int s);
265 * - they also define the lock ordering by the MDS 265 * - they also define the lock ordering by the MDS
266 * - a few of these are internal to the mds 266 * - a few of these are internal to the mds
267 */ 267 */
268#define CEPH_LOCK_DN 1 268#define CEPH_LOCK_DVERSION 1
269#define CEPH_LOCK_ISNAP 2 269#define CEPH_LOCK_DN 2
270#define CEPH_LOCK_IVERSION 4 /* mds internal */ 270#define CEPH_LOCK_ISNAP 16
271#define CEPH_LOCK_IFILE 8 /* mds internal */ 271#define CEPH_LOCK_IVERSION 32 /* mds internal */
272#define CEPH_LOCK_IAUTH 32 272#define CEPH_LOCK_IFILE 64
273#define CEPH_LOCK_ILINK 64 273#define CEPH_LOCK_IAUTH 128
274#define CEPH_LOCK_IDFT 128 /* dir frag tree */ 274#define CEPH_LOCK_ILINK 256
275#define CEPH_LOCK_INEST 256 /* mds internal */ 275#define CEPH_LOCK_IDFT 512 /* dir frag tree */
276#define CEPH_LOCK_IXATTR 512 276#define CEPH_LOCK_INEST 1024 /* mds internal */
277#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */ 277#define CEPH_LOCK_IXATTR 2048
278#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
278 279
279/* client_session ops */ 280/* client_session ops */
280enum { 281enum {
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 4fd30900eff7..f85719310db2 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -587,7 +587,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
587 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; 587 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
588 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); 588 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
589 if (IS_ERR(req)) 589 if (IS_ERR(req))
590 return ERR_PTR(PTR_ERR(req)); 590 return ERR_CAST(req);
591 req->r_dentry = dget(dentry); 591 req->r_dentry = dget(dentry);
592 req->r_num_caps = 2; 592 req->r_num_caps = 2;
593 /* we only need inode linkage */ 593 /* we only need inode linkage */
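
The ERR_PTR(PTR_ERR(req)) to ERR_CAST(req) conversions here and in export.c and file.c below are purely cosmetic: both forward an error pointer unchanged while converting its type, and ERR_CAST says so in one self-documenting step. Its <linux/err.h> definition is essentially:

	static inline void *ERR_CAST(const void *ptr)
	{
		/* cast away the const; the encoded errno value
		 * passes through untouched */
		return (void *)ptr;
	}
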
@@ -1107,10 +1107,9 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1107 * an fsync() on a dir will wait for any uncommitted directory 1107 * an fsync() on a dir will wait for any uncommitted directory
1108 * operations to commit. 1108 * operations to commit.
1109 */ 1109 */
1110static int ceph_dir_fsync(struct file *file, struct dentry *dentry, 1110static int ceph_dir_fsync(struct file *file, int datasync)
1111 int datasync)
1112{ 1111{
1113 struct inode *inode = dentry->d_inode; 1112 struct inode *inode = file->f_path.dentry->d_inode;
1114 struct ceph_inode_info *ci = ceph_inode(inode); 1113 struct ceph_inode_info *ci = ceph_inode(inode);
1115 struct list_head *head = &ci->i_unsafe_dirops; 1114 struct list_head *head = &ci->i_unsafe_dirops;
1116 struct ceph_mds_request *req; 1115 struct ceph_mds_request *req;
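
ceph_fsync() above and ceph_dir_fsync() here track the VFS prototype change in this merge window that drops the dentry argument from ->fsync; the inode is taken from the struct file instead. Any implementation converts the same way; a minimal hypothetical sketch:

	static int myfs_fsync(struct file *file, int datasync)
	{
		struct inode *inode = file->f_mapping->host;

		/* ... write back and wait on this inode's data, and
		 * its metadata too unless datasync says otherwise ... */
		return 0;
	}
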
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 17447644d675..4480cb1c63e7 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -133,7 +133,7 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH, 133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
134 USE_ANY_MDS); 134 USE_ANY_MDS);
135 if (IS_ERR(req)) 135 if (IS_ERR(req))
136 return ERR_PTR(PTR_ERR(req)); 136 return ERR_CAST(req);
137 137
138 req->r_ino1 = vino; 138 req->r_ino1 = vino;
139 req->r_ino2.ino = cfh->parent_ino; 139 req->r_ino2.ino = cfh->parent_ino;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 6512b6701b9e..6251a1574b94 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -230,7 +230,7 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
230 /* do the open */ 230 /* do the open */
231 req = prepare_open_request(dir->i_sb, flags, mode); 231 req = prepare_open_request(dir->i_sb, flags, mode);
232 if (IS_ERR(req)) 232 if (IS_ERR(req))
233 return ERR_PTR(PTR_ERR(req)); 233 return ERR_CAST(req);
234 req->r_dentry = dget(dentry); 234 req->r_dentry = dget(dentry);
235 req->r_num_caps = 2; 235 req->r_num_caps = 2;
236 if (flags & O_CREAT) { 236 if (flags & O_CREAT) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index a81b8b662c7b..ab47f46ca282 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -69,7 +69,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
69 69
70 BUG_ON(!S_ISDIR(parent->i_mode)); 70 BUG_ON(!S_ISDIR(parent->i_mode));
71 if (IS_ERR(inode)) 71 if (IS_ERR(inode))
72 return ERR_PTR(PTR_ERR(inode)); 72 return inode;
73 inode->i_mode = parent->i_mode; 73 inode->i_mode = parent->i_mode;
74 inode->i_uid = parent->i_uid; 74 inode->i_uid = parent->i_uid;
75 inode->i_gid = parent->i_gid; 75 inode->i_gid = parent->i_gid;
@@ -827,7 +827,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
827 827
828 spin_lock(&dcache_lock); 828 spin_lock(&dcache_lock);
829 spin_lock(&dn->d_lock); 829 spin_lock(&dn->d_lock);
830 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child); 830 list_move(&dn->d_u.d_child, &dir->d_subdirs);
831 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, 831 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
832 dn->d_u.d_child.prev, dn->d_u.d_child.next); 832 dn->d_u.d_child.prev, dn->d_u.d_child.next);
833 spin_unlock(&dn->d_lock); 833 spin_unlock(&dn->d_lock);
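
The ceph_set_dentry_offset() fix is a reminder of the list_move() argument order: list_move(entry, head) re-queues entry at the front of head, and list_move_tail() at the back. The old call passed the parent's d_subdirs list head as the entry, splicing the head itself onto one child's sibling linkage:

	/* fixed form: the dentry is the entry, d_subdirs is the head */
	list_move(&dn->d_u.d_child, &dir->d_subdirs);

	/* buggy form (removed): arguments effectively reversed */
	/* list_move_tail(&dir->d_subdirs, &dn->d_u.d_child); */
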
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 885aa5710cfd..1766947fc07a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1066,9 +1066,9 @@ static int trim_caps(struct ceph_mds_client *mdsc,
1066 * 1066 *
1067 * Called under s_mutex. 1067 * Called under s_mutex.
1068 */ 1068 */
1069static int add_cap_releases(struct ceph_mds_client *mdsc, 1069int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1070 struct ceph_mds_session *session, 1070 struct ceph_mds_session *session,
1071 int extra) 1071 int extra)
1072{ 1072{
1073 struct ceph_msg *msg; 1073 struct ceph_msg *msg;
1074 struct ceph_mds_cap_release *head; 1074 struct ceph_mds_cap_release *head;
@@ -1176,8 +1176,8 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1176/* 1176/*
1177 * called under s_mutex 1177 * called under s_mutex
1178 */ 1178 */
1179static void send_cap_releases(struct ceph_mds_client *mdsc, 1179void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
1180 struct ceph_mds_session *session) 1180 struct ceph_mds_session *session)
1181{ 1181{
1182 struct ceph_msg *msg; 1182 struct ceph_msg *msg;
1183 1183
@@ -1768,12 +1768,12 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1768 mutex_unlock(&mdsc->mutex); 1768 mutex_unlock(&mdsc->mutex);
1769 dout("do_request waiting\n"); 1769 dout("do_request waiting\n");
1770 if (req->r_timeout) { 1770 if (req->r_timeout) {
1771 err = (long)wait_for_completion_interruptible_timeout( 1771 err = (long)wait_for_completion_killable_timeout(
1772 &req->r_completion, req->r_timeout); 1772 &req->r_completion, req->r_timeout);
1773 if (err == 0) 1773 if (err == 0)
1774 err = -EIO; 1774 err = -EIO;
1775 } else { 1775 } else {
1776 err = wait_for_completion_interruptible(&req->r_completion); 1776 err = wait_for_completion_killable(&req->r_completion);
1777 } 1777 }
1778 dout("do_request waited, got %d\n", err); 1778 dout("do_request waited, got %d\n", err);
1779 mutex_lock(&mdsc->mutex); 1779 mutex_lock(&mdsc->mutex);
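
The switch from interruptible to killable waits means an ordinary signal no longer aborts an in-flight MDS request mid-protocol; only a fatal signal (SIGKILL) breaks the wait. The return conventions are otherwise the same, sketched here with a hypothetical struct completion "done":

	if (timeout) {
		long t = wait_for_completion_killable_timeout(&done, timeout);
		if (t == 0)
			err = -EIO;	/* timed out */
		else if (t < 0)
			err = t;	/* -ERESTARTSYS: fatally signalled */
		else
			err = 0;	/* completed, t jiffies remained */
	} else {
		err = wait_for_completion_killable(&done);
	}
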
@@ -1980,7 +1980,7 @@ out_err:
1980 } 1980 }
1981 mutex_unlock(&mdsc->mutex); 1981 mutex_unlock(&mdsc->mutex);
1982 1982
1983 add_cap_releases(mdsc, req->r_session, -1); 1983 ceph_add_cap_releases(mdsc, req->r_session, -1);
1984 mutex_unlock(&session->s_mutex); 1984 mutex_unlock(&session->s_mutex);
1985 1985
1986 /* kick calling process */ 1986 /* kick calling process */
@@ -2014,16 +2014,21 @@ static void handle_forward(struct ceph_mds_client *mdsc,
2014 mutex_lock(&mdsc->mutex); 2014 mutex_lock(&mdsc->mutex);
2015 req = __lookup_request(mdsc, tid); 2015 req = __lookup_request(mdsc, tid);
2016 if (!req) { 2016 if (!req) {
2017 dout("forward %llu to mds%d - req dne\n", tid, next_mds); 2017 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
2018 goto out; /* dup reply? */ 2018 goto out; /* dup reply? */
2019 } 2019 }
2020 2020
2021 if (fwd_seq <= req->r_num_fwd) { 2021 if (req->r_aborted) {
2022 dout("forward %llu to mds%d - old seq %d <= %d\n", 2022 dout("forward tid %llu aborted, unregistering\n", tid);
2023 __unregister_request(mdsc, req);
2024 } else if (fwd_seq <= req->r_num_fwd) {
2025 dout("forward tid %llu to mds%d - old seq %d <= %d\n",
2023 tid, next_mds, req->r_num_fwd, fwd_seq); 2026 tid, next_mds, req->r_num_fwd, fwd_seq);
2024 } else { 2027 } else {
2025 /* resend. forward race not possible; mds would drop */ 2028 /* resend. forward race not possible; mds would drop */
2026 dout("forward %llu to mds%d (we resend)\n", tid, next_mds); 2029 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
2030 BUG_ON(req->r_err);
2031 BUG_ON(req->r_got_result);
2027 req->r_num_fwd = fwd_seq; 2032 req->r_num_fwd = fwd_seq;
2028 req->r_resend_mds = next_mds; 2033 req->r_resend_mds = next_mds;
2029 put_request_session(req); 2034 put_request_session(req);
@@ -2428,6 +2433,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2428 struct ceph_dentry_info *di; 2433 struct ceph_dentry_info *di;
2429 int mds = session->s_mds; 2434 int mds = session->s_mds;
2430 struct ceph_mds_lease *h = msg->front.iov_base; 2435 struct ceph_mds_lease *h = msg->front.iov_base;
2436 u32 seq;
2431 struct ceph_vino vino; 2437 struct ceph_vino vino;
2432 int mask; 2438 int mask;
2433 struct qstr dname; 2439 struct qstr dname;
@@ -2441,6 +2447,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2441 vino.ino = le64_to_cpu(h->ino); 2447 vino.ino = le64_to_cpu(h->ino);
2442 vino.snap = CEPH_NOSNAP; 2448 vino.snap = CEPH_NOSNAP;
2443 mask = le16_to_cpu(h->mask); 2449 mask = le16_to_cpu(h->mask);
2450 seq = le32_to_cpu(h->seq);
2444 dname.name = (void *)h + sizeof(*h) + sizeof(u32); 2451 dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2445 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); 2452 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
2446 if (dname.len != get_unaligned_le32(h+1)) 2453 if (dname.len != get_unaligned_le32(h+1))
@@ -2451,8 +2458,9 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2451 2458
2452 /* lookup inode */ 2459 /* lookup inode */
2453 inode = ceph_find_inode(sb, vino); 2460 inode = ceph_find_inode(sb, vino);
2454 dout("handle_lease '%s', mask %d, ino %llx %p\n", 2461 dout("handle_lease %s, mask %d, ino %llx %p %.*s\n",
2455 ceph_lease_op_name(h->action), mask, vino.ino, inode); 2462 ceph_lease_op_name(h->action), mask, vino.ino, inode,
2463 dname.len, dname.name);
2456 if (inode == NULL) { 2464 if (inode == NULL) {
2457 dout("handle_lease no inode %llx\n", vino.ino); 2465 dout("handle_lease no inode %llx\n", vino.ino);
2458 goto release; 2466 goto release;
@@ -2477,7 +2485,8 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2477 switch (h->action) { 2485 switch (h->action) {
2478 case CEPH_MDS_LEASE_REVOKE: 2486 case CEPH_MDS_LEASE_REVOKE:
2479 if (di && di->lease_session == session) { 2487 if (di && di->lease_session == session) {
2480 h->seq = cpu_to_le32(di->lease_seq); 2488 if (ceph_seq_cmp(di->lease_seq, seq) > 0)
2489 h->seq = cpu_to_le32(di->lease_seq);
2481 __ceph_mdsc_drop_dentry_lease(dentry); 2490 __ceph_mdsc_drop_dentry_lease(dentry);
2482 } 2491 }
2483 release = 1; 2492 release = 1;
@@ -2491,7 +2500,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2491 unsigned long duration = 2500 unsigned long duration =
2492 le32_to_cpu(h->duration_ms) * HZ / 1000; 2501 le32_to_cpu(h->duration_ms) * HZ / 1000;
2493 2502
2494 di->lease_seq = le32_to_cpu(h->seq); 2503 di->lease_seq = seq;
2495 dentry->d_time = di->lease_renew_from + duration; 2504 dentry->d_time = di->lease_renew_from + duration;
2496 di->lease_renew_after = di->lease_renew_from + 2505 di->lease_renew_after = di->lease_renew_from +
2497 (duration >> 1); 2506 (duration >> 1);
@@ -2541,7 +2550,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2541 return; 2550 return;
2542 lease = msg->front.iov_base; 2551 lease = msg->front.iov_base;
2543 lease->action = action; 2552 lease->action = action;
2544 lease->mask = cpu_to_le16(CEPH_LOCK_DN); 2553 lease->mask = cpu_to_le16(1);
2545 lease->ino = cpu_to_le64(ceph_vino(inode).ino); 2554 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2546 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); 2555 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2547 lease->seq = cpu_to_le32(seq); 2556 lease->seq = cpu_to_le32(seq);
@@ -2571,7 +2580,7 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2571 2580
2572 BUG_ON(inode == NULL); 2581 BUG_ON(inode == NULL);
2573 BUG_ON(dentry == NULL); 2582 BUG_ON(dentry == NULL);
2574 BUG_ON(mask != CEPH_LOCK_DN); 2583 BUG_ON(mask == 0);
2575 2584
2576 /* is dentry lease valid? */ 2585 /* is dentry lease valid? */
2577 spin_lock(&dentry->d_lock); 2586 spin_lock(&dentry->d_lock);
@@ -2681,10 +2690,10 @@ static void delayed_work(struct work_struct *work)
2681 send_renew_caps(mdsc, s); 2690 send_renew_caps(mdsc, s);
2682 else 2691 else
2683 ceph_con_keepalive(&s->s_con); 2692 ceph_con_keepalive(&s->s_con);
2684 add_cap_releases(mdsc, s, -1); 2693 ceph_add_cap_releases(mdsc, s, -1);
2685 if (s->s_state == CEPH_MDS_SESSION_OPEN || 2694 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2686 s->s_state == CEPH_MDS_SESSION_HUNG) 2695 s->s_state == CEPH_MDS_SESSION_HUNG)
2687 send_cap_releases(mdsc, s); 2696 ceph_send_cap_releases(mdsc, s);
2688 mutex_unlock(&s->s_mutex); 2697 mutex_unlock(&s->s_mutex);
2689 ceph_put_mds_session(s); 2698 ceph_put_mds_session(s);
2690 2699
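
The CEPH_MDS_LEASE_REVOKE change above stops the client from blindly echoing its cached lease_seq back to the MDS: h->seq is now overwritten only when the locally cached seq is strictly newer. A minimal userspace sketch of the wraparound-safe comparison this relies on, assuming ceph_seq_cmp() follows the usual serial-number idiom (signed difference of u32 values); the name seq_cmp below is illustrative, not the kernel's:

	#include <stdio.h>
	#include <stdint.h>

	/* Signed difference of two u32 sequence numbers: positive when a is
	 * newer than b, even across 2^32 wraparound. */
	static int seq_cmp(uint32_t a, uint32_t b)
	{
		return (int32_t)(a - b);
	}

	int main(void)
	{
		printf("%d\n", seq_cmp(5, 3) > 0);           /* 1: plainly newer */
		printf("%d\n", seq_cmp(2, 0xfffffffeu) > 0); /* 1: newer despite wrap */
		return 0;
	}
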
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index d9936c4f1212..b292fa42a66d 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -322,6 +322,12 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
322 kref_put(&req->r_kref, ceph_mdsc_release_request); 322 kref_put(&req->r_kref, ceph_mdsc_release_request);
323} 323}
324 324
325extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
326 struct ceph_mds_session *session,
327 int extra);
328extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
329 struct ceph_mds_session *session);
330
325extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); 331extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
326 332
327extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, 333extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 60b74839ebec..64b8b1f7863d 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -120,6 +120,12 @@ void ceph_msgr_exit(void)
120 destroy_workqueue(ceph_msgr_wq); 120 destroy_workqueue(ceph_msgr_wq);
121} 121}
122 122
123void ceph_msgr_flush(void)
124{
125 flush_workqueue(ceph_msgr_wq);
126}
127
128
123/* 129/*
124 * socket callback functions 130 * socket callback functions
125 */ 131 */
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index 00a9430b1ffc..76fbc957bc13 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -213,6 +213,7 @@ extern int ceph_parse_ips(const char *c, const char *end,
213 213
214extern int ceph_msgr_init(void); 214extern int ceph_msgr_init(void);
215extern void ceph_msgr_exit(void); 215extern void ceph_msgr_exit(void);
216extern void ceph_msgr_flush(void);
216 217
217extern struct ceph_messenger *ceph_messenger_create( 218extern struct ceph_messenger *ceph_messenger_create(
218 struct ceph_entity_addr *myaddr); 219 struct ceph_entity_addr *myaddr);
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index f6510a476e7e..07a539906e67 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -400,6 +400,8 @@ static void release_generic_request(struct kref *kref)
400 ceph_msg_put(req->reply); 400 ceph_msg_put(req->reply);
401 if (req->request) 401 if (req->request)
402 ceph_msg_put(req->request); 402 ceph_msg_put(req->request);
403
404 kfree(req);
403} 405}
404 406
405static void put_generic_request(struct ceph_mon_generic_request *req) 407static void put_generic_request(struct ceph_mon_generic_request *req)
@@ -704,8 +706,11 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
704 struct ceph_msg *msg) 706 struct ceph_msg *msg)
705{ 707{
706 int ret; 708 int ret;
709 int was_auth = 0;
707 710
708 mutex_lock(&monc->mutex); 711 mutex_lock(&monc->mutex);
712 if (monc->auth->ops)
713 was_auth = monc->auth->ops->is_authenticated(monc->auth);
709 monc->pending_auth = 0; 714 monc->pending_auth = 0;
710 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, 715 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
711 msg->front.iov_len, 716 msg->front.iov_len,
@@ -716,7 +721,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
716 wake_up(&monc->client->auth_wq); 721 wake_up(&monc->client->auth_wq);
717 } else if (ret > 0) { 722 } else if (ret > 0) {
718 __send_prepared_auth_request(monc, ret); 723 __send_prepared_auth_request(monc, ret);
719 } else if (monc->auth->ops->is_authenticated(monc->auth)) { 724 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
720 dout("authenticated, starting session\n"); 725 dout("authenticated, starting session\n");
721 726
722 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; 727 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
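
The was_auth change above makes the "authenticated, starting session" path edge-triggered: the state is sampled before ceph_handle_auth_reply() updates it, so session setup runs only on the transition into the authenticated state, not on every reply that arrives while already authenticated. A small runnable sketch of the same pattern (all names here are illustrative, not the kernel's):

	#include <stdbool.h>
	#include <stdio.h>

	static bool authed;

	static void handle_reply(bool now_authed)
	{
		bool was_auth = authed;   /* sample state before the update */

		authed = now_authed;
		if (!was_auth && now_authed)
			printf("authenticated, starting session\n"); /* runs once */
	}

	int main(void)
	{
		handle_reply(true);   /* prints */
		handle_reply(true);   /* silent: already authenticated */
		return 0;
	}
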
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index afa7bb3895c4..d25b4add85b4 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -361,8 +361,13 @@ static void put_osd(struct ceph_osd *osd)
361{ 361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), 362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1); 363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref)) 364 if (atomic_dec_and_test(&osd->o_ref)) {
365 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
366
367 if (osd->o_authorizer)
368 ac->ops->destroy_authorizer(ac, osd->o_authorizer);
365 kfree(osd); 369 kfree(osd);
370 }
366} 371}
367 372
368/* 373/*
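
The put_osd() fix follows the standard drop-to-zero rule: whichever caller takes the refcount to zero owns the teardown, and dependent resources (here the authorizer) are released before the object itself is freed. A compilable userspace sketch of the shape, with atomic_fetch_sub standing in for the kernel's atomic_dec_and_test and stand-in field names:

	#include <stdatomic.h>
	#include <stdlib.h>

	struct osd {
		atomic_int o_ref;
		void *o_authorizer;   /* stands in for the auth handle */
	};

	static void put_osd(struct osd *osd)
	{
		/* fetch_sub returns the old value; 1 means we dropped the last ref */
		if (atomic_fetch_sub(&osd->o_ref, 1) == 1) {
			free(osd->o_authorizer);   /* dependents first */
			free(osd);
		}
	}

	int main(void)
	{
		struct osd *osd = calloc(1, sizeof(*osd));
		atomic_init(&osd->o_ref, 1);
		osd->o_authorizer = malloc(16);
		put_osd(osd);   /* last ref: frees authorizer, then osd */
		return 0;
	}
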
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index cfdd8f4388b7..ddc656fb5c05 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -706,7 +706,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
706 len, *p, end); 706 len, *p, end);
707 newcrush = crush_decode(*p, min(*p+len, end)); 707 newcrush = crush_decode(*p, min(*p+len, end));
708 if (IS_ERR(newcrush)) 708 if (IS_ERR(newcrush))
709 return ERR_PTR(PTR_ERR(newcrush)); 709 return ERR_CAST(newcrush);
710 } 710 }
711 711
712 /* new flags? */ 712 /* new flags? */
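
ERR_CAST() is the idiomatic spelling when an encoded error pointer merely changes pointer type: ERR_PTR(PTR_ERR(x)) decodes the error to a long and re-encodes it, while ERR_CAST(x) is a plain cast of the same bits. A self-contained sketch of the encoding, simplified from the <linux/err.h> idiom (MAX_ERRNO and the macros below are pared down for illustration):

	#include <stdio.h>

	#define MAX_ERRNO	4095
	#define ERR_PTR(err)	((void *)(long)(err))
	#define PTR_ERR(ptr)	((long)(ptr))
	#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)
	#define ERR_CAST(ptr)	((void *)(ptr))

	int main(void)
	{
		void *crush = ERR_PTR(-12);    /* e.g. -ENOMEM from a decoder */
		void *map   = ERR_CAST(crush); /* same bits, new pointer type */

		printf("%d %ld\n", IS_ERR(map), PTR_ERR(map)); /* 1 -12 */
		return 0;
	}
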
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7c663d9b9f81..fa87f51e38e1 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -89,7 +89,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
89 89
90 buf->f_files = le64_to_cpu(st.num_objects); 90 buf->f_files = le64_to_cpu(st.num_objects);
91 buf->f_ffree = -1; 91 buf->f_ffree = -1;
92 buf->f_namelen = PATH_MAX; 92 buf->f_namelen = NAME_MAX;
93 buf->f_frsize = PAGE_CACHE_SIZE; 93 buf->f_frsize = PAGE_CACHE_SIZE;
94 94
95 /* leave fsid little-endian, regardless of host endianness */ 95 /* leave fsid little-endian, regardless of host endianness */
@@ -669,9 +669,17 @@ static void ceph_destroy_client(struct ceph_client *client)
669 669
670 /* unmount */ 670 /* unmount */
671 ceph_mdsc_stop(&client->mdsc); 671 ceph_mdsc_stop(&client->mdsc);
672 ceph_monc_stop(&client->monc);
673 ceph_osdc_stop(&client->osdc); 672 ceph_osdc_stop(&client->osdc);
674 673
674 /*
675 * make sure mds and osd connections close out before destroying
676 * the auth module, which is needed to free those connections'
677 * ceph_authorizers.
678 */
679 ceph_msgr_flush();
680
681 ceph_monc_stop(&client->monc);
682
675 ceph_adjust_min_caps(-client->min_caps); 683 ceph_adjust_min_caps(-client->min_caps);
676 684
677 ceph_debugfs_client_cleanup(client); 685 ceph_debugfs_client_cleanup(client);
@@ -738,7 +746,7 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
738 dout("open_root_inode opening '%s'\n", path); 746 dout("open_root_inode opening '%s'\n", path);
739 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); 747 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
740 if (IS_ERR(req)) 748 if (IS_ERR(req))
741 return ERR_PTR(PTR_ERR(req)); 749 return ERR_CAST(req);
742 req->r_path1 = kstrdup(path, GFP_NOFS); 750 req->r_path1 = kstrdup(path, GFP_NOFS);
743 req->r_ino1.ino = CEPH_INO_ROOT; 751 req->r_ino1.ino = CEPH_INO_ROOT;
744 req->r_ino1.snap = CEPH_NOSNAP; 752 req->r_ino1.snap = CEPH_NOSNAP;
@@ -918,7 +926,7 @@ static int ceph_compare_super(struct super_block *sb, void *data)
918/* 926/*
919 * construct our own bdi so we can control readahead, etc. 927 * construct our own bdi so we can control readahead, etc.
920 */ 928 */
921static atomic_long_t bdi_seq = ATOMIC_INIT(0); 929static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
922 930
923static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) 931static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
924{ 932{
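
The reordering in ceph_destroy_client() encodes a dependency rule worth spelling out: stop the producers of connection work (mds, osd), drain the messenger workqueue so nothing in flight still needs the auth module, and only then stop the monitor client that owns it. A stubbed, compilable sketch of that ordering (every name below is an illustrative stand-in):

	#include <stdio.h>

	static void mdsc_stop(void)  { puts("mds client stopped"); }
	static void osdc_stop(void)  { puts("osd client stopped"); }
	static void msgr_flush(void) { puts("messenger workqueue drained"); }
	static void monc_stop(void)  { puts("mon client + auth destroyed"); }

	int main(void)
	{
		mdsc_stop();
		osdc_stop();
		msgr_flush(); /* barrier: in-flight con work may use authorizers */
		monc_stop();  /* safe only after the flush */
		return 0;
	}
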
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 3725c9ee9d08..10a4a406e887 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -10,7 +10,6 @@
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/mempool.h> 11#include <linux/mempool.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/slab.h>
14#include <linux/wait.h> 13#include <linux/wait.h>
15#include <linux/writeback.h> 14#include <linux/writeback.h>
16#include <linux/slab.h> 15#include <linux/slab.h>
@@ -811,7 +810,7 @@ extern void ceph_put_cap(struct ceph_cap *cap);
811 810
812extern void ceph_queue_caps_release(struct inode *inode); 811extern void ceph_queue_caps_release(struct inode *inode);
813extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); 812extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
814extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync); 813extern int ceph_fsync(struct file *file, int datasync);
815extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, 814extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
816 struct ceph_mds_session *session); 815 struct ceph_mds_session *session);
817extern int ceph_get_cap_mds(struct inode *inode); 816extern int ceph_get_cap_mds(struct inode *inode);
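
The ceph_fsync prototype change here is part of a tree-wide conversion visible throughout this merge (cifs, coda, ecryptfs, exofs, ext2 below): ->fsync loses its dentry argument, and implementations derive what they need from the file instead. A minimal sketch of the adaptation, using stand-in types rather than the real VFS structs:

	#include <stdio.h>

	struct inode { unsigned long i_ino; };
	struct file  { struct inode *host; };  /* stands in for f_mapping->host */

	/* old: int my_fsync(struct file *file, struct dentry *dentry, int datasync) */
	static int my_fsync(struct file *file, int datasync)
	{
		struct inode *inode = file->host;  /* was dentry->d_inode */

		printf("fsync ino %lu datasync=%d\n", inode->i_ino, datasync);
		return 0;
	}

	int main(void)
	{
		struct inode ino = { .i_ino = 42 };
		struct file f = { .host = &ino };
		return my_fsync(&f, 1);
	}
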
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 0242ff9cbf41..a7eb65c84b1c 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -84,7 +84,7 @@ extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
84extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, 84extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
85 size_t write_size, loff_t *poffset); 85 size_t write_size, loff_t *poffset);
86extern int cifs_lock(struct file *, int, struct file_lock *); 86extern int cifs_lock(struct file *, int, struct file_lock *);
87extern int cifs_fsync(struct file *, struct dentry *, int); 87extern int cifs_fsync(struct file *, int);
88extern int cifs_flush(struct file *, fl_owner_t id); 88extern int cifs_flush(struct file *, fl_owner_t id);
89extern int cifs_file_mmap(struct file * , struct vm_area_struct *); 89extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
90extern const struct file_operations cifs_dir_ops; 90extern const struct file_operations cifs_dir_ops;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index a83541ec9713..75541af4b3db 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1676,7 +1676,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
1676 return rc; 1676 return rc;
1677} 1677}
1678 1678
1679int cifs_fsync(struct file *file, struct dentry *dentry, int datasync) 1679int cifs_fsync(struct file *file, int datasync)
1680{ 1680{
1681 int xid; 1681 int xid;
1682 int rc = 0; 1682 int rc = 0;
@@ -1688,7 +1688,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1688 xid = GetXid(); 1688 xid = GetXid();
1689 1689
1690 cFYI(1, "Sync file - name: %s datasync: 0x%x", 1690 cFYI(1, "Sync file - name: %s datasync: 0x%x",
1691 dentry->d_name.name, datasync); 1691 file->f_path.dentry->d_name.name, datasync);
1692 1692
1693 rc = filemap_write_and_wait(inode->i_mapping); 1693 rc = filemap_write_and_wait(inode->i_mapping);
1694 if (rc == 0) { 1694 if (rc == 0) {
@@ -1952,6 +1952,7 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
1952 bytes_read -= PAGE_CACHE_SIZE; 1952 bytes_read -= PAGE_CACHE_SIZE;
1953 continue; 1953 continue;
1954 } 1954 }
1955 page_cache_release(page);
1955 1956
1956 target = kmap_atomic(page, KM_USER0); 1957 target = kmap_atomic(page, KM_USER0);
1957 1958
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index d99860a33890..6b443ff43a19 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -11,8 +11,7 @@ extern int coda_fake_statfs;
11 11
12void coda_destroy_inodecache(void); 12void coda_destroy_inodecache(void);
13int coda_init_inodecache(void); 13int coda_init_inodecache(void);
14int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, 14int coda_fsync(struct file *coda_file, int datasync);
15 int datasync);
16void coda_sysctl_init(void); 15void coda_sysctl_init(void);
17void coda_sysctl_clean(void); 16void coda_sysctl_clean(void);
18 17
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 7196077b1688..ad3cd2abeeb4 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -202,10 +202,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
202 return 0; 202 return 0;
203} 203}
204 204
205int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) 205int coda_fsync(struct file *coda_file, int datasync)
206{ 206{
207 struct file *host_file; 207 struct file *host_file;
208 struct inode *coda_inode = coda_dentry->d_inode; 208 struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
209 struct coda_file_info *cfi; 209 struct coda_file_info *cfi;
210 int err = 0; 210 int err = 0;
211 211
diff --git a/fs/compat.c b/fs/compat.c
index 05448730f840..6490d2134ff3 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -568,6 +568,79 @@ out:
568 return ret; 568 return ret;
569} 569}
570 570
571/* A write operation does a read from user space and vice versa */
572#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
573
574ssize_t compat_rw_copy_check_uvector(int type,
575 const struct compat_iovec __user *uvector, unsigned long nr_segs,
576 unsigned long fast_segs, struct iovec *fast_pointer,
577 struct iovec **ret_pointer)
578{
579 compat_ssize_t tot_len;
580 struct iovec *iov = *ret_pointer = fast_pointer;
581 ssize_t ret = 0;
582 int seg;
583
584 /*
585 * SuS says "The readv() function *may* fail if the iovcnt argument
586 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
587 * traditionally returned zero for zero segments, so...
588 */
589 if (nr_segs == 0)
590 goto out;
591
592 ret = -EINVAL;
593 if (nr_segs > UIO_MAXIOV || nr_segs < 0)
594 goto out;
595 if (nr_segs > fast_segs) {
596 ret = -ENOMEM;
597 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
598 if (iov == NULL) {
599 *ret_pointer = fast_pointer;
600 goto out;
601 }
602 }
603 *ret_pointer = iov;
604
605 /*
606 * Single unix specification:
607 * We should -EINVAL if an element length is not >= 0 and fitting an
608 * ssize_t. The total length is fitting an ssize_t
609 *
610 * Be careful here because iov_len is a size_t not an ssize_t
611 */
612 tot_len = 0;
613 ret = -EINVAL;
614 for (seg = 0; seg < nr_segs; seg++) {
615 compat_ssize_t tmp = tot_len;
616 compat_uptr_t buf;
617 compat_ssize_t len;
618
619 if (__get_user(len, &uvector->iov_len) ||
620 __get_user(buf, &uvector->iov_base)) {
621 ret = -EFAULT;
622 goto out;
623 }
624 if (len < 0) /* size_t not fitting in compat_ssize_t .. */
625 goto out;
626 tot_len += len;
627 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
628 goto out;
629 if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
630 ret = -EFAULT;
631 goto out;
632 }
633 iov->iov_base = compat_ptr(buf);
634 iov->iov_len = (compat_size_t) len;
635 uvector++;
636 iov++;
637 }
638 ret = tot_len;
639
640out:
641 return ret;
642}
643
571static inline long 644static inline long
572copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) 645copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
573{ 646{
@@ -600,7 +673,7 @@ compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
600 iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); 673 iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
601 ret = copy_iocb(nr, iocb, iocb64); 674 ret = copy_iocb(nr, iocb, iocb64);
602 if (!ret) 675 if (!ret)
603 ret = sys_io_submit(ctx_id, nr, iocb64); 676 ret = do_io_submit(ctx_id, nr, iocb64, 1);
604 return ret; 677 return ret;
605} 678}
606 679
@@ -1077,70 +1150,21 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1077{ 1150{
1078 compat_ssize_t tot_len; 1151 compat_ssize_t tot_len;
1079 struct iovec iovstack[UIO_FASTIOV]; 1152 struct iovec iovstack[UIO_FASTIOV];
1080 struct iovec *iov=iovstack, *vector; 1153 struct iovec *iov;
1081 ssize_t ret; 1154 ssize_t ret;
1082 int seg;
1083 io_fn_t fn; 1155 io_fn_t fn;
1084 iov_fn_t fnv; 1156 iov_fn_t fnv;
1085 1157
1086 /*
1087 * SuS says "The readv() function *may* fail if the iovcnt argument
1088 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
1089 * traditionally returned zero for zero segments, so...
1090 */
1091 ret = 0;
1092 if (nr_segs == 0)
1093 goto out;
1094
1095 /*
1096 * First get the "struct iovec" from user memory and
1097 * verify all the pointers
1098 */
1099 ret = -EINVAL; 1158 ret = -EINVAL;
1100 if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0))
1101 goto out;
1102 if (!file->f_op) 1159 if (!file->f_op)
1103 goto out; 1160 goto out;
1104 if (nr_segs > UIO_FASTIOV) { 1161
1105 ret = -ENOMEM;
1106 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
1107 if (!iov)
1108 goto out;
1109 }
1110 ret = -EFAULT; 1162 ret = -EFAULT;
1111 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) 1163 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
1112 goto out; 1164 goto out;
1113 1165
1114 /* 1166 tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
1115 * Single unix specification: 1167 UIO_FASTIOV, iovstack, &iov);
1116 * We should -EINVAL if an element length is not >= 0 and fitting an
1117 * ssize_t. The total length is fitting an ssize_t
1118 *
1119 * Be careful here because iov_len is a size_t not an ssize_t
1120 */
1121 tot_len = 0;
1122 vector = iov;
1123 ret = -EINVAL;
1124 for (seg = 0 ; seg < nr_segs; seg++) {
1125 compat_ssize_t tmp = tot_len;
1126 compat_ssize_t len;
1127 compat_uptr_t buf;
1128
1129 if (__get_user(len, &uvector->iov_len) ||
1130 __get_user(buf, &uvector->iov_base)) {
1131 ret = -EFAULT;
1132 goto out;
1133 }
1134 if (len < 0) /* size_t not fitting an compat_ssize_t .. */
1135 goto out;
1136 tot_len += len;
1137 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
1138 goto out;
1139 vector->iov_base = compat_ptr(buf);
1140 vector->iov_len = (compat_size_t) len;
1141 uvector++;
1142 vector++;
1143 }
1144 if (tot_len == 0) { 1168 if (tot_len == 0) {
1145 ret = 0; 1169 ret = 0;
1146 goto out; 1170 goto out;
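
The factored-out compat_rw_copy_check_uvector() keeps the old overflow discipline when summing user-supplied iovec lengths. The kernel form ("tot_len += len; if (tot_len < tmp)") relies on wrapping signed arithmetic, which the kernel guarantees by building with -fno-strict-overflow; a portable userspace sketch of the same guard, checking before the addition instead:

	#include <stdio.h>
	#include <limits.h>

	int main(void)
	{
		long tot_len = 0;
		long lens[] = { LONG_MAX - 100, 200 }; /* second segment overflows */

		for (int seg = 0; seg < 2; seg++) {
			long len = lens[seg];

			if (len < 0 || len > LONG_MAX - tot_len) {
				printf("EINVAL at segment %d\n", seg); /* maths overflow */
				return 1;
			}
			tot_len += len;
		}
		printf("total %ld\n", tot_len);
		return 0;
	}
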
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index c8af2d91174b..cf78d44a8d6a 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -73,15 +73,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
73 return -EINVAL; 73 return -EINVAL;
74 74
75 sd_iattr = sd->s_iattr; 75 sd_iattr = sd->s_iattr;
76
77 error = inode_change_ok(inode, iattr);
78 if (error)
79 return error;
80
81 error = inode_setattr(inode, iattr);
82 if (error)
83 return error;
84
85 if (!sd_iattr) { 76 if (!sd_iattr) {
86 /* setting attributes for the first time, allocate now */ 77 /* setting attributes for the first time, allocate now */
87 sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); 78 sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL);
@@ -94,9 +85,12 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
94 sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; 85 sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
95 sd->s_iattr = sd_iattr; 86 sd->s_iattr = sd_iattr;
96 } 87 }
97
98 /* attributes were changed at least once in the past */ 88 /* attributes were changed at least once in the past */
99 89
90 error = simple_setattr(dentry, iattr);
91 if (error)
92 return error;
93
100 if (ia_valid & ATTR_UID) 94 if (ia_valid & ATTR_UID)
101 sd_iattr->ia_uid = iattr->ia_uid; 95 sd_iattr->ia_uid = iattr->ia_uid;
102 if (ia_valid & ATTR_GID) 96 if (ia_valid & ATTR_GID)
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 4d74fc72c195..0210898458b2 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -277,8 +277,10 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"
277DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); 277DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
278DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); 278DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
279 279
280DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n");
281
280/* 282/*
281 * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value 283 * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
282 * 284 *
283 * These functions are exactly the same as the above functions (but use a hex 285 * These functions are exactly the same as the above functions (but use a hex
284 * output for the decimal challenged). For details look at the above unsigned 286 * output for the decimal challenged). For details look at the above unsigned
@@ -357,6 +359,23 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode,
357} 359}
358EXPORT_SYMBOL_GPL(debugfs_create_x32); 360EXPORT_SYMBOL_GPL(debugfs_create_x32);
359 361
362/**
363 * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value
364 * @name: a pointer to a string containing the name of the file to create.
365 * @mode: the permission that the file should have
366 * @parent: a pointer to the parent dentry for this file. This should be a
367 * directory dentry if set. If this parameter is %NULL, then the
368 * file will be created in the root of the debugfs filesystem.
369 * @value: a pointer to the variable that the file should read to and write
370 * from.
371 */
372struct dentry *debugfs_create_x64(const char *name, mode_t mode,
373 struct dentry *parent, u64 *value)
374{
375 return debugfs_create_file(name, mode, parent, value, &fops_x64);
376}
377EXPORT_SYMBOL_GPL(debugfs_create_x64);
378
360 379
361static int debugfs_size_t_set(void *data, u64 val) 380static int debugfs_size_t_set(void *data, u64 val)
362{ 381{
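
A usage sketch for the new debugfs_create_x64() helper, as a minimal module; the directory name, file name, and the my_reg variable are illustrative, not part of any real driver:

	#include <linux/module.h>
	#include <linux/debugfs.h>

	static u64 my_reg;            /* exposed as 0x%016llx */
	static struct dentry *dir;

	static int __init xdbg_init(void)
	{
		dir = debugfs_create_dir("mydrv", NULL);
		debugfs_create_x64("reg", 0644, dir, &my_reg);
		return 0;
	}

	static void __exit xdbg_exit(void)
	{
		debugfs_remove_recursive(dir);
	}

	module_init(xdbg_init);
	module_exit(xdbg_exit);
	MODULE_LICENSE("GPL");
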
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e82adc2debb7..7600aacf531d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -82,6 +82,8 @@ struct dio {
82 int reap_counter; /* rate limit reaping */ 82 int reap_counter; /* rate limit reaping */
83 get_block_t *get_block; /* block mapping function */ 83 get_block_t *get_block; /* block mapping function */
84 dio_iodone_t *end_io; /* IO completion function */ 84 dio_iodone_t *end_io; /* IO completion function */
85 dio_submit_t *submit_io; /* IO submission function */
86 loff_t logical_offset_in_bio; /* current first logical block in bio */
85 sector_t final_block_in_bio; /* current final block in bio + 1 */ 87 sector_t final_block_in_bio; /* current final block in bio + 1 */
86 sector_t next_block_for_io; /* next block to be put under IO, 88 sector_t next_block_for_io; /* next block to be put under IO,
87 in dio_blocks units */ 89 in dio_blocks units */
@@ -96,6 +98,7 @@ struct dio {
96 unsigned cur_page_offset; /* Offset into it, in bytes */ 98 unsigned cur_page_offset; /* Offset into it, in bytes */
97 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ 99 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */
98 sector_t cur_page_block; /* Where it starts */ 100 sector_t cur_page_block; /* Where it starts */
101 loff_t cur_page_fs_offset; /* Offset in file */
99 102
100 /* BIO completion state */ 103 /* BIO completion state */
101 spinlock_t bio_lock; /* protects BIO fields below */ 104 spinlock_t bio_lock; /* protects BIO fields below */
@@ -300,6 +303,26 @@ static void dio_bio_end_io(struct bio *bio, int error)
300 spin_unlock_irqrestore(&dio->bio_lock, flags); 303 spin_unlock_irqrestore(&dio->bio_lock, flags);
301} 304}
302 305
306/**
307 * dio_end_io - handle the end io action for the given bio
308 * @bio: The direct io bio that's being completed
309 * @error: Error if there was one
310 *
311 * This is meant to be called by any filesystem that uses its own dio_submit_t
312 * so that the DIO specific endio actions are dealt with after the filesystem
313 * has done its completion work.
314 */
315void dio_end_io(struct bio *bio, int error)
316{
317 struct dio *dio = bio->bi_private;
318
319 if (dio->is_async)
320 dio_bio_end_aio(bio, error);
321 else
322 dio_bio_end_io(bio, error);
323}
324EXPORT_SYMBOL_GPL(dio_end_io);
325
303static int 326static int
304dio_bio_alloc(struct dio *dio, struct block_device *bdev, 327dio_bio_alloc(struct dio *dio, struct block_device *bdev,
305 sector_t first_sector, int nr_vecs) 328 sector_t first_sector, int nr_vecs)
@@ -316,6 +339,7 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
316 bio->bi_end_io = dio_bio_end_io; 339 bio->bi_end_io = dio_bio_end_io;
317 340
318 dio->bio = bio; 341 dio->bio = bio;
342 dio->logical_offset_in_bio = dio->cur_page_fs_offset;
319 return 0; 343 return 0;
320} 344}
321 345
@@ -340,10 +364,15 @@ static void dio_bio_submit(struct dio *dio)
340 if (dio->is_async && dio->rw == READ) 364 if (dio->is_async && dio->rw == READ)
341 bio_set_pages_dirty(bio); 365 bio_set_pages_dirty(bio);
342 366
343 submit_bio(dio->rw, bio); 367 if (dio->submit_io)
368 dio->submit_io(dio->rw, bio, dio->inode,
369 dio->logical_offset_in_bio);
370 else
371 submit_bio(dio->rw, bio);
344 372
345 dio->bio = NULL; 373 dio->bio = NULL;
346 dio->boundary = 0; 374 dio->boundary = 0;
375 dio->logical_offset_in_bio = 0;
347} 376}
348 377
349/* 378/*
@@ -603,10 +632,26 @@ static int dio_send_cur_page(struct dio *dio)
603 int ret = 0; 632 int ret = 0;
604 633
605 if (dio->bio) { 634 if (dio->bio) {
635 loff_t cur_offset = dio->block_in_file << dio->blkbits;
636 loff_t bio_next_offset = dio->logical_offset_in_bio +
637 dio->bio->bi_size;
638
606 /* 639 /*
607 * See whether this new request is contiguous with the old 640 * See whether this new request is contiguous with the old.
641 *
642 * Btrfs cannot handle having logically non-contiguous requests
643 * submitted. For example if you have
644 *
645 * Logical: [0-4095][HOLE][8192-12287]
646 * Physical: [0-4095] [4096-8191]
647 *
648 * We cannot submit those pages together as one BIO. So if our
649 * current logical offset in the file does not equal what would
650 * be the next logical offset in the bio, submit the bio we
651 * have.
608 */ 652 */
609 if (dio->final_block_in_bio != dio->cur_page_block) 653 if (dio->final_block_in_bio != dio->cur_page_block ||
654 cur_offset != bio_next_offset)
610 dio_bio_submit(dio); 655 dio_bio_submit(dio);
611 /* 656 /*
612 * Submit now if the underlying fs is about to perform a 657 * Submit now if the underlying fs is about to perform a
@@ -701,6 +746,7 @@ submit_page_section(struct dio *dio, struct page *page,
701 dio->cur_page_offset = offset; 746 dio->cur_page_offset = offset;
702 dio->cur_page_len = len; 747 dio->cur_page_len = len;
703 dio->cur_page_block = blocknr; 748 dio->cur_page_block = blocknr;
749 dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
704out: 750out:
705 return ret; 751 return ret;
706} 752}
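
The numbers behind the new contiguity test in dio_send_cur_page() are worth working once. With 4K blocks (blkbits = 12), a bio whose first logical offset is 0 and which already holds one 4096-byte page can only absorb a page at file offset 4096; the hole example in the comment puts the next page at 8192, so the bio must be submitted first. A runnable sketch of exactly that check (the variables mirror the dio fields but are plain locals here):

	#include <stdio.h>

	int main(void)
	{
		unsigned blkbits = 12;                /* 4096-byte blocks */
		long long logical_offset_in_bio = 0;  /* bio starts at file offset 0 */
		unsigned bi_size = 4096;              /* one page already queued */
		long long block_in_file = 2;          /* next page: block 2 = offset 8192 */

		long long cur_offset = block_in_file << blkbits;
		long long bio_next_offset = logical_offset_in_bio + bi_size;

		if (cur_offset != bio_next_offset)    /* 8192 != 4096: hole */
			printf("submit bio before adding page at %lld\n", cur_offset);
		return 0;
	}
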
@@ -935,7 +981,7 @@ static ssize_t
935direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 981direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
936 const struct iovec *iov, loff_t offset, unsigned long nr_segs, 982 const struct iovec *iov, loff_t offset, unsigned long nr_segs,
937 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, 983 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
938 struct dio *dio) 984 dio_submit_t submit_io, struct dio *dio)
939{ 985{
940 unsigned long user_addr; 986 unsigned long user_addr;
941 unsigned long flags; 987 unsigned long flags;
@@ -952,6 +998,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
952 998
953 dio->get_block = get_block; 999 dio->get_block = get_block;
954 dio->end_io = end_io; 1000 dio->end_io = end_io;
1001 dio->submit_io = submit_io;
955 dio->final_block_in_bio = -1; 1002 dio->final_block_in_bio = -1;
956 dio->next_block_for_io = -1; 1003 dio->next_block_for_io = -1;
957 1004
@@ -1008,7 +1055,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1008 } 1055 }
1009 } /* end iovec loop */ 1056 } /* end iovec loop */
1010 1057
1011 if (ret == -ENOTBLK && (rw & WRITE)) { 1058 if (ret == -ENOTBLK) {
1012 /* 1059 /*
1013 * The remaining part of the request will be 1060 * The remaining part of the request will be
1014 * be handled by buffered I/O when we return 1061 * be handled by buffered I/O when we return
@@ -1087,30 +1134,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1087 return ret; 1134 return ret;
1088} 1135}
1089 1136
1090/*
1091 * This is a library function for use by filesystem drivers.
1092 *
1093 * The locking rules are governed by the flags parameter:
1094 * - if the flags value contains DIO_LOCKING we use a fancy locking
1095 * scheme for dumb filesystems.
1096 * For writes this function is called under i_mutex and returns with
1097 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1098 * taken and dropped again before returning.
1099 * For reads and writes i_alloc_sem is taken in shared mode and released
1100 * on I/O completion (which may happen asynchronously after returning to
1101 * the caller).
1102 *
1103 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1104 * internal locking but rather rely on the filesystem to synchronize
1105 * direct I/O reads/writes versus each other and truncate.
1106 * For reads and writes both i_mutex and i_alloc_sem are not held on
1107 * entry and are never taken.
1108 */
1109ssize_t 1137ssize_t
1110__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1138__blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
1111 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1139 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1112 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1140 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1113 int flags) 1141 dio_submit_t submit_io, int flags)
1114{ 1142{
1115 int seg; 1143 int seg;
1116 size_t size; 1144 size_t size;
@@ -1197,11 +1225,49 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1197 (end > i_size_read(inode))); 1225 (end > i_size_read(inode)));
1198 1226
1199 retval = direct_io_worker(rw, iocb, inode, iov, offset, 1227 retval = direct_io_worker(rw, iocb, inode, iov, offset,
1200 nr_segs, blkbits, get_block, end_io, dio); 1228 nr_segs, blkbits, get_block, end_io,
1229 submit_io, dio);
1230
1231out:
1232 return retval;
1233}
1234EXPORT_SYMBOL(__blockdev_direct_IO_newtrunc);
1235
1236/*
1237 * This is a library function for use by filesystem drivers.
1238 *
1239 * The locking rules are governed by the flags parameter:
1240 * - if the flags value contains DIO_LOCKING we use a fancy locking
1241 * scheme for dumb filesystems.
1242 * For writes this function is called under i_mutex and returns with
1243 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1244 * taken and dropped again before returning.
1245 * For reads and writes i_alloc_sem is taken in shared mode and released
1246 * on I/O completion (which may happen asynchronously after returning to
1247 * the caller).
1248 *
1249 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1250 * internal locking but rather rely on the filesystem to synchronize
1251 * direct I/O reads/writes versus each other and truncate.
1252 * For reads and writes both i_mutex and i_alloc_sem are not held on
1253 * entry and are never taken.
1254 */
1255ssize_t
1256__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1257 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1258 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1259 dio_submit_t submit_io, int flags)
1260{
1261 ssize_t retval;
1201 1262
1263 retval = __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov,
1264 offset, nr_segs, get_block, end_io, submit_io, flags);
1202 /* 1265 /*
1203 * In case of error extending write may have instantiated a few 1266 * In case of error extending write may have instantiated a few
1204 * blocks outside i_size. Trim these off again for DIO_LOCKING. 1267 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1268 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in
1269 * their own manner. This is a further example of where the old
1270 * truncate sequence is inadequate.
1205 * 1271 *
1206 * NOTE: filesystems with their own locking have to handle this 1272 * NOTE: filesystems with their own locking have to handle this
1207 * on their own. 1273 * on their own.
@@ -1209,12 +1275,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1209 if (flags & DIO_LOCKING) { 1275 if (flags & DIO_LOCKING) {
1210 if (unlikely((rw & WRITE) && retval < 0)) { 1276 if (unlikely((rw & WRITE) && retval < 0)) {
1211 loff_t isize = i_size_read(inode); 1277 loff_t isize = i_size_read(inode);
1278 loff_t end = offset + iov_length(iov, nr_segs);
1279
1212 if (end > isize) 1280 if (end > isize)
1213 vmtruncate(inode, isize); 1281 vmtruncate(inode, isize);
1214 } 1282 }
1215 } 1283 }
1216 1284
1217out:
1218 return retval; 1285 return retval;
1219} 1286}
1220EXPORT_SYMBOL(__blockdev_direct_IO); 1287EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 3bdddbcc785f..e8fcf4e2ed7d 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -274,7 +274,7 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
274} 274}
275 275
276static int 276static int
277ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) 277ecryptfs_fsync(struct file *file, int datasync)
278{ 278{
279 return vfs_fsync(ecryptfs_file_to_lower(file), datasync); 279 return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
280} 280}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 65dee2f336ae..31ef5252f0fe 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -805,7 +805,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
805 - (ia->ia_size & ~PAGE_CACHE_MASK)); 805 - (ia->ia_size & ~PAGE_CACHE_MASK));
806 806
807 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 807 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
808 rc = vmtruncate(inode, ia->ia_size); 808 rc = simple_setsize(inode, ia->ia_size);
809 if (rc) 809 if (rc)
810 goto out; 810 goto out;
811 lower_ia->ia_size = ia->ia_size; 811 lower_ia->ia_size = ia->ia_size;
@@ -830,7 +830,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
830 goto out; 830 goto out;
831 } 831 }
832 } 832 }
833 vmtruncate(inode, ia->ia_size); 833 simple_setsize(inode, ia->ia_size);
834 rc = ecryptfs_write_inode_size_to_metadata(inode); 834 rc = ecryptfs_write_inode_size_to_metadata(inode);
835 if (rc) { 835 if (rc) {
836 printk(KERN_ERR "Problem with " 836 printk(KERN_ERR "Problem with "
diff --git a/fs/exec.c b/fs/exec.c
index 9badbc0bfb1d..e19de6a80339 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -768,7 +768,6 @@ static int de_thread(struct task_struct *tsk)
768 struct signal_struct *sig = tsk->signal; 768 struct signal_struct *sig = tsk->signal;
769 struct sighand_struct *oldsighand = tsk->sighand; 769 struct sighand_struct *oldsighand = tsk->sighand;
770 spinlock_t *lock = &oldsighand->siglock; 770 spinlock_t *lock = &oldsighand->siglock;
771 int count;
772 771
773 if (thread_group_empty(tsk)) 772 if (thread_group_empty(tsk))
774 goto no_thread_group; 773 goto no_thread_group;
@@ -785,13 +784,13 @@ static int de_thread(struct task_struct *tsk)
785 spin_unlock_irq(lock); 784 spin_unlock_irq(lock);
786 return -EAGAIN; 785 return -EAGAIN;
787 } 786 }
787
788 sig->group_exit_task = tsk; 788 sig->group_exit_task = tsk;
789 zap_other_threads(tsk); 789 sig->notify_count = zap_other_threads(tsk);
790 if (!thread_group_leader(tsk))
791 sig->notify_count--;
790 792
791 /* Account for the thread group leader hanging around: */ 793 while (sig->notify_count) {
792 count = thread_group_leader(tsk) ? 1 : 2;
793 sig->notify_count = count;
794 while (atomic_read(&sig->count) > count) {
795 __set_current_state(TASK_UNINTERRUPTIBLE); 794 __set_current_state(TASK_UNINTERRUPTIBLE);
796 spin_unlock_irq(lock); 795 spin_unlock_irq(lock);
797 schedule(); 796 schedule();
@@ -1662,12 +1661,15 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
1662 struct task_struct *tsk = current; 1661 struct task_struct *tsk = current;
1663 struct mm_struct *mm = tsk->mm; 1662 struct mm_struct *mm = tsk->mm;
1664 struct completion *vfork_done; 1663 struct completion *vfork_done;
1665 int core_waiters; 1664 int core_waiters = -EBUSY;
1666 1665
1667 init_completion(&core_state->startup); 1666 init_completion(&core_state->startup);
1668 core_state->dumper.task = tsk; 1667 core_state->dumper.task = tsk;
1669 core_state->dumper.next = NULL; 1668 core_state->dumper.next = NULL;
1670 core_waiters = zap_threads(tsk, mm, core_state, exit_code); 1669
1670 down_write(&mm->mmap_sem);
1671 if (!mm->core_state)
1672 core_waiters = zap_threads(tsk, mm, core_state, exit_code);
1671 up_write(&mm->mmap_sem); 1673 up_write(&mm->mmap_sem);
1672 1674
1673 if (unlikely(core_waiters < 0)) 1675 if (unlikely(core_waiters < 0))
@@ -1787,21 +1789,61 @@ static void wait_for_dump_helpers(struct file *file)
1787} 1789}
1788 1790
1789 1791
1792/*
1793 * umh_pipe_setup
1794 * helper function to customize the process used
1795 * to collect the core in userspace. Specifically
1796 * it sets up a pipe and installs it as fd 0 (stdin)
1797 * for the process. Returns 0 on success, or a
1798 * negative error code (via PTR_ERR) on failure.
1799 * Note that it also sets the core limit to 1. This
1800 * is a special value that we use to trap recursive
1801 * core dumps
1802 */
1803static int umh_pipe_setup(struct subprocess_info *info)
1804{
1805 struct file *rp, *wp;
1806 struct fdtable *fdt;
1807 struct coredump_params *cp = (struct coredump_params *)info->data;
1808 struct files_struct *cf = current->files;
1809
1810 wp = create_write_pipe(0);
1811 if (IS_ERR(wp))
1812 return PTR_ERR(wp);
1813
1814 rp = create_read_pipe(wp, 0);
1815 if (IS_ERR(rp)) {
1816 free_write_pipe(wp);
1817 return PTR_ERR(rp);
1818 }
1819
1820 cp->file = wp;
1821
1822 sys_close(0);
1823 fd_install(0, rp);
1824 spin_lock(&cf->file_lock);
1825 fdt = files_fdtable(cf);
1826 FD_SET(0, fdt->open_fds);
1827 FD_CLR(0, fdt->close_on_exec);
1828 spin_unlock(&cf->file_lock);
1829
1830 /* and disallow core files too */
1831 current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
1832
1833 return 0;
1834}
1835
1790void do_coredump(long signr, int exit_code, struct pt_regs *regs) 1836void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1791{ 1837{
1792 struct core_state core_state; 1838 struct core_state core_state;
1793 char corename[CORENAME_MAX_SIZE + 1]; 1839 char corename[CORENAME_MAX_SIZE + 1];
1794 struct mm_struct *mm = current->mm; 1840 struct mm_struct *mm = current->mm;
1795 struct linux_binfmt * binfmt; 1841 struct linux_binfmt * binfmt;
1796 struct inode * inode;
1797 const struct cred *old_cred; 1842 const struct cred *old_cred;
1798 struct cred *cred; 1843 struct cred *cred;
1799 int retval = 0; 1844 int retval = 0;
1800 int flag = 0; 1845 int flag = 0;
1801 int ispipe = 0; 1846 int ispipe;
1802 char **helper_argv = NULL;
1803 int helper_argc = 0;
1804 int dump_count = 0;
1805 static atomic_t core_dump_count = ATOMIC_INIT(0); 1847 static atomic_t core_dump_count = ATOMIC_INIT(0);
1806 struct coredump_params cprm = { 1848 struct coredump_params cprm = {
1807 .signr = signr, 1849 .signr = signr,
@@ -1820,23 +1862,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1820 binfmt = mm->binfmt; 1862 binfmt = mm->binfmt;
1821 if (!binfmt || !binfmt->core_dump) 1863 if (!binfmt || !binfmt->core_dump)
1822 goto fail; 1864 goto fail;
1823 1865 if (!__get_dumpable(cprm.mm_flags))
1824 cred = prepare_creds();
1825 if (!cred) {
1826 retval = -ENOMEM;
1827 goto fail; 1866 goto fail;
1828 }
1829 1867
1830 down_write(&mm->mmap_sem); 1868 cred = prepare_creds();
1831 /* 1869 if (!cred)
1832 * If another thread got here first, or we are not dumpable, bail out.
1833 */
1834 if (mm->core_state || !__get_dumpable(cprm.mm_flags)) {
1835 up_write(&mm->mmap_sem);
1836 put_cred(cred);
1837 goto fail; 1870 goto fail;
1838 }
1839
1840 /* 1871 /*
1841 * We cannot trust fsuid as being the "true" uid of the 1872 * We cannot trust fsuid as being the "true" uid of the
1842 * process nor do we know its entire history. We only know it 1873 * process nor do we know its entire history. We only know it
@@ -1849,10 +1880,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1849 } 1880 }
1850 1881
1851 retval = coredump_wait(exit_code, &core_state); 1882 retval = coredump_wait(exit_code, &core_state);
1852 if (retval < 0) { 1883 if (retval < 0)
1853 put_cred(cred); 1884 goto fail_creds;
1854 goto fail;
1855 }
1856 1885
1857 old_cred = override_creds(cred); 1886 old_cred = override_creds(cred);
1858 1887
@@ -1870,19 +1899,19 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1870 ispipe = format_corename(corename, signr); 1899 ispipe = format_corename(corename, signr);
1871 unlock_kernel(); 1900 unlock_kernel();
1872 1901
1873 if ((!ispipe) && (cprm.limit < binfmt->min_coredump))
1874 goto fail_unlock;
1875
1876 if (ispipe) { 1902 if (ispipe) {
1877 if (cprm.limit == 0) { 1903 int dump_count;
1904 char **helper_argv;
1905
1906 if (cprm.limit == 1) {
1878 /* 1907 /*
1879 * Normally core limits are irrelevant to pipes, since 1908 * Normally core limits are irrelevant to pipes, since
1880 * we're not writing to the file system, but we use 1909 * we're not writing to the file system, but we use
1881 * cprm.limit of 0 here as a special value. Any 1910 * cprm.limit of 1 here as a special value. Any
1882 * non-zero limit gets set to RLIM_INFINITY below, but 1911 * non-1 limit gets set to RLIM_INFINITY below, but
1883 * a limit of 0 skips the dump. This is a consistent 1912 * a limit of 0 skips the dump. This is a consistent
1884 * way to catch recursive crashes. We can still crash 1913 * way to catch recursive crashes. We can still crash
1885 * if the core_pattern binary sets RLIM_CORE = !0 1914 * if the core_pattern binary sets RLIM_CORE = !1
1886 * but it runs as root, and can do lots of stupid things 1915 * but it runs as root, and can do lots of stupid things
1887 * Note that we use task_tgid_vnr here to grab the pid 1916 * Note that we use task_tgid_vnr here to grab the pid
1888 * of the process group leader. That way we get the 1917 * of the process group leader. That way we get the
@@ -1890,11 +1919,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1890 * core_pattern process dies. 1919 * core_pattern process dies.
1891 */ 1920 */
1892 printk(KERN_WARNING 1921 printk(KERN_WARNING
1893 "Process %d(%s) has RLIMIT_CORE set to 0\n", 1922 "Process %d(%s) has RLIMIT_CORE set to 1\n",
1894 task_tgid_vnr(current), current->comm); 1923 task_tgid_vnr(current), current->comm);
1895 printk(KERN_WARNING "Aborting core\n"); 1924 printk(KERN_WARNING "Aborting core\n");
1896 goto fail_unlock; 1925 goto fail_unlock;
1897 } 1926 }
1927 cprm.limit = RLIM_INFINITY;
1898 1928
1899 dump_count = atomic_inc_return(&core_dump_count); 1929 dump_count = atomic_inc_return(&core_dump_count);
1900 if (core_pipe_limit && (core_pipe_limit < dump_count)) { 1930 if (core_pipe_limit && (core_pipe_limit < dump_count)) {
@@ -1904,71 +1934,74 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1904 goto fail_dropcount; 1934 goto fail_dropcount;
1905 } 1935 }
1906 1936
1907 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); 1937 helper_argv = argv_split(GFP_KERNEL, corename+1, NULL);
1908 if (!helper_argv) { 1938 if (!helper_argv) {
1909 printk(KERN_WARNING "%s failed to allocate memory\n", 1939 printk(KERN_WARNING "%s failed to allocate memory\n",
1910 __func__); 1940 __func__);
1911 goto fail_dropcount; 1941 goto fail_dropcount;
1912 } 1942 }
1913 1943
1914 cprm.limit = RLIM_INFINITY; 1944 retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
1915 1945 NULL, UMH_WAIT_EXEC, umh_pipe_setup,
1916 /* SIGPIPE can happen, but it's just never processed */ 1946 NULL, &cprm);
1917 if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL, 1947 argv_free(helper_argv);
1918 &cprm.file)) { 1948 if (retval) {
1919 printk(KERN_INFO "Core dump to %s pipe failed\n", 1949 printk(KERN_INFO "Core dump to %s pipe failed\n",
1920 corename); 1950 corename);
1921 goto fail_dropcount; 1951 goto close_fail;
1922 } 1952 }
1923 } else 1953 } else {
1954 struct inode *inode;
1955
1956 if (cprm.limit < binfmt->min_coredump)
1957 goto fail_unlock;
1958
1924 cprm.file = filp_open(corename, 1959 cprm.file = filp_open(corename,
1925 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 1960 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1926 0600); 1961 0600);
1927 if (IS_ERR(cprm.file)) 1962 if (IS_ERR(cprm.file))
1928 goto fail_dropcount; 1963 goto fail_unlock;
1929 inode = cprm.file->f_path.dentry->d_inode;
1930 if (inode->i_nlink > 1)
1931 goto close_fail; /* multiple links - don't dump */
1932 if (!ispipe && d_unhashed(cprm.file->f_path.dentry))
1933 goto close_fail;
1934
1935 /* AK: actually i see no reason to not allow this for named pipes etc.,
1936 but keep the previous behaviour for now. */
1937 if (!ispipe && !S_ISREG(inode->i_mode))
1938 goto close_fail;
1939 /*
1940 * Dont allow local users get cute and trick others to coredump
1941 * into their pre-created files:
1942 * Note, this is not relevant for pipes
1943 */
1944 if (!ispipe && (inode->i_uid != current_fsuid()))
1945 goto close_fail;
1946 if (!cprm.file->f_op)
1947 goto close_fail;
1948 if (!cprm.file->f_op->write)
1949 goto close_fail;
1950 if (!ispipe &&
1951 do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0)
1952 goto close_fail;
1953 1964
1954 retval = binfmt->core_dump(&cprm); 1965 inode = cprm.file->f_path.dentry->d_inode;
1966 if (inode->i_nlink > 1)
1967 goto close_fail;
1968 if (d_unhashed(cprm.file->f_path.dentry))
1969 goto close_fail;
1970 /*
1971 * AK: actually I see no reason not to allow this for named
1972 * pipes etc, but keep the previous behaviour for now.
1973 */
1974 if (!S_ISREG(inode->i_mode))
1975 goto close_fail;
1976 /*
1977 * Don't allow local users to get cute and trick others into
1978 * coredumping into their pre-created files.
1979 */
1980 if (inode->i_uid != current_fsuid())
1981 goto close_fail;
1982 if (!cprm.file->f_op || !cprm.file->f_op->write)
1983 goto close_fail;
1984 if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
1985 goto close_fail;
1986 }
1955 1987
1988 retval = binfmt->core_dump(&cprm);
1956 if (retval) 1989 if (retval)
1957 current->signal->group_exit_code |= 0x80; 1990 current->signal->group_exit_code |= 0x80;
1958close_fail: 1991
1959 if (ispipe && core_pipe_limit) 1992 if (ispipe && core_pipe_limit)
1960 wait_for_dump_helpers(cprm.file); 1993 wait_for_dump_helpers(cprm.file);
1961 filp_close(cprm.file, NULL); 1994close_fail:
1995 if (cprm.file)
1996 filp_close(cprm.file, NULL);
1962fail_dropcount: 1997fail_dropcount:
1963 if (dump_count) 1998 if (ispipe)
1964 atomic_dec(&core_dump_count); 1999 atomic_dec(&core_dump_count);
1965fail_unlock: 2000fail_unlock:
1966 if (helper_argv) 2001 coredump_finish(mm);
1967 argv_free(helper_argv);
1968
1969 revert_creds(old_cred); 2002 revert_creds(old_cred);
2003fail_creds:
1970 put_cred(cred); 2004 put_cred(cred);
1971 coredump_finish(mm);
1972fail: 2005fail:
1973 return; 2006 return;
1974} 2007}
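
Since umh_pipe_setup() hands the helper the core dump on fd 0, a core_pattern pipe consumer is just a stdin reader. A sketch of such a helper, assuming it was registered as something like "|/usr/local/bin/core-catcher %p"; the path, argv layout, and output location are all illustrative:

	#include <stdio.h>

	int main(int argc, char **argv)
	{
		char path[256], buf[8192];
		size_t n;

		/* argv[1] is the crashing pid when registered with %p */
		snprintf(path, sizeof(path), "/tmp/core.%s",
			 argc > 1 ? argv[1] : "unknown");
		FILE *out = fopen(path, "w");
		if (!out)
			return 1;
		while ((n = fread(buf, 1, sizeof(buf), stdin)) > 0) /* fd 0 = dump */
			fwrite(buf, 1, n, out);
		fclose(out);
		return 0;
	}
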
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 839b9dc1e70f..fef6899be397 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -40,12 +40,11 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
40 return 0; 40 return 0;
41} 41}
42 42
43static int exofs_file_fsync(struct file *filp, struct dentry *dentry, 43static int exofs_file_fsync(struct file *filp, int datasync)
44 int datasync)
45{ 44{
46 int ret; 45 int ret;
47 struct address_space *mapping = filp->f_mapping; 46 struct address_space *mapping = filp->f_mapping;
48 struct inode *inode = dentry->d_inode; 47 struct inode *inode = mapping->host;
49 struct super_block *sb; 48 struct super_block *sb;
50 49
51 ret = filemap_write_and_wait(mapping); 50 ret = filemap_write_and_wait(mapping);
@@ -66,7 +65,7 @@ static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
66 65
67static int exofs_flush(struct file *file, fl_owner_t id) 66static int exofs_flush(struct file *file, fl_owner_t id)
68{ 67{
69 exofs_file_fsync(file, file->f_path.dentry, 1); 68 exofs_file_fsync(file, 1);
70 /* TODO: Flush the OSD target */ 69 /* TODO: Flush the OSD target */
71 return 0; 70 return 0;
72} 71}
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 0b038e47ad2f..52b34f1d2738 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -122,7 +122,6 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *);
122extern void ext2_delete_inode (struct inode *); 122extern void ext2_delete_inode (struct inode *);
123extern int ext2_sync_inode (struct inode *); 123extern int ext2_sync_inode (struct inode *);
124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); 124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
125extern void ext2_truncate (struct inode *);
126extern int ext2_setattr (struct dentry *, struct iattr *); 125extern int ext2_setattr (struct dentry *, struct iattr *);
127extern void ext2_set_inode_flags(struct inode *inode); 126extern void ext2_set_inode_flags(struct inode *inode);
128extern void ext2_get_inode_flags(struct ext2_inode_info *); 127extern void ext2_get_inode_flags(struct ext2_inode_info *);
@@ -155,7 +154,7 @@ extern void ext2_write_super (struct super_block *);
155extern const struct file_operations ext2_dir_operations; 154extern const struct file_operations ext2_dir_operations;
156 155
157/* file.c */ 156/* file.c */
158extern int ext2_fsync(struct file *file, struct dentry *dentry, int datasync); 157extern int ext2_fsync(struct file *file, int datasync);
159extern const struct inode_operations ext2_file_inode_operations; 158extern const struct inode_operations ext2_file_inode_operations;
160extern const struct file_operations ext2_file_operations; 159extern const struct file_operations ext2_file_operations;
161extern const struct file_operations ext2_xip_file_operations; 160extern const struct file_operations ext2_xip_file_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5d198d0697fb..49eec9456c5b 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -40,13 +40,13 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
40 return 0; 40 return 0;
41} 41}
42 42
43int ext2_fsync(struct file *file, struct dentry *dentry, int datasync) 43int ext2_fsync(struct file *file, int datasync)
44{ 44{
45 int ret; 45 int ret;
46 struct super_block *sb = dentry->d_inode->i_sb; 46 struct super_block *sb = file->f_mapping->host->i_sb;
47 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; 47 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
48 48
49 ret = simple_fsync(file, dentry, datasync); 49 ret = generic_file_fsync(file, datasync);
50 if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) { 50 if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) {
51 /* We don't really know where the IO error happened... */ 51 /* We don't really know where the IO error happened... */
52 ext2_error(sb, __func__, 52 ext2_error(sb, __func__,
@@ -95,7 +95,6 @@ const struct file_operations ext2_xip_file_operations = {
95#endif 95#endif
96 96
97const struct inode_operations ext2_file_inode_operations = { 97const struct inode_operations ext2_file_inode_operations = {
98 .truncate = ext2_truncate,
99#ifdef CONFIG_EXT2_FS_XATTR 98#ifdef CONFIG_EXT2_FS_XATTR
100 .setxattr = generic_setxattr, 99 .setxattr = generic_setxattr,
101 .getxattr = generic_getxattr, 100 .getxattr = generic_getxattr,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 527c46d9bc1f..3675088cb88c 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -54,6 +54,18 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode)
54 inode->i_blocks - ea_blocks == 0); 54 inode->i_blocks - ea_blocks == 0);
55} 55}
56 56
57static void ext2_truncate_blocks(struct inode *inode, loff_t offset);
58
59static void ext2_write_failed(struct address_space *mapping, loff_t to)
60{
61 struct inode *inode = mapping->host;
62
63 if (to > inode->i_size) {
64 truncate_pagecache(inode, to, inode->i_size);
65 ext2_truncate_blocks(inode, inode->i_size);
66 }
67}
68
57/* 69/*
58 * Called at the last iput() if i_nlink is zero. 70 * Called at the last iput() if i_nlink is zero.
59 */ 71 */
@@ -71,7 +83,7 @@ void ext2_delete_inode (struct inode * inode)
71 83
72 inode->i_size = 0; 84 inode->i_size = 0;
73 if (inode->i_blocks) 85 if (inode->i_blocks)
74 ext2_truncate (inode); 86 ext2_truncate_blocks(inode, 0);
75 ext2_free_inode (inode); 87 ext2_free_inode (inode);
76 88
77 return; 89 return;
@@ -757,8 +769,8 @@ int __ext2_write_begin(struct file *file, struct address_space *mapping,
757 loff_t pos, unsigned len, unsigned flags, 769 loff_t pos, unsigned len, unsigned flags,
758 struct page **pagep, void **fsdata) 770 struct page **pagep, void **fsdata)
759{ 771{
760 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 772 return block_write_begin_newtrunc(file, mapping, pos, len, flags,
761 ext2_get_block); 773 pagep, fsdata, ext2_get_block);
762} 774}
763 775
764static int 776static int
@@ -766,8 +778,25 @@ ext2_write_begin(struct file *file, struct address_space *mapping,
766 loff_t pos, unsigned len, unsigned flags, 778 loff_t pos, unsigned len, unsigned flags,
767 struct page **pagep, void **fsdata) 779 struct page **pagep, void **fsdata)
768{ 780{
781 int ret;
782
769 *pagep = NULL; 783 *pagep = NULL;
770 return __ext2_write_begin(file, mapping, pos, len, flags, pagep,fsdata); 784 ret = __ext2_write_begin(file, mapping, pos, len, flags, pagep, fsdata);
785 if (ret < 0)
786 ext2_write_failed(mapping, pos + len);
787 return ret;
788}
789
790static int ext2_write_end(struct file *file, struct address_space *mapping,
791 loff_t pos, unsigned len, unsigned copied,
792 struct page *page, void *fsdata)
793{
794 int ret;
795
796 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
797 if (ret < len)
798 ext2_write_failed(mapping, pos + len);
799 return ret;
771} 800}
772 801
773static int 802static int
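
Both the buffered and direct write paths in this file now funnel errors through ext2_write_failed(), which implements the "new truncate" rule: if a failed write may have instantiated pagecache or blocks beyond i_size, roll both back to i_size. A stubbed, compilable sketch of the flow (the two trunc_* helpers stand in for truncate_pagecache() and ext2_truncate_blocks()):

	#include <stdio.h>

	struct inode { long long i_size; };

	static void trunc_pagecache(long long to, long long size)
	{
		printf("pagecache: drop (%lld, %lld]\n", size, to);
	}

	static void trunc_blocks(long long size)
	{
		printf("blocks: free everything past %lld\n", size);
	}

	static void write_failed(struct inode *inode, long long to)
	{
		if (to > inode->i_size) { /* only extending writes leave debris */
			trunc_pagecache(to, inode->i_size);
			trunc_blocks(inode->i_size);
		}
	}

	int main(void)
	{
		struct inode i = { .i_size = 4096 };
		write_failed(&i, 12288); /* failed write covering [4096, 12288) */
		return 0;
	}
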
@@ -775,13 +804,18 @@ ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
775 loff_t pos, unsigned len, unsigned flags, 804 loff_t pos, unsigned len, unsigned flags,
776 struct page **pagep, void **fsdata) 805 struct page **pagep, void **fsdata)
777{ 806{
807 int ret;
808
778 /* 809 /*
779 * Dir-in-pagecache still uses ext2_write_begin. Would have to rework 810 * Dir-in-pagecache still uses ext2_write_begin. Would have to rework
780 * directory handling code to pass around offsets rather than struct 811 * directory handling code to pass around offsets rather than struct
781 * pages in order to make this work easily. 812 * pages in order to make this work easily.
782 */ 813 */
783 return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 814 ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, pagep,
784 ext2_get_block); 815 fsdata, ext2_get_block);
816 if (ret < 0)
817 ext2_write_failed(mapping, pos + len);
818 return ret;
785} 819}
786 820
787static int ext2_nobh_writepage(struct page *page, 821static int ext2_nobh_writepage(struct page *page,
@@ -800,10 +834,15 @@ ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
800 loff_t offset, unsigned long nr_segs) 834 loff_t offset, unsigned long nr_segs)
801{ 835{
802 struct file *file = iocb->ki_filp; 836 struct file *file = iocb->ki_filp;
803 struct inode *inode = file->f_mapping->host; 837 struct address_space *mapping = file->f_mapping;
804 838 struct inode *inode = mapping->host;
805 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 839 ssize_t ret;
806 offset, nr_segs, ext2_get_block, NULL); 840
841 ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
842 iov, offset, nr_segs, ext2_get_block, NULL);
843 if (ret < 0 && (rw & WRITE))
844 ext2_write_failed(mapping, offset + iov_length(iov, nr_segs));
845 return ret;
807} 846}
808 847
809static int 848static int
@@ -818,7 +857,7 @@ const struct address_space_operations ext2_aops = {
818 .writepage = ext2_writepage, 857 .writepage = ext2_writepage,
819 .sync_page = block_sync_page, 858 .sync_page = block_sync_page,
820 .write_begin = ext2_write_begin, 859 .write_begin = ext2_write_begin,
821 .write_end = generic_write_end, 860 .write_end = ext2_write_end,
822 .bmap = ext2_bmap, 861 .bmap = ext2_bmap,
823 .direct_IO = ext2_direct_IO, 862 .direct_IO = ext2_direct_IO,
824 .writepages = ext2_writepages, 863 .writepages = ext2_writepages,
@@ -1027,7 +1066,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de
1027 ext2_free_data(inode, p, q); 1066 ext2_free_data(inode, p, q);
1028} 1067}
1029 1068
1030void ext2_truncate(struct inode *inode) 1069static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
1031{ 1070{
1032 __le32 *i_data = EXT2_I(inode)->i_data; 1071 __le32 *i_data = EXT2_I(inode)->i_data;
1033 struct ext2_inode_info *ei = EXT2_I(inode); 1072 struct ext2_inode_info *ei = EXT2_I(inode);
@@ -1039,27 +1078,8 @@ void ext2_truncate(struct inode *inode)
1039 int n; 1078 int n;
1040 long iblock; 1079 long iblock;
1041 unsigned blocksize; 1080 unsigned blocksize;
1042
1043 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1044 S_ISLNK(inode->i_mode)))
1045 return;
1046 if (ext2_inode_is_fast_symlink(inode))
1047 return;
1048 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1049 return;
1050
1051 blocksize = inode->i_sb->s_blocksize; 1081 blocksize = inode->i_sb->s_blocksize;
1052 iblock = (inode->i_size + blocksize-1) 1082 iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
1053 >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
1054
1055 if (mapping_is_xip(inode->i_mapping))
1056 xip_truncate_page(inode->i_mapping, inode->i_size);
1057 else if (test_opt(inode->i_sb, NOBH))
1058 nobh_truncate_page(inode->i_mapping,
1059 inode->i_size, ext2_get_block);
1060 else
1061 block_truncate_page(inode->i_mapping,
1062 inode->i_size, ext2_get_block);
1063 1083
1064 n = ext2_block_to_path(inode, iblock, offsets, NULL); 1084 n = ext2_block_to_path(inode, iblock, offsets, NULL);
1065 if (n == 0) 1085 if (n == 0)
@@ -1127,6 +1147,62 @@ do_indirects:
1127 ext2_discard_reservation(inode); 1147 ext2_discard_reservation(inode);
1128 1148
1129 mutex_unlock(&ei->truncate_mutex); 1149 mutex_unlock(&ei->truncate_mutex);
1150}
1151
1152static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
1153{
1154 /*
 1155	 * XXX: it seems like a bug that we don't allow an
 1156	 * IS_APPEND inode to have blocks-past-i_size trimmed off.
 1157	 * Review and fix this.
1158 *
1159 * Also would be nice to be able to handle IO errors and such,
1160 * but that's probably too much to ask.
1161 */
1162 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1163 S_ISLNK(inode->i_mode)))
1164 return;
1165 if (ext2_inode_is_fast_symlink(inode))
1166 return;
1167 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1168 return;
1169 __ext2_truncate_blocks(inode, offset);
1170}
1171
1172int ext2_setsize(struct inode *inode, loff_t newsize)
1173{
1174 loff_t oldsize;
1175 int error;
1176
1177 error = inode_newsize_ok(inode, newsize);
1178 if (error)
1179 return error;
1180
1181 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1182 S_ISLNK(inode->i_mode)))
1183 return -EINVAL;
1184 if (ext2_inode_is_fast_symlink(inode))
1185 return -EINVAL;
1186 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1187 return -EPERM;
1188
1189 if (mapping_is_xip(inode->i_mapping))
1190 error = xip_truncate_page(inode->i_mapping, newsize);
1191 else if (test_opt(inode->i_sb, NOBH))
1192 error = nobh_truncate_page(inode->i_mapping,
1193 newsize, ext2_get_block);
1194 else
1195 error = block_truncate_page(inode->i_mapping,
1196 newsize, ext2_get_block);
1197 if (error)
1198 return error;
1199
1200 oldsize = inode->i_size;
1201 i_size_write(inode, newsize);
1202 truncate_pagecache(inode, oldsize, newsize);
1203
1204 __ext2_truncate_blocks(inode, newsize);
1205
1130 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 1206 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1131 if (inode_needs_sync(inode)) { 1207 if (inode_needs_sync(inode)) {
1132 sync_mapping_buffers(inode->i_mapping); 1208 sync_mapping_buffers(inode->i_mapping);
@@ -1134,6 +1210,8 @@ do_indirects:
1134 } else { 1210 } else {
1135 mark_inode_dirty(inode); 1211 mark_inode_dirty(inode);
1136 } 1212 }
1213
1214 return 0;
1137} 1215}
1138 1216
1139static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino, 1217static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino,
@@ -1474,8 +1552,15 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1474 if (error) 1552 if (error)
1475 return error; 1553 return error;
1476 } 1554 }
1477 error = inode_setattr(inode, iattr); 1555 if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) {
1478 if (!error && (iattr->ia_valid & ATTR_MODE)) 1556 error = ext2_setsize(inode, iattr->ia_size);
1557 if (error)
1558 return error;
1559 }
1560 generic_setattr(inode, iattr);
1561 if (iattr->ia_valid & ATTR_MODE)
1479 error = ext2_acl_chmod(inode); 1562 error = ext2_acl_chmod(inode);
1563 mark_inode_dirty(inode);
1564
1480 return error; 1565 return error;
1481} 1566}
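
The ext2_write_failed()/ext2_setsize() hunks above all enforce one invariant: if a write or truncate leaves blocks instantiated beyond i_size, those blocks are trimmed before anyone can observe them, and on truncate the new size is committed before the old blocks are freed. Below is a minimal userspace sketch of that invariant; the toy_* names and the 4 KiB block size are illustrative assumptions, not kernel API.

#include <assert.h>
#include <stdio.h>

/* Toy inode: i_size in bytes, blocks allocated in 4 KiB units. */
struct toy_inode {
	long long i_size;
	long long allocated_blocks;
};

/* Free any blocks instantiated beyond @offset, mirroring
 * __ext2_truncate_blocks() in spirit. */
static void toy_truncate_blocks(struct toy_inode *inode, long long offset)
{
	long long keep = (offset + 4095) / 4096;

	if (inode->allocated_blocks > keep)
		inode->allocated_blocks = keep;
}

/* Model of ext2_write_failed(): a write into pos..pos+len failed after
 * some blocks were instantiated; trim everything past i_size so no
 * stale blocks survive the failed write. */
static void toy_write_failed(struct toy_inode *inode, long long to)
{
	if (to > inode->i_size)
		toy_truncate_blocks(inode, inode->i_size);
}

int main(void)
{
	struct toy_inode ino = { .i_size = 4096, .allocated_blocks = 1 };

	/* A 3-block write at offset 4096 instantiates blocks, then fails. */
	ino.allocated_blocks = 4;
	toy_write_failed(&ino, 4096 + 3 * 4096);
	assert(ino.allocated_blocks == 1);
	printf("blocks after failed write: %lld\n", ino.allocated_blocks);
	return 0;
}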
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 71e9eb1fa696..7ff43f4a59cd 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -119,6 +119,8 @@ static void ext2_put_super (struct super_block * sb)
119 int i; 119 int i;
120 struct ext2_sb_info *sbi = EXT2_SB(sb); 120 struct ext2_sb_info *sbi = EXT2_SB(sb);
121 121
122 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
123
122 if (sb->s_dirt) 124 if (sb->s_dirt)
123 ext2_write_super(sb); 125 ext2_write_super(sb);
124 126
@@ -1063,6 +1065,12 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1063 sb->s_op = &ext2_sops; 1065 sb->s_op = &ext2_sops;
1064 sb->s_export_op = &ext2_export_ops; 1066 sb->s_export_op = &ext2_export_ops;
1065 sb->s_xattr = ext2_xattr_handlers; 1067 sb->s_xattr = ext2_xattr_handlers;
1068
1069#ifdef CONFIG_QUOTA
1070 sb->dq_op = &dquot_operations;
1071 sb->s_qcop = &dquot_quotactl_ops;
1072#endif
1073
1066 root = ext2_iget(sb, EXT2_ROOT_INO); 1074 root = ext2_iget(sb, EXT2_ROOT_INO);
1067 if (IS_ERR(root)) { 1075 if (IS_ERR(root)) {
1068 ret = PTR_ERR(root); 1076 ret = PTR_ERR(root);
@@ -1241,6 +1249,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1241 spin_unlock(&sbi->s_lock); 1249 spin_unlock(&sbi->s_lock);
1242 return 0; 1250 return 0;
1243 } 1251 }
1252
1244 /* 1253 /*
1245 * OK, we are remounting a valid rw partition rdonly, so set 1254 * OK, we are remounting a valid rw partition rdonly, so set
1246 * the rdonly flag and then mark the partition as valid again. 1255 * the rdonly flag and then mark the partition as valid again.
@@ -1248,6 +1257,13 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1248 es->s_state = cpu_to_le16(sbi->s_mount_state); 1257 es->s_state = cpu_to_le16(sbi->s_mount_state);
1249 es->s_mtime = cpu_to_le32(get_seconds()); 1258 es->s_mtime = cpu_to_le32(get_seconds());
1250 spin_unlock(&sbi->s_lock); 1259 spin_unlock(&sbi->s_lock);
1260
1261 err = dquot_suspend(sb, -1);
1262 if (err < 0) {
1263 spin_lock(&sbi->s_lock);
1264 goto restore_opts;
1265 }
1266
1251 ext2_sync_super(sb, es, 1); 1267 ext2_sync_super(sb, es, 1);
1252 } else { 1268 } else {
1253 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb, 1269 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb,
@@ -1269,8 +1285,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1269 if (!ext2_setup_super (sb, es, 0)) 1285 if (!ext2_setup_super (sb, es, 0))
1270 sb->s_flags &= ~MS_RDONLY; 1286 sb->s_flags &= ~MS_RDONLY;
1271 spin_unlock(&sbi->s_lock); 1287 spin_unlock(&sbi->s_lock);
1288
1272 ext2_write_super(sb); 1289 ext2_write_super(sb);
1290
1291 dquot_resume(sb, -1);
1273 } 1292 }
1293
1274 return 0; 1294 return 0;
1275restore_opts: 1295restore_opts:
1276 sbi->s_mount_opt = old_opts.s_mount_opt; 1296 sbi->s_mount_opt = old_opts.s_mount_opt;
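
The remount hunk above encodes an ordering subtlety: quota must be suspended before the superblock is synced read-only (the quota files cannot be dirtied afterwards), and resumed only once the filesystem is writable again. A small userspace model of that ordering follows, with toy_* stand-ins for dquot_suspend()/dquot_resume(); none of these names are kernel API.

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the remount ordering the ext2/ext3 hunks introduce. */
enum quota_state { QUOTA_ON, QUOTA_SUSPENDED };

struct toy_sb {
	bool rdonly;
	enum quota_state quota;
};

static int toy_remount(struct toy_sb *sb, bool want_rdonly)
{
	if (want_rdonly && !sb->rdonly) {
		sb->quota = QUOTA_SUSPENDED;   /* dquot_suspend(sb, -1) first */
		sb->rdonly = true;             /* then sync super read-only */
	} else if (!want_rdonly && sb->rdonly) {
		sb->rdonly = false;            /* make writable first */
		sb->quota = QUOTA_ON;          /* then dquot_resume(sb, -1) */
	}
	return 0;
}

int main(void)
{
	struct toy_sb sb = { .rdonly = false, .quota = QUOTA_ON };

	toy_remount(&sb, true);
	printf("ro: quota %s\n", sb.quota == QUOTA_SUSPENDED ? "suspended" : "on");
	toy_remount(&sb, false);
	printf("rw: quota %s\n", sb.quota == QUOTA_ON ? "on" : "suspended");
	return 0;
}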
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 373fa90c796a..e2e72c367cf6 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -297,7 +297,7 @@ static void free_rb_tree_fname(struct rb_root *root)
297 kfree (old); 297 kfree (old);
298 } 298 }
299 if (!parent) 299 if (!parent)
300 root->rb_node = NULL; 300 *root = RB_ROOT;
301 else if (parent->rb_left == n) 301 else if (parent->rb_left == n)
302 parent->rb_left = NULL; 302 parent->rb_left = NULL;
303 else if (parent->rb_right == n) 303 else if (parent->rb_right == n)
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index fcf7487734b6..d7e9f74dc3a6 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -43,9 +43,9 @@
43 * inode to disk. 43 * inode to disk.
44 */ 44 */
45 45
46int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) 46int ext3_sync_file(struct file *file, int datasync)
47{ 47{
48 struct inode *inode = dentry->d_inode; 48 struct inode *inode = file->f_mapping->host;
49 struct ext3_inode_info *ei = EXT3_I(inode); 49 struct ext3_inode_info *ei = EXT3_I(inode);
50 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; 50 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
51 int ret, needs_barrier = 0; 51 int ret, needs_barrier = 0;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 0fc1293d0e96..6c953bb255e7 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -410,6 +410,8 @@ static void ext3_put_super (struct super_block * sb)
410 struct ext3_super_block *es = sbi->s_es; 410 struct ext3_super_block *es = sbi->s_es;
411 int i, err; 411 int i, err;
412 412
413 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
414
413 lock_kernel(); 415 lock_kernel();
414 416
415 ext3_xattr_put_super(sb); 417 ext3_xattr_put_super(sb);
@@ -748,7 +750,7 @@ static int ext3_release_dquot(struct dquot *dquot);
748static int ext3_mark_dquot_dirty(struct dquot *dquot); 750static int ext3_mark_dquot_dirty(struct dquot *dquot);
749static int ext3_write_info(struct super_block *sb, int type); 751static int ext3_write_info(struct super_block *sb, int type);
750static int ext3_quota_on(struct super_block *sb, int type, int format_id, 752static int ext3_quota_on(struct super_block *sb, int type, int format_id,
751 char *path, int remount); 753 char *path);
752static int ext3_quota_on_mount(struct super_block *sb, int type); 754static int ext3_quota_on_mount(struct super_block *sb, int type);
753static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, 755static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
754 size_t len, loff_t off); 756 size_t len, loff_t off);
@@ -767,12 +769,12 @@ static const struct dquot_operations ext3_quota_operations = {
767 769
768static const struct quotactl_ops ext3_qctl_operations = { 770static const struct quotactl_ops ext3_qctl_operations = {
769 .quota_on = ext3_quota_on, 771 .quota_on = ext3_quota_on,
770 .quota_off = vfs_quota_off, 772 .quota_off = dquot_quota_off,
771 .quota_sync = vfs_quota_sync, 773 .quota_sync = dquot_quota_sync,
772 .get_info = vfs_get_dqinfo, 774 .get_info = dquot_get_dqinfo,
773 .set_info = vfs_set_dqinfo, 775 .set_info = dquot_set_dqinfo,
774 .get_dqblk = vfs_get_dqblk, 776 .get_dqblk = dquot_get_dqblk,
775 .set_dqblk = vfs_set_dqblk 777 .set_dqblk = dquot_set_dqblk
776}; 778};
777#endif 779#endif
778 780
@@ -1527,7 +1529,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1527 /* Turn quotas off */ 1529 /* Turn quotas off */
1528 for (i = 0; i < MAXQUOTAS; i++) { 1530 for (i = 0; i < MAXQUOTAS; i++) {
1529 if (sb_dqopt(sb)->files[i]) 1531 if (sb_dqopt(sb)->files[i])
1530 vfs_quota_off(sb, i, 0); 1532 dquot_quota_off(sb, i);
1531 } 1533 }
1532#endif 1534#endif
1533 sb->s_flags = s_flags; /* Restore MS_RDONLY status */ 1535 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -2551,6 +2553,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2551 ext3_fsblk_t n_blocks_count = 0; 2553 ext3_fsblk_t n_blocks_count = 0;
2552 unsigned long old_sb_flags; 2554 unsigned long old_sb_flags;
2553 struct ext3_mount_options old_opts; 2555 struct ext3_mount_options old_opts;
2556 int enable_quota = 0;
2554 int err; 2557 int err;
2555#ifdef CONFIG_QUOTA 2558#ifdef CONFIG_QUOTA
2556 int i; 2559 int i;
@@ -2597,6 +2600,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2597 } 2600 }
2598 2601
2599 if (*flags & MS_RDONLY) { 2602 if (*flags & MS_RDONLY) {
2603 err = dquot_suspend(sb, -1);
2604 if (err < 0)
2605 goto restore_opts;
2606
2600 /* 2607 /*
2601 * First of all, the unconditional stuff we have to do 2608 * First of all, the unconditional stuff we have to do
2602 * to disable replay of the journal when we next remount 2609 * to disable replay of the journal when we next remount
@@ -2651,6 +2658,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2651 goto restore_opts; 2658 goto restore_opts;
2652 if (!ext3_setup_super (sb, es, 0)) 2659 if (!ext3_setup_super (sb, es, 0))
2653 sb->s_flags &= ~MS_RDONLY; 2660 sb->s_flags &= ~MS_RDONLY;
2661 enable_quota = 1;
2654 } 2662 }
2655 } 2663 }
2656#ifdef CONFIG_QUOTA 2664#ifdef CONFIG_QUOTA
@@ -2662,6 +2670,9 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2662#endif 2670#endif
2663 unlock_super(sb); 2671 unlock_super(sb);
2664 unlock_kernel(); 2672 unlock_kernel();
2673
2674 if (enable_quota)
2675 dquot_resume(sb, -1);
2665 return 0; 2676 return 0;
2666restore_opts: 2677restore_opts:
2667 sb->s_flags = old_sb_flags; 2678 sb->s_flags = old_sb_flags;
@@ -2851,24 +2862,21 @@ static int ext3_write_info(struct super_block *sb, int type)
2851 */ 2862 */
2852static int ext3_quota_on_mount(struct super_block *sb, int type) 2863static int ext3_quota_on_mount(struct super_block *sb, int type)
2853{ 2864{
2854 return vfs_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], 2865 return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
2855 EXT3_SB(sb)->s_jquota_fmt, type); 2866 EXT3_SB(sb)->s_jquota_fmt, type);
2856} 2867}
2857 2868
2858/* 2869/*
2859 * Standard function to be called on quota_on 2870 * Standard function to be called on quota_on
2860 */ 2871 */
2861static int ext3_quota_on(struct super_block *sb, int type, int format_id, 2872static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2862 char *name, int remount) 2873 char *name)
2863{ 2874{
2864 int err; 2875 int err;
2865 struct path path; 2876 struct path path;
2866 2877
2867 if (!test_opt(sb, QUOTA)) 2878 if (!test_opt(sb, QUOTA))
2868 return -EINVAL; 2879 return -EINVAL;
2869 /* When remounting, no checks are needed and in fact, name is NULL */
2870 if (remount)
2871 return vfs_quota_on(sb, type, format_id, name, remount);
2872 2880
2873 err = kern_path(name, LOOKUP_FOLLOW, &path); 2881 err = kern_path(name, LOOKUP_FOLLOW, &path);
2874 if (err) 2882 if (err)
@@ -2906,7 +2914,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2906 } 2914 }
2907 } 2915 }
2908 2916
2909 err = vfs_quota_on_path(sb, type, format_id, &path); 2917 err = dquot_quota_on_path(sb, type, format_id, &path);
2910 path_put(&path); 2918 path_put(&path);
2911 return err; 2919 return err;
2912} 2920}
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d2f37a5516c7..95b7594c76f9 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -591,14 +591,15 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
591 ret = ext4_mb_new_blocks(handle, &ar, errp); 591 ret = ext4_mb_new_blocks(handle, &ar, errp);
592 if (count) 592 if (count)
593 *count = ar.len; 593 *count = ar.len;
594
595 /* 594 /*
596 * Account for the allocated meta blocks 595 * Account for the allocated meta blocks. We will never
 596	 * fail EDQUOT for metadata, but we do account for it.
597 */ 597 */
598 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { 598 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
599 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 599 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
600 EXT4_I(inode)->i_allocated_meta_blocks += ar.len; 600 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
601 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 601 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
602 dquot_alloc_block_nofail(inode, ar.len);
602 } 603 }
603 return ret; 604 return ret;
604} 605}
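
The comment added above distinguishes two quota-charging policies: data allocations may fail with EDQUOT, while metadata charged via dquot_alloc_block_nofail() is always accounted even if that pushes usage past the limit. A compact userspace sketch of the two policies (the toy_* names are illustrative, not kernel API):

#include <stdio.h>

struct toy_quota {
	long long used;
	long long limit;
};

/* Data blocks: may fail with -1 (think -EDQUOT). */
static int toy_alloc_block(struct toy_quota *q, long long n)
{
	if (q->used + n > q->limit)
		return -1;
	q->used += n;
	return 0;
}

/* Metadata blocks: always charged, never failed. */
static void toy_alloc_block_nofail(struct toy_quota *q, long long n)
{
	q->used += n;
}

int main(void)
{
	struct toy_quota q = { .used = 9, .limit = 10 };

	printf("data alloc of 2: %d\n", toy_alloc_block(&q, 2));   /* fails */
	toy_alloc_block_nofail(&q, 2);                             /* charged anyway */
	printf("used after metadata alloc: %lld/%lld\n", q.used, q.limit);
	return 0;
}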
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 538c48655084..5b6973fbf1bd 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -72,9 +72,9 @@ static int add_system_zone(struct ext4_sb_info *sbi,
72 else if (start_blk >= (entry->start_blk + entry->count)) 72 else if (start_blk >= (entry->start_blk + entry->count))
73 n = &(*n)->rb_right; 73 n = &(*n)->rb_right;
74 else { 74 else {
75 if (start_blk + count > (entry->start_blk + 75 if (start_blk + count > (entry->start_blk +
76 entry->count)) 76 entry->count))
77 entry->count = (start_blk + count - 77 entry->count = (start_blk + count -
78 entry->start_blk); 78 entry->start_blk);
79 new_node = *n; 79 new_node = *n;
80 new_entry = rb_entry(new_node, struct ext4_system_zone, 80 new_entry = rb_entry(new_node, struct ext4_system_zone,
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 86cb6d86a048..ea5e6cb7e2a5 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,11 +83,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
83 error_msg = "inode out of bounds"; 83 error_msg = "inode out of bounds";
84 84
85 if (error_msg != NULL) 85 if (error_msg != NULL)
86 __ext4_error(dir->i_sb, function, 86 ext4_error_inode(function, dir,
87 "bad entry in directory #%lu: %s - block=%llu" 87 "bad entry in directory: %s - block=%llu"
88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", 88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
89 dir->i_ino, error_msg, 89 error_msg, (unsigned long long) bh->b_blocknr,
90 (unsigned long long) bh->b_blocknr,
91 (unsigned) (offset%bh->b_size), offset, 90 (unsigned) (offset%bh->b_size), offset,
92 le32_to_cpu(de->inode), 91 le32_to_cpu(de->inode),
93 rlen, de->name_len); 92 rlen, de->name_len);
@@ -111,7 +110,7 @@ static int ext4_readdir(struct file *filp,
111 110
112 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 111 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
113 EXT4_FEATURE_COMPAT_DIR_INDEX) && 112 EXT4_FEATURE_COMPAT_DIR_INDEX) &&
114 ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) || 113 ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
115 ((inode->i_size >> sb->s_blocksize_bits) == 1))) { 114 ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
116 err = ext4_dx_readdir(filp, dirent, filldir); 115 err = ext4_dx_readdir(filp, dirent, filldir);
117 if (err != ERR_BAD_DX_DIR) { 116 if (err != ERR_BAD_DX_DIR) {
@@ -122,20 +121,20 @@ static int ext4_readdir(struct file *filp,
122 * We don't set the inode dirty flag since it's not 121 * We don't set the inode dirty flag since it's not
123 * critical that it get flushed back to the disk. 122 * critical that it get flushed back to the disk.
124 */ 123 */
125 EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL; 124 ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX);
126 } 125 }
127 stored = 0; 126 stored = 0;
128 offset = filp->f_pos & (sb->s_blocksize - 1); 127 offset = filp->f_pos & (sb->s_blocksize - 1);
129 128
130 while (!error && !stored && filp->f_pos < inode->i_size) { 129 while (!error && !stored && filp->f_pos < inode->i_size) {
131 ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); 130 struct ext4_map_blocks map;
132 struct buffer_head map_bh;
133 struct buffer_head *bh = NULL; 131 struct buffer_head *bh = NULL;
134 132
135 map_bh.b_state = 0; 133 map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
136 err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0); 134 map.m_len = 1;
135 err = ext4_map_blocks(NULL, inode, &map, 0);
137 if (err > 0) { 136 if (err > 0) {
138 pgoff_t index = map_bh.b_blocknr >> 137 pgoff_t index = map.m_pblk >>
139 (PAGE_CACHE_SHIFT - inode->i_blkbits); 138 (PAGE_CACHE_SHIFT - inode->i_blkbits);
140 if (!ra_has_index(&filp->f_ra, index)) 139 if (!ra_has_index(&filp->f_ra, index))
141 page_cache_sync_readahead( 140 page_cache_sync_readahead(
@@ -143,7 +142,7 @@ static int ext4_readdir(struct file *filp,
143 &filp->f_ra, filp, 142 &filp->f_ra, filp,
144 index, 1); 143 index, 1);
145 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 144 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
146 bh = ext4_bread(NULL, inode, blk, 0, &err); 145 bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
147 } 146 }
148 147
149 /* 148 /*
@@ -152,9 +151,8 @@ static int ext4_readdir(struct file *filp,
152 */ 151 */
153 if (!bh) { 152 if (!bh) {
154 if (!dir_has_error) { 153 if (!dir_has_error) {
155 ext4_error(sb, "directory #%lu " 154 EXT4_ERROR_INODE(inode, "directory "
156 "contains a hole at offset %Lu", 155 "contains a hole at offset %Lu",
157 inode->i_ino,
158 (unsigned long long) filp->f_pos); 156 (unsigned long long) filp->f_pos);
159 dir_has_error = 1; 157 dir_has_error = 1;
160 } 158 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf938cf7c5f0..19a4de57128a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,9 @@
29#include <linux/wait.h> 29#include <linux/wait.h>
30#include <linux/blockgroup_lock.h> 30#include <linux/blockgroup_lock.h>
31#include <linux/percpu_counter.h> 31#include <linux/percpu_counter.h>
32#ifdef __KERNEL__
33#include <linux/compat.h>
34#endif
32 35
33/* 36/*
34 * The fourth extended filesystem constants/structures 37 * The fourth extended filesystem constants/structures
@@ -54,10 +57,10 @@
54#endif 57#endif
55 58
56#define EXT4_ERROR_INODE(inode, fmt, a...) \ 59#define EXT4_ERROR_INODE(inode, fmt, a...) \
57 ext4_error_inode(__func__, (inode), (fmt), ## a); 60 ext4_error_inode(__func__, (inode), (fmt), ## a)
58 61
59#define EXT4_ERROR_FILE(file, fmt, a...) \ 62#define EXT4_ERROR_FILE(file, fmt, a...) \
60 ext4_error_file(__func__, (file), (fmt), ## a); 63 ext4_error_file(__func__, (file), (fmt), ## a)
61 64
62/* data type for block offset of block group */ 65/* data type for block offset of block group */
63typedef int ext4_grpblk_t; 66typedef int ext4_grpblk_t;
@@ -72,7 +75,7 @@ typedef __u32 ext4_lblk_t;
72typedef unsigned int ext4_group_t; 75typedef unsigned int ext4_group_t;
73 76
74/* 77/*
75 * Flags used in mballoc's allocation_context flags field. 78 * Flags used in mballoc's allocation_context flags field.
76 * 79 *
77 * Also used to show what's going on for debugging purposes when the 80 * Also used to show what's going on for debugging purposes when the
 78 * flag field is exported via the tracepoint interface 81 *
@@ -126,6 +129,29 @@ struct ext4_allocation_request {
126}; 129};
127 130
128/* 131/*
132 * Logical to physical block mapping, used by ext4_map_blocks()
133 *
134 * This structure is used to pass requests into ext4_map_blocks() as
135 * well as to store the information returned by ext4_map_blocks(). It
136 * takes less room on the stack than a struct buffer_head.
137 */
138#define EXT4_MAP_NEW (1 << BH_New)
139#define EXT4_MAP_MAPPED (1 << BH_Mapped)
140#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
141#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
142#define EXT4_MAP_UNINIT (1 << BH_Uninit)
143#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
144 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
145 EXT4_MAP_UNINIT)
146
147struct ext4_map_blocks {
148 ext4_fsblk_t m_pblk;
149 ext4_lblk_t m_lblk;
150 unsigned int m_len;
151 unsigned int m_flags;
152};
153
154/*
129 * For delayed allocation tracking 155 * For delayed allocation tracking
130 */ 156 */
131struct mpage_da_data { 157struct mpage_da_data {
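
struct ext4_map_blocks is a pure request/response pair: callers fill m_lblk/m_len, ext4_map_blocks() fills m_pblk/m_flags and returns the number of blocks mapped, exactly the shape used in the ext4_readdir() hunk earlier. A self-contained userspace sketch of that calling convention; the toy_* names and the identity-style mapping are assumptions for illustration only.

#include <stdio.h>

typedef unsigned long long fsblk_t;

struct toy_map_blocks {
	fsblk_t m_pblk;        /* out: first physical block */
	unsigned int m_lblk;   /* in:  first logical block */
	unsigned int m_len;    /* in:  number of blocks wanted */
	unsigned int m_flags;  /* out: MAPPED/NEW/... style flags */
};

#define TOY_MAP_MAPPED 0x1

/* Pretend every logical block maps into an extent starting at pblk 1000. */
static int toy_map_blocks(struct toy_map_blocks *map)
{
	map->m_pblk = 1000 + map->m_lblk;
	map->m_flags = TOY_MAP_MAPPED;
	return (int)map->m_len;          /* number of blocks mapped */
}

int main(void)
{
	struct toy_map_blocks map = { .m_lblk = 7, .m_len = 1 };
	int n = toy_map_blocks(&map);

	printf("mapped %d block(s): lblk %u -> pblk %llu (flags 0x%x)\n",
	       n, map.m_lblk, map.m_pblk, map.m_flags);
	return 0;
}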
@@ -321,6 +347,83 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
321 return flags & EXT4_OTHER_FLMASK; 347 return flags & EXT4_OTHER_FLMASK;
322} 348}
323 349
350/*
351 * Inode flags used for atomic set/get
352 */
353enum {
354 EXT4_INODE_SECRM = 0, /* Secure deletion */
355 EXT4_INODE_UNRM = 1, /* Undelete */
356 EXT4_INODE_COMPR = 2, /* Compress file */
357 EXT4_INODE_SYNC = 3, /* Synchronous updates */
358 EXT4_INODE_IMMUTABLE = 4, /* Immutable file */
359 EXT4_INODE_APPEND = 5, /* writes to file may only append */
360 EXT4_INODE_NODUMP = 6, /* do not dump file */
361 EXT4_INODE_NOATIME = 7, /* do not update atime */
362/* Reserved for compression usage... */
363 EXT4_INODE_DIRTY = 8,
364 EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
365 EXT4_INODE_NOCOMPR = 10, /* Don't compress */
366 EXT4_INODE_ECOMPR = 11, /* Compression error */
367/* End compression flags --- maybe not all used */
368 EXT4_INODE_INDEX = 12, /* hash-indexed directory */
369 EXT4_INODE_IMAGIC = 13, /* AFS directory */
370 EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */
371 EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */
372 EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */
373 EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/
374 EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */
375 EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
376 EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
377 EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
378 EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
379};
380
381#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
382#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
383 printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
384 EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
385
386/*
387 * Since it's pretty easy to mix up bit numbers and hex values, and we
388 * can't do a compile-time test for ENUM values, we use a run-time
389 * test to make sure that EXT4_XXX_FL is consistent with respect to
390 * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop
391 * out so it won't cost any extra space in the compiled kernel image.
392 * But it's important that these values are the same, since we are
393 * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
394 * must be consistent with the values of FS_XXX_FL defined in
395 * include/linux/fs.h and the on-disk values found in ext2, ext3, and
396 * ext4 filesystems, and of course the values defined in e2fsprogs.
397 *
 398 * It's not paranoia if Murphy's Law really *is* out to get you. :-)
399 */
400static inline void ext4_check_flag_values(void)
401{
402 CHECK_FLAG_VALUE(SECRM);
403 CHECK_FLAG_VALUE(UNRM);
404 CHECK_FLAG_VALUE(COMPR);
405 CHECK_FLAG_VALUE(SYNC);
406 CHECK_FLAG_VALUE(IMMUTABLE);
407 CHECK_FLAG_VALUE(APPEND);
408 CHECK_FLAG_VALUE(NODUMP);
409 CHECK_FLAG_VALUE(NOATIME);
410 CHECK_FLAG_VALUE(DIRTY);
411 CHECK_FLAG_VALUE(COMPRBLK);
412 CHECK_FLAG_VALUE(NOCOMPR);
413 CHECK_FLAG_VALUE(ECOMPR);
414 CHECK_FLAG_VALUE(INDEX);
415 CHECK_FLAG_VALUE(IMAGIC);
416 CHECK_FLAG_VALUE(JOURNAL_DATA);
417 CHECK_FLAG_VALUE(NOTAIL);
418 CHECK_FLAG_VALUE(DIRSYNC);
419 CHECK_FLAG_VALUE(TOPDIR);
420 CHECK_FLAG_VALUE(HUGE_FILE);
421 CHECK_FLAG_VALUE(EXTENTS);
422 CHECK_FLAG_VALUE(EA_INODE);
423 CHECK_FLAG_VALUE(EOFBLOCKS);
424 CHECK_FLAG_VALUE(RESERVED);
425}
426
324/* Used to pass group descriptor data when online resize is done */ 427/* Used to pass group descriptor data when online resize is done */
325struct ext4_new_group_input { 428struct ext4_new_group_input {
326 __u32 group; /* Group number for this data */ 429 __u32 group; /* Group number for this data */
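
The CHECK_FLAG_VALUE trick above works because the comparison between a mask and a shifted bit number is a compile-time constant: when the values agree, the branch folds away and costs nothing in the compiled image. The same pattern in miniature, as a standalone program (the TOY_* names are illustrative stand-ins):

#include <stdio.h>
#include <stdlib.h>

/* Bit numbers (TOY_INODE_x) must stay consistent with bit masks
 * (TOY_x_FL); when they match, the check is constant-folded to nothing. */
#define TOY_SYNC_FL		0x00000008
#define TOY_INODE_SYNC		3

#define TOY_TEST_FLAG_VALUE(FLAG) (TOY_##FLAG##_FL == (1 << TOY_INODE_##FLAG))
#define TOY_CHECK_FLAG_VALUE(FLAG) do { \
	if (!TOY_TEST_FLAG_VALUE(FLAG)) { \
		fprintf(stderr, "flag mismatch: " #FLAG "\n"); \
		abort(); \
	} \
} while (0)

int main(void)
{
	TOY_CHECK_FLAG_VALUE(SYNC); /* folds to nothing when consistent */
	puts("flag values consistent");
	return 0;
}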
@@ -332,6 +435,18 @@ struct ext4_new_group_input {
332 __u16 unused; 435 __u16 unused;
333}; 436};
334 437
438#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
439struct compat_ext4_new_group_input {
440 u32 group;
441 compat_u64 block_bitmap;
442 compat_u64 inode_bitmap;
443 compat_u64 inode_table;
444 u32 blocks_count;
445 u16 reserved_blocks;
446 u16 unused;
447};
448#endif
449
335/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ 450/* The struct ext4_new_group_input in kernel space, with free_blocks_count */
336struct ext4_new_group_data { 451struct ext4_new_group_data {
337 __u32 group; 452 __u32 group;
@@ -355,7 +470,7 @@ struct ext4_new_group_data {
355#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ 470#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
356 EXT4_GET_BLOCKS_CREATE) 471 EXT4_GET_BLOCKS_CREATE)
357 /* Caller is from the delayed allocation writeout path, 472 /* Caller is from the delayed allocation writeout path,
358 so set the magic i_delalloc_reserve_flag after taking the 473 so set the magic i_delalloc_reserve_flag after taking the
359 inode allocation semaphore for */ 474 inode allocation semaphore for */
360#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 475#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
361 /* caller is from the direct IO path, request to creation of an 476 /* caller is from the direct IO path, request to creation of an
@@ -398,6 +513,7 @@ struct ext4_new_group_data {
398#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) 513#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
399#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) 514#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
400 515
516#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
401/* 517/*
402 * ioctl commands in 32 bit emulation 518 * ioctl commands in 32 bit emulation
403 */ 519 */
@@ -408,11 +524,13 @@ struct ext4_new_group_data {
408#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) 524#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
409#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) 525#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
410#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) 526#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
527#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input)
411#ifdef CONFIG_JBD2_DEBUG 528#ifdef CONFIG_JBD2_DEBUG
412#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) 529#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
413#endif 530#endif
414#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION 531#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
415#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION 532#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
533#endif
416 534
417 535
418/* 536/*
@@ -616,9 +734,8 @@ struct ext4_ext_cache {
616 */ 734 */
617struct ext4_inode_info { 735struct ext4_inode_info {
618 __le32 i_data[15]; /* unconverted */ 736 __le32 i_data[15]; /* unconverted */
619 __u32 i_flags;
620 ext4_fsblk_t i_file_acl;
621 __u32 i_dtime; 737 __u32 i_dtime;
738 ext4_fsblk_t i_file_acl;
622 739
623 /* 740 /*
624 * i_block_group is the number of the block group which contains 741 * i_block_group is the number of the block group which contains
@@ -629,6 +746,7 @@ struct ext4_inode_info {
629 */ 746 */
630 ext4_group_t i_block_group; 747 ext4_group_t i_block_group;
631 unsigned long i_state_flags; /* Dynamic state flags */ 748 unsigned long i_state_flags; /* Dynamic state flags */
749 unsigned long i_flags;
632 750
633 ext4_lblk_t i_dir_start_lookup; 751 ext4_lblk_t i_dir_start_lookup;
634#ifdef CONFIG_EXT4_FS_XATTR 752#ifdef CONFIG_EXT4_FS_XATTR
@@ -1062,22 +1180,25 @@ enum {
1062 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ 1180 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
1063 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ 1181 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
1064 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ 1182 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
1183 EXT4_STATE_NEWENTRY, /* File just added to dir */
1065}; 1184};
1066 1185
1067static inline int ext4_test_inode_state(struct inode *inode, int bit) 1186#define EXT4_INODE_BIT_FNS(name, field) \
1068{ 1187static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
1069 return test_bit(bit, &EXT4_I(inode)->i_state_flags); 1188{ \
1070} 1189 return test_bit(bit, &EXT4_I(inode)->i_##field); \
1071 1190} \
1072static inline void ext4_set_inode_state(struct inode *inode, int bit) 1191static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
1073{ 1192{ \
1074 set_bit(bit, &EXT4_I(inode)->i_state_flags); 1193 set_bit(bit, &EXT4_I(inode)->i_##field); \
1194} \
1195static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
1196{ \
1197 clear_bit(bit, &EXT4_I(inode)->i_##field); \
1075} 1198}
1076 1199
1077static inline void ext4_clear_inode_state(struct inode *inode, int bit) 1200EXT4_INODE_BIT_FNS(flag, flags)
1078{ 1201EXT4_INODE_BIT_FNS(state, state_flags)
1079 clear_bit(bit, &EXT4_I(inode)->i_state_flags);
1080}
1081#else 1202#else
1082/* Assume that user mode programs are passing in an ext4fs superblock, not 1203/* Assume that user mode programs are passing in an ext4fs superblock, not
1083 * a kernel struct super_block. This will allow us to call the feature-test 1204 * a kernel struct super_block. This will allow us to call the feature-test
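
EXT4_INODE_BIT_FNS stamps out a test/set/clear triplet per flags word, so i_flags and i_state_flags get identical accessors from one definition. A userspace re-creation of the pattern, using plain bit operations in place of the kernel's atomic test_bit()/set_bit()/clear_bit(); the toy_* names are illustrative only.

#include <stdio.h>

struct toy_inode {
	unsigned long flags;
	unsigned long state_flags;
};

#define TOY_INODE_BIT_FNS(name, field)					\
static int toy_test_inode_##name(struct toy_inode *i, int bit)		\
{ return (i->field >> bit) & 1UL; }					\
static void toy_set_inode_##name(struct toy_inode *i, int bit)		\
{ i->field |= 1UL << bit; }						\
static void toy_clear_inode_##name(struct toy_inode *i, int bit)	\
{ i->field &= ~(1UL << bit); }

TOY_INODE_BIT_FNS(flag, flags)
TOY_INODE_BIT_FNS(state, state_flags)

int main(void)
{
	struct toy_inode ino = { 0, 0 };

	toy_set_inode_flag(&ino, 12);          /* e.g. the INDEX bit */
	printf("flag 12: %d\n", toy_test_inode_flag(&ino, 12));
	toy_clear_inode_flag(&ino, 12);
	printf("flag 12: %d\n", toy_test_inode_flag(&ino, 12));
	/* the state triplet is generated the same way */
	(void)toy_test_inode_state;
	(void)toy_set_inode_state;
	(void)toy_clear_inode_state;
	return 0;
}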
@@ -1264,7 +1385,7 @@ struct ext4_dir_entry_2 {
1264 1385
1265#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ 1386#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
1266 EXT4_FEATURE_COMPAT_DIR_INDEX) && \ 1387 EXT4_FEATURE_COMPAT_DIR_INDEX) && \
1267 (EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) 1388 ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
1268#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) 1389#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
1269#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) 1390#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
1270 1391
@@ -1398,7 +1519,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1398extern void ext4_htree_free_dir_info(struct dir_private_info *p); 1519extern void ext4_htree_free_dir_info(struct dir_private_info *p);
1399 1520
1400/* fsync.c */ 1521/* fsync.c */
1401extern int ext4_sync_file(struct file *, struct dentry *, int); 1522extern int ext4_sync_file(struct file *, int);
1402 1523
1403/* hash.c */ 1524/* hash.c */
1404extern int ext4fs_dirhash(const char *name, int len, struct 1525extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1678,6 +1799,7 @@ struct ext4_group_info {
1678 ext4_grpblk_t bb_first_free; /* first free block */ 1799 ext4_grpblk_t bb_first_free; /* first free block */
1679 ext4_grpblk_t bb_free; /* total free blocks */ 1800 ext4_grpblk_t bb_free; /* total free blocks */
1680 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ 1801 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
1802 ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
1681 struct list_head bb_prealloc_list; 1803 struct list_head bb_prealloc_list;
1682#ifdef DOUBLE_CHECK 1804#ifdef DOUBLE_CHECK
1683 void *bb_bitmap; 1805 void *bb_bitmap;
@@ -1772,9 +1894,8 @@ extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
1772extern int ext4_ext_writepage_trans_blocks(struct inode *, int); 1894extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
1773extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, 1895extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1774 int chunk); 1896 int chunk);
1775extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 1897extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
1776 ext4_lblk_t iblock, unsigned int max_blocks, 1898 struct ext4_map_blocks *map, int flags);
1777 struct buffer_head *bh_result, int flags);
1778extern void ext4_ext_truncate(struct inode *); 1899extern void ext4_ext_truncate(struct inode *);
1779extern void ext4_ext_init(struct super_block *); 1900extern void ext4_ext_init(struct super_block *);
1780extern void ext4_ext_release(struct super_block *); 1901extern void ext4_ext_release(struct super_block *);
@@ -1782,6 +1903,8 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1782 loff_t len); 1903 loff_t len);
1783extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 1904extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1784 ssize_t len); 1905 ssize_t len);
1906extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
1907 struct ext4_map_blocks *map, int flags);
1785extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1908extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1786 sector_t block, unsigned int max_blocks, 1909 sector_t block, unsigned int max_blocks,
1787 struct buffer_head *bh, int flags); 1910 struct buffer_head *bh, int flags);
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b79ad5126468..dade0c024797 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -273,7 +273,7 @@ static inline int ext4_should_journal_data(struct inode *inode)
273 return 1; 273 return 1;
274 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 274 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
275 return 1; 275 return 1;
276 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 276 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
277 return 1; 277 return 1;
278 return 0; 278 return 0;
279} 279}
@@ -284,7 +284,7 @@ static inline int ext4_should_order_data(struct inode *inode)
284 return 0; 284 return 0;
285 if (!S_ISREG(inode->i_mode)) 285 if (!S_ISREG(inode->i_mode))
286 return 0; 286 return 0;
287 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 287 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
288 return 0; 288 return 0;
289 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) 289 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
290 return 1; 290 return 1;
@@ -297,7 +297,7 @@ static inline int ext4_should_writeback_data(struct inode *inode)
297 return 0; 297 return 0;
298 if (EXT4_JOURNAL(inode) == NULL) 298 if (EXT4_JOURNAL(inode) == NULL)
299 return 1; 299 return 1;
300 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 300 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
301 return 0; 301 return 0;
302 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) 302 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
303 return 1; 303 return 1;
@@ -321,7 +321,7 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
321 return 0; 321 return 0;
322 if (!S_ISREG(inode->i_mode)) 322 if (!S_ISREG(inode->i_mode))
323 return 0; 323 return 0;
324 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 324 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
325 return 0; 325 return 0;
326 if (ext4_should_journal_data(inode)) 326 if (ext4_should_journal_data(inode))
327 return 0; 327 return 0;
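
The four helpers above share one decision chain: the per-inode EXT4_INODE_JOURNAL_DATA flag overrides the mount-wide data mode, and non-regular files are always journaled. A toy model of that chain (the toy_* names and the simplified mode enum are assumptions, not kernel API):

#include <stdbool.h>
#include <stdio.h>

enum toy_mode { TOY_JOURNAL, TOY_ORDERED, TOY_WRITEBACK };

struct toy_inode {
	bool is_reg;
	bool journal_data_flag; /* models EXT4_INODE_JOURNAL_DATA */
	enum toy_mode mount_mode;
};

static bool toy_should_journal_data(const struct toy_inode *i)
{
	if (!i->is_reg)
		return true; /* directories etc. are always journaled */
	if (i->mount_mode == TOY_JOURNAL)
		return true;
	if (i->journal_data_flag)
		return true; /* per-inode flag wins over mount mode */
	return false;
}

int main(void)
{
	struct toy_inode a = { true, false, TOY_ORDERED };
	struct toy_inode b = { true, true, TOY_WRITEBACK };

	printf("ordered file journals data: %d\n", toy_should_journal_data(&a));
	printf("flagged file journals data: %d\n", toy_should_journal_data(&b));
	return 0;
}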
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 236b834b4ca8..377309c1af65 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
107 if (err <= 0) 107 if (err <= 0)
108 return err; 108 return err;
109 err = ext4_truncate_restart_trans(handle, inode, needed); 109 err = ext4_truncate_restart_trans(handle, inode, needed);
110 /* 110 if (err == 0)
111 * We have dropped i_data_sem so someone might have cached again 111 err = -EAGAIN;
112 * an extent we are going to truncate.
113 */
114 ext4_ext_invalidate_cache(inode);
115 112
116 return err; 113 return err;
117} 114}
@@ -185,10 +182,10 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
185 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { 182 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
186 /* 183 /*
187 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 184 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
188 * block groups per flexgroup, reserve the first block 185 * block groups per flexgroup, reserve the first block
189 * group for directories and special files. Regular 186 * group for directories and special files. Regular
190 * files will start at the second block group. This 187 * files will start at the second block group. This
191 * tends to speed up directory access and improves 188 * tends to speed up directory access and improves
192 * fsck times. 189 * fsck times.
193 */ 190 */
194 block_group &= ~(flex_size-1); 191 block_group &= ~(flex_size-1);
@@ -439,10 +436,10 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
439 return 0; 436 return 0;
440 437
441corrupted: 438corrupted:
442 __ext4_error(inode->i_sb, function, 439 ext4_error_inode(function, inode,
443 "bad header/extent in inode #%lu: %s - magic %x, " 440 "bad header/extent: %s - magic %x, "
444 "entries %u, max %u(%u), depth %u(%u)", 441 "entries %u, max %u(%u), depth %u(%u)",
445 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), 442 error_msg, le16_to_cpu(eh->eh_magic),
446 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), 443 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
447 max, le16_to_cpu(eh->eh_depth), depth); 444 max, le16_to_cpu(eh->eh_depth), depth);
448 445
@@ -1622,9 +1619,7 @@ int ext4_ext_try_to_merge(struct inode *inode,
1622 merge_done = 1; 1619 merge_done = 1;
1623 WARN_ON(eh->eh_entries == 0); 1620 WARN_ON(eh->eh_entries == 0);
1624 if (!eh->eh_entries) 1621 if (!eh->eh_entries)
1625 ext4_error(inode->i_sb, 1622 EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1626 "inode#%lu, eh->eh_entries = 0!",
1627 inode->i_ino);
1628 } 1623 }
1629 1624
1630 return merge_done; 1625 return merge_done;
@@ -2039,7 +2034,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2039 struct ext4_ext_cache *cex; 2034 struct ext4_ext_cache *cex;
2040 int ret = EXT4_EXT_CACHE_NO; 2035 int ret = EXT4_EXT_CACHE_NO;
2041 2036
2042 /* 2037 /*
2043 * We borrow i_block_reservation_lock to protect i_cached_extent 2038 * We borrow i_block_reservation_lock to protect i_cached_extent
2044 */ 2039 */
2045 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 2040 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2361,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2361 int depth = ext_depth(inode); 2356 int depth = ext_depth(inode);
2362 struct ext4_ext_path *path; 2357 struct ext4_ext_path *path;
2363 handle_t *handle; 2358 handle_t *handle;
2364 int i = 0, err = 0; 2359 int i, err;
2365 2360
2366 ext_debug("truncate since %u\n", start); 2361 ext_debug("truncate since %u\n", start);
2367 2362
@@ -2370,23 +2365,26 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2370 if (IS_ERR(handle)) 2365 if (IS_ERR(handle))
2371 return PTR_ERR(handle); 2366 return PTR_ERR(handle);
2372 2367
2368again:
2373 ext4_ext_invalidate_cache(inode); 2369 ext4_ext_invalidate_cache(inode);
2374 2370
2375 /* 2371 /*
2376 * We start scanning from right side, freeing all the blocks 2372 * We start scanning from right side, freeing all the blocks
2377 * after i_size and walking into the tree depth-wise. 2373 * after i_size and walking into the tree depth-wise.
2378 */ 2374 */
2375 depth = ext_depth(inode);
2379 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); 2376 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2380 if (path == NULL) { 2377 if (path == NULL) {
2381 ext4_journal_stop(handle); 2378 ext4_journal_stop(handle);
2382 return -ENOMEM; 2379 return -ENOMEM;
2383 } 2380 }
2381 path[0].p_depth = depth;
2384 path[0].p_hdr = ext_inode_hdr(inode); 2382 path[0].p_hdr = ext_inode_hdr(inode);
2385 if (ext4_ext_check(inode, path[0].p_hdr, depth)) { 2383 if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2386 err = -EIO; 2384 err = -EIO;
2387 goto out; 2385 goto out;
2388 } 2386 }
2389 path[0].p_depth = depth; 2387 i = err = 0;
2390 2388
2391 while (i >= 0 && err == 0) { 2389 while (i >= 0 && err == 0) {
2392 if (i == depth) { 2390 if (i == depth) {
@@ -2480,6 +2478,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2480out: 2478out:
2481 ext4_ext_drop_refs(path); 2479 ext4_ext_drop_refs(path);
2482 kfree(path); 2480 kfree(path);
2481 if (err == -EAGAIN)
2482 goto again;
2483 ext4_journal_stop(handle); 2483 ext4_journal_stop(handle);
2484 2484
2485 return err; 2485 return err;
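
The retry plumbing above replaces an in-place cache invalidation with an explicit restart: when ext4_truncate_restart_trans() succeeds, the caller turns that into -EAGAIN, drops the path, and jumps back to the again: label to re-read the tree depth under fresh state. A minimal userspace model of the same restart idiom (the toy_* names are illustrative):

#include <errno.h>
#include <stdio.h>

static int attempts_left = 2;

/* A helper that had to drop locks signals "state may be stale". */
static int toy_do_work(void)
{
	if (attempts_left-- > 0)
		return -EAGAIN;	/* caller must restart from scratch */
	return 0;
}

static int toy_remove_space(void)
{
	int err;

again:
	/* re-read depth, invalidate caches, rebuild the path here */
	err = toy_do_work();
	if (err == -EAGAIN)
		goto again;
	return err;
}

int main(void)
{
	printf("toy_remove_space() = %d\n", toy_remove_space());
	return 0;
}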
@@ -2544,7 +2544,7 @@ static void bi_complete(struct bio *bio, int error)
2544/* FIXME!! we need to try to merge to left or right after zero-out */ 2544/* FIXME!! we need to try to merge to left or right after zero-out */
2545static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2545static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2546{ 2546{
2547 int ret = -EIO; 2547 int ret;
2548 struct bio *bio; 2548 struct bio *bio;
2549 int blkbits, blocksize; 2549 int blkbits, blocksize;
2550 sector_t ee_pblock; 2550 sector_t ee_pblock;
@@ -2568,6 +2568,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2568 len = ee_len; 2568 len = ee_len;
2569 2569
2570 bio = bio_alloc(GFP_NOIO, len); 2570 bio = bio_alloc(GFP_NOIO, len);
2571 if (!bio)
2572 return -ENOMEM;
2573
2571 bio->bi_sector = ee_pblock; 2574 bio->bi_sector = ee_pblock;
2572 bio->bi_bdev = inode->i_sb->s_bdev; 2575 bio->bi_bdev = inode->i_sb->s_bdev;
2573 2576
@@ -2595,22 +2598,20 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2595 submit_bio(WRITE, bio); 2598 submit_bio(WRITE, bio);
2596 wait_for_completion(&event); 2599 wait_for_completion(&event);
2597 2600
2598 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 2601 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2599 ret = 0; 2602 bio_put(bio);
2600 else { 2603 return -EIO;
2601 ret = -EIO;
2602 break;
2603 } 2604 }
2604 bio_put(bio); 2605 bio_put(bio);
2605 ee_len -= done; 2606 ee_len -= done;
2606 ee_pblock += done << (blkbits - 9); 2607 ee_pblock += done << (blkbits - 9);
2607 } 2608 }
2608 return ret; 2609 return 0;
2609} 2610}
2610 2611
2611#define EXT4_EXT_ZERO_LEN 7 2612#define EXT4_EXT_ZERO_LEN 7
2612/* 2613/*
2613 * This function is called by ext4_ext_get_blocks() if someone tries to write 2614 * This function is called by ext4_ext_map_blocks() if someone tries to write
2614 * to an uninitialized extent. It may result in splitting the uninitialized 2615 * to an uninitialized extent. It may result in splitting the uninitialized
 2615 * extent into multiple extents (up to three - one initialized and two 2616 * extent into multiple extents (up to three - one initialized and two
2616 * uninitialized). 2617 * uninitialized).
@@ -2620,39 +2621,55 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 2620 * c> Splits in three extents: Someone is writing in the middle of the extent 2621 * c> Splits in three extents: Someone is writing in the middle of the extent
2621 */ 2622 */
2622static int ext4_ext_convert_to_initialized(handle_t *handle, 2623static int ext4_ext_convert_to_initialized(handle_t *handle,
2623 struct inode *inode, 2624 struct inode *inode,
2624 struct ext4_ext_path *path, 2625 struct ext4_map_blocks *map,
2625 ext4_lblk_t iblock, 2626 struct ext4_ext_path *path)
2626 unsigned int max_blocks)
2627{ 2627{
2628 struct ext4_extent *ex, newex, orig_ex; 2628 struct ext4_extent *ex, newex, orig_ex;
2629 struct ext4_extent *ex1 = NULL; 2629 struct ext4_extent *ex1 = NULL;
2630 struct ext4_extent *ex2 = NULL; 2630 struct ext4_extent *ex2 = NULL;
2631 struct ext4_extent *ex3 = NULL; 2631 struct ext4_extent *ex3 = NULL;
2632 struct ext4_extent_header *eh; 2632 struct ext4_extent_header *eh;
2633 ext4_lblk_t ee_block; 2633 ext4_lblk_t ee_block, eof_block;
2634 unsigned int allocated, ee_len, depth; 2634 unsigned int allocated, ee_len, depth;
2635 ext4_fsblk_t newblock; 2635 ext4_fsblk_t newblock;
2636 int err = 0; 2636 int err = 0;
2637 int ret = 0; 2637 int ret = 0;
2638 int may_zeroout;
2639
2640 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
2641 "block %llu, max_blocks %u\n", inode->i_ino,
2642 (unsigned long long)map->m_lblk, map->m_len);
2643
2644 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2645 inode->i_sb->s_blocksize_bits;
2646 if (eof_block < map->m_lblk + map->m_len)
2647 eof_block = map->m_lblk + map->m_len;
2638 2648
2639 depth = ext_depth(inode); 2649 depth = ext_depth(inode);
2640 eh = path[depth].p_hdr; 2650 eh = path[depth].p_hdr;
2641 ex = path[depth].p_ext; 2651 ex = path[depth].p_ext;
2642 ee_block = le32_to_cpu(ex->ee_block); 2652 ee_block = le32_to_cpu(ex->ee_block);
2643 ee_len = ext4_ext_get_actual_len(ex); 2653 ee_len = ext4_ext_get_actual_len(ex);
2644 allocated = ee_len - (iblock - ee_block); 2654 allocated = ee_len - (map->m_lblk - ee_block);
2645 newblock = iblock - ee_block + ext_pblock(ex); 2655 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2656
2646 ex2 = ex; 2657 ex2 = ex;
2647 orig_ex.ee_block = ex->ee_block; 2658 orig_ex.ee_block = ex->ee_block;
2648 orig_ex.ee_len = cpu_to_le16(ee_len); 2659 orig_ex.ee_len = cpu_to_le16(ee_len);
2649 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2660 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2650 2661
2662 /*
 2663	 * It is safe to convert an extent to initialized via explicit
 2664	 * zeroout only if the extent is fully inside i_size or new_size.
2665 */
2666 may_zeroout = ee_block + ee_len <= eof_block;
2667
2651 err = ext4_ext_get_access(handle, inode, path + depth); 2668 err = ext4_ext_get_access(handle, inode, path + depth);
2652 if (err) 2669 if (err)
2653 goto out; 2670 goto out;
 2654	 /* If extent has less than 2*EXT4_EXT_ZERO_LEN, zero out directly */ 2671	 /* If extent has less than 2*EXT4_EXT_ZERO_LEN, zero out directly */
2655 if (ee_len <= 2*EXT4_EXT_ZERO_LEN) { 2672 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
2656 err = ext4_ext_zeroout(inode, &orig_ex); 2673 err = ext4_ext_zeroout(inode, &orig_ex);
2657 if (err) 2674 if (err)
2658 goto fix_extent_len; 2675 goto fix_extent_len;
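
may_zeroout above gates every zeroout fallback: converting an uninitialized extent by zero-filling is only safe when the extent lies entirely below EOF (with eof_block extended to cover the write in flight), otherwise blocks past i_size would silently become initialized. The predicate in isolation, as a checkable userspace sketch (the block size and toy types are assumptions):

#include <stdbool.h>
#include <stdio.h>

#define BLOCKSIZE_BITS 12 /* 4 KiB blocks */

static bool may_zeroout(unsigned long long i_size,
			unsigned int m_lblk, unsigned int m_len,
			unsigned int ee_block, unsigned int ee_len)
{
	/* eof_block = i_size rounded up to a block... */
	unsigned long long eof_block =
		(i_size + (1ULL << BLOCKSIZE_BITS) - 1) >> BLOCKSIZE_BITS;

	/* ...extended to cover the block range being written */
	if (eof_block < (unsigned long long)m_lblk + m_len)
		eof_block = (unsigned long long)m_lblk + m_len;

	return (unsigned long long)ee_block + ee_len <= eof_block;
}

int main(void)
{
	/* extent [100, 110) with i_size at block 105: not fully inside */
	printf("%d\n", may_zeroout(105ULL << 12, 100, 2, 100, 10));  /* 0 */
	/* same extent once the write in flight extends to block 110 */
	printf("%d\n", may_zeroout(105ULL << 12, 100, 10, 100, 10)); /* 1 */
	return 0;
}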
@@ -2665,10 +2682,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2665 return allocated; 2682 return allocated;
2666 } 2683 }
2667 2684
2668 /* ex1: ee_block to iblock - 1 : uninitialized */ 2685 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2669 if (iblock > ee_block) { 2686 if (map->m_lblk > ee_block) {
2670 ex1 = ex; 2687 ex1 = ex;
2671 ex1->ee_len = cpu_to_le16(iblock - ee_block); 2688 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2672 ext4_ext_mark_uninitialized(ex1); 2689 ext4_ext_mark_uninitialized(ex1);
2673 ex2 = &newex; 2690 ex2 = &newex;
2674 } 2691 }
@@ -2677,15 +2694,15 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2677 * we insert ex3, if ex1 is NULL. This is to avoid temporary 2694 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2678 * overlap of blocks. 2695 * overlap of blocks.
2679 */ 2696 */
2680 if (!ex1 && allocated > max_blocks) 2697 if (!ex1 && allocated > map->m_len)
2681 ex2->ee_len = cpu_to_le16(max_blocks); 2698 ex2->ee_len = cpu_to_le16(map->m_len);
 2682	 /* ex3: to ee_block + ee_len : uninitialized */ 2699	 /* ex3: to ee_block + ee_len : uninitialized */
2683 if (allocated > max_blocks) { 2700 if (allocated > map->m_len) {
2684 unsigned int newdepth; 2701 unsigned int newdepth;
 2685	 /* If extent has less than EXT4_EXT_ZERO_LEN, zero out directly */ 2702	 /* If extent has less than EXT4_EXT_ZERO_LEN, zero out directly */
2686 if (allocated <= EXT4_EXT_ZERO_LEN) { 2703 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
2687 /* 2704 /*
 2688	 * iblock == ee_block is handled by the zeroout 2705	 * map->m_lblk == ee_block is handled by the zeroout
2689 * at the beginning. 2706 * at the beginning.
2690 * Mark first half uninitialized. 2707 * Mark first half uninitialized.
2691 * Mark second half initialized and zero out the 2708 * Mark second half initialized and zero out the
@@ -2698,7 +2715,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2698 ext4_ext_dirty(handle, inode, path + depth); 2715 ext4_ext_dirty(handle, inode, path + depth);
2699 2716
2700 ex3 = &newex; 2717 ex3 = &newex;
2701 ex3->ee_block = cpu_to_le32(iblock); 2718 ex3->ee_block = cpu_to_le32(map->m_lblk);
2702 ext4_ext_store_pblock(ex3, newblock); 2719 ext4_ext_store_pblock(ex3, newblock);
2703 ex3->ee_len = cpu_to_le16(allocated); 2720 ex3->ee_len = cpu_to_le16(allocated);
2704 err = ext4_ext_insert_extent(handle, inode, path, 2721 err = ext4_ext_insert_extent(handle, inode, path,
@@ -2711,7 +2728,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2711 ex->ee_len = orig_ex.ee_len; 2728 ex->ee_len = orig_ex.ee_len;
2712 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2729 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2713 ext4_ext_dirty(handle, inode, path + depth); 2730 ext4_ext_dirty(handle, inode, path + depth);
2714 /* blocks available from iblock */ 2731 /* blocks available from map->m_lblk */
2715 return allocated; 2732 return allocated;
2716 2733
2717 } else if (err) 2734 } else if (err)
@@ -2733,8 +2750,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2733 */ 2750 */
2734 depth = ext_depth(inode); 2751 depth = ext_depth(inode);
2735 ext4_ext_drop_refs(path); 2752 ext4_ext_drop_refs(path);
2736 path = ext4_ext_find_extent(inode, 2753 path = ext4_ext_find_extent(inode, map->m_lblk,
2737 iblock, path); 2754 path);
2738 if (IS_ERR(path)) { 2755 if (IS_ERR(path)) {
2739 err = PTR_ERR(path); 2756 err = PTR_ERR(path);
2740 return err; 2757 return err;
@@ -2754,12 +2771,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2754 return allocated; 2771 return allocated;
2755 } 2772 }
2756 ex3 = &newex; 2773 ex3 = &newex;
2757 ex3->ee_block = cpu_to_le32(iblock + max_blocks); 2774 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2758 ext4_ext_store_pblock(ex3, newblock + max_blocks); 2775 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2759 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 2776 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2760 ext4_ext_mark_uninitialized(ex3); 2777 ext4_ext_mark_uninitialized(ex3);
2761 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); 2778 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2762 if (err == -ENOSPC) { 2779 if (err == -ENOSPC && may_zeroout) {
2763 err = ext4_ext_zeroout(inode, &orig_ex); 2780 err = ext4_ext_zeroout(inode, &orig_ex);
2764 if (err) 2781 if (err)
2765 goto fix_extent_len; 2782 goto fix_extent_len;
@@ -2769,7 +2786,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2769 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2786 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2770 ext4_ext_dirty(handle, inode, path + depth); 2787 ext4_ext_dirty(handle, inode, path + depth);
2771 /* zeroed the full extent */ 2788 /* zeroed the full extent */
2772 /* blocks available from iblock */ 2789 /* blocks available from map->m_lblk */
2773 return allocated; 2790 return allocated;
2774 2791
2775 } else if (err) 2792 } else if (err)
@@ -2783,11 +2800,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2783 * update the extent length after successful insert of the 2800 * update the extent length after successful insert of the
2784 * split extent 2801 * split extent
2785 */ 2802 */
2786 orig_ex.ee_len = cpu_to_le16(ee_len - 2803 ee_len -= ext4_ext_get_actual_len(ex3);
2787 ext4_ext_get_actual_len(ex3)); 2804 orig_ex.ee_len = cpu_to_le16(ee_len);
2805 may_zeroout = ee_block + ee_len <= eof_block;
2806
2788 depth = newdepth; 2807 depth = newdepth;
2789 ext4_ext_drop_refs(path); 2808 ext4_ext_drop_refs(path);
2790 path = ext4_ext_find_extent(inode, iblock, path); 2809 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2791 if (IS_ERR(path)) { 2810 if (IS_ERR(path)) {
2792 err = PTR_ERR(path); 2811 err = PTR_ERR(path);
2793 goto out; 2812 goto out;
@@ -2801,14 +2820,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2801 if (err) 2820 if (err)
2802 goto out; 2821 goto out;
2803 2822
2804 allocated = max_blocks; 2823 allocated = map->m_len;
2805 2824
2806 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying 2825 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
 2807	 * to insert an extent in the middle, zero out directly 2826	 * to insert an extent in the middle, zero out directly
2808 * otherwise give the extent a chance to merge to left 2827 * otherwise give the extent a chance to merge to left
2809 */ 2828 */
2810 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && 2829 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2811 iblock != ee_block) { 2830 map->m_lblk != ee_block && may_zeroout) {
2812 err = ext4_ext_zeroout(inode, &orig_ex); 2831 err = ext4_ext_zeroout(inode, &orig_ex);
2813 if (err) 2832 if (err)
2814 goto fix_extent_len; 2833 goto fix_extent_len;
@@ -2818,7 +2837,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2818 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2837 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2819 ext4_ext_dirty(handle, inode, path + depth); 2838 ext4_ext_dirty(handle, inode, path + depth);
2820 /* zero out the first half */ 2839 /* zero out the first half */
2821 /* blocks available from iblock */ 2840 /* blocks available from map->m_lblk */
2822 return allocated; 2841 return allocated;
2823 } 2842 }
2824 } 2843 }
@@ -2829,12 +2848,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2829 */ 2848 */
2830 if (ex1 && ex1 != ex) { 2849 if (ex1 && ex1 != ex) {
2831 ex1 = ex; 2850 ex1 = ex;
2832 ex1->ee_len = cpu_to_le16(iblock - ee_block); 2851 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2833 ext4_ext_mark_uninitialized(ex1); 2852 ext4_ext_mark_uninitialized(ex1);
2834 ex2 = &newex; 2853 ex2 = &newex;
2835 } 2854 }
2836 /* ex2: iblock to iblock + maxblocks-1 : initialised */ 2855 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
2837 ex2->ee_block = cpu_to_le32(iblock); 2856 ex2->ee_block = cpu_to_le32(map->m_lblk);
2838 ext4_ext_store_pblock(ex2, newblock); 2857 ext4_ext_store_pblock(ex2, newblock);
2839 ex2->ee_len = cpu_to_le16(allocated); 2858 ex2->ee_len = cpu_to_le16(allocated);
2840 if (ex2 != ex) 2859 if (ex2 != ex)
@@ -2877,7 +2896,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2877 goto out; 2896 goto out;
2878insert: 2897insert:
2879 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); 2898 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
2880 if (err == -ENOSPC) { 2899 if (err == -ENOSPC && may_zeroout) {
2881 err = ext4_ext_zeroout(inode, &orig_ex); 2900 err = ext4_ext_zeroout(inode, &orig_ex);
2882 if (err) 2901 if (err)
2883 goto fix_extent_len; 2902 goto fix_extent_len;
@@ -2904,7 +2923,7 @@ fix_extent_len:
2904} 2923}
2905 2924
2906/* 2925/*
2907 * This function is called by ext4_ext_get_blocks() from 2926 * This function is called by ext4_ext_map_blocks() from
2908 * ext4_get_blocks_dio_write() when a DIO write targets 2927 * ext4_get_blocks_dio_write() when a DIO write targets
2909 * an uninitialized extent. 2928 * an uninitialized extent.
2910 * 2929 *
@@ -2927,9 +2946,8 @@ fix_extent_len:
2927 */ 2946 */
2928static int ext4_split_unwritten_extents(handle_t *handle, 2947static int ext4_split_unwritten_extents(handle_t *handle,
2929 struct inode *inode, 2948 struct inode *inode,
2949 struct ext4_map_blocks *map,
2930 struct ext4_ext_path *path, 2950 struct ext4_ext_path *path,
2931 ext4_lblk_t iblock,
2932 unsigned int max_blocks,
2933 int flags) 2951 int flags)
2934{ 2952{
2935 struct ext4_extent *ex, newex, orig_ex; 2953 struct ext4_extent *ex, newex, orig_ex;
@@ -2937,41 +2955,55 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2937 struct ext4_extent *ex2 = NULL; 2955 struct ext4_extent *ex2 = NULL;
2938 struct ext4_extent *ex3 = NULL; 2956 struct ext4_extent *ex3 = NULL;
2939 struct ext4_extent_header *eh; 2957 struct ext4_extent_header *eh;
2940 ext4_lblk_t ee_block; 2958 ext4_lblk_t ee_block, eof_block;
2941 unsigned int allocated, ee_len, depth; 2959 unsigned int allocated, ee_len, depth;
2942 ext4_fsblk_t newblock; 2960 ext4_fsblk_t newblock;
2943 int err = 0; 2961 int err = 0;
2962 int may_zeroout;
2963
2964 ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
2965 "block %llu, max_blocks %u\n", inode->i_ino,
2966 (unsigned long long)map->m_lblk, map->m_len);
2967
2968 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2969 inode->i_sb->s_blocksize_bits;
2970 if (eof_block < map->m_lblk + map->m_len)
2971 eof_block = map->m_lblk + map->m_len;
2944 2972
2945 ext_debug("ext4_split_unwritten_extents: inode %lu,"
2946 "iblock %llu, max_blocks %u\n", inode->i_ino,
2947 (unsigned long long)iblock, max_blocks);
2948 depth = ext_depth(inode); 2973 depth = ext_depth(inode);
2949 eh = path[depth].p_hdr; 2974 eh = path[depth].p_hdr;
2950 ex = path[depth].p_ext; 2975 ex = path[depth].p_ext;
2951 ee_block = le32_to_cpu(ex->ee_block); 2976 ee_block = le32_to_cpu(ex->ee_block);
2952 ee_len = ext4_ext_get_actual_len(ex); 2977 ee_len = ext4_ext_get_actual_len(ex);
2953 allocated = ee_len - (iblock - ee_block); 2978 allocated = ee_len - (map->m_lblk - ee_block);
2954 newblock = iblock - ee_block + ext_pblock(ex); 2979 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2980
2955 ex2 = ex; 2981 ex2 = ex;
2956 orig_ex.ee_block = ex->ee_block; 2982 orig_ex.ee_block = ex->ee_block;
2957 orig_ex.ee_len = cpu_to_le16(ee_len); 2983 orig_ex.ee_len = cpu_to_le16(ee_len);
2958 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2984 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2959 2985
2960 /* 2986 /*
2987 * It is safe to convert an extent to initialized via explicit
2988 * zeroout only if the extent lies fully inside i_size or new_size.
2989 */
2990 may_zeroout = ee_block + ee_len <= eof_block;
2991
2992 /*
2961 * If the uninitialized extent begins at the same logical 2993 * If the uninitialized extent begins at the same logical
2962 * block where the write begins, and the write completely 2994 * block where the write begins, and the write completely
2963 * covers the extent, then we don't need to split it. 2995 * covers the extent, then we don't need to split it.
2964 */ 2996 */
2965 if ((iblock == ee_block) && (allocated <= max_blocks)) 2997 if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
2966 return allocated; 2998 return allocated;
2967 2999
2968 err = ext4_ext_get_access(handle, inode, path + depth); 3000 err = ext4_ext_get_access(handle, inode, path + depth);
2969 if (err) 3001 if (err)
2970 goto out; 3002 goto out;
2971 /* ex1: ee_block to iblock - 1 : uninitialized */ 3003 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2972 if (iblock > ee_block) { 3004 if (map->m_lblk > ee_block) {
2973 ex1 = ex; 3005 ex1 = ex;
2974 ex1->ee_len = cpu_to_le16(iblock - ee_block); 3006 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2975 ext4_ext_mark_uninitialized(ex1); 3007 ext4_ext_mark_uninitialized(ex1);
2976 ex2 = &newex; 3008 ex2 = &newex;
2977 } 3009 }
@@ -2980,18 +3012,18 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2980 * we insert ex3, if ex1 is NULL. This is to avoid temporary 3012 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2981 * overlap of blocks. 3013 * overlap of blocks.
2982 */ 3014 */
2983 if (!ex1 && allocated > max_blocks) 3015 if (!ex1 && allocated > map->m_len)
2984 ex2->ee_len = cpu_to_le16(max_blocks); 3016 ex2->ee_len = cpu_to_le16(map->m_len);
2985 /* ex3: to ee_block + ee_len : uninitialised */ 3017 /* ex3: to ee_block + ee_len : uninitialised */
2986 if (allocated > max_blocks) { 3018 if (allocated > map->m_len) {
2987 unsigned int newdepth; 3019 unsigned int newdepth;
2988 ex3 = &newex; 3020 ex3 = &newex;
2989 ex3->ee_block = cpu_to_le32(iblock + max_blocks); 3021 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2990 ext4_ext_store_pblock(ex3, newblock + max_blocks); 3022 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2991 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 3023 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2992 ext4_ext_mark_uninitialized(ex3); 3024 ext4_ext_mark_uninitialized(ex3);
2993 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); 3025 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2994 if (err == -ENOSPC) { 3026 if (err == -ENOSPC && may_zeroout) {
2995 err = ext4_ext_zeroout(inode, &orig_ex); 3027 err = ext4_ext_zeroout(inode, &orig_ex);
2996 if (err) 3028 if (err)
2997 goto fix_extent_len; 3029 goto fix_extent_len;
@@ -3001,7 +3033,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3001 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3033 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3002 ext4_ext_dirty(handle, inode, path + depth); 3034 ext4_ext_dirty(handle, inode, path + depth);
3003 /* zeroed the full extent */ 3035 /* zeroed the full extent */
3004 /* blocks available from iblock */ 3036 /* blocks available from map->m_lblk */
3005 return allocated; 3037 return allocated;
3006 3038
3007 } else if (err) 3039 } else if (err)
@@ -3015,11 +3047,13 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3015 * update the extent length after successful insert of the 3047 * update the extent length after successful insert of the
3016 * split extent 3048 * split extent
3017 */ 3049 */
3018 orig_ex.ee_len = cpu_to_le16(ee_len - 3050 ee_len -= ext4_ext_get_actual_len(ex3);
3019 ext4_ext_get_actual_len(ex3)); 3051 orig_ex.ee_len = cpu_to_le16(ee_len);
3052 may_zeroout = ee_block + ee_len <= eof_block;
3053
3020 depth = newdepth; 3054 depth = newdepth;
3021 ext4_ext_drop_refs(path); 3055 ext4_ext_drop_refs(path);
3022 path = ext4_ext_find_extent(inode, iblock, path); 3056 path = ext4_ext_find_extent(inode, map->m_lblk, path);
3023 if (IS_ERR(path)) { 3057 if (IS_ERR(path)) {
3024 err = PTR_ERR(path); 3058 err = PTR_ERR(path);
3025 goto out; 3059 goto out;
@@ -3033,7 +3067,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3033 if (err) 3067 if (err)
3034 goto out; 3068 goto out;
3035 3069
3036 allocated = max_blocks; 3070 allocated = map->m_len;
3037 } 3071 }
3038 /* 3072 /*
3039 * If there was a change of depth as part of the 3073 * If there was a change of depth as part of the
@@ -3042,15 +3076,15 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3042 */ 3076 */
3043 if (ex1 && ex1 != ex) { 3077 if (ex1 && ex1 != ex) {
3044 ex1 = ex; 3078 ex1 = ex;
3045 ex1->ee_len = cpu_to_le16(iblock - ee_block); 3079 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
3046 ext4_ext_mark_uninitialized(ex1); 3080 ext4_ext_mark_uninitialized(ex1);
3047 ex2 = &newex; 3081 ex2 = &newex;
3048 } 3082 }
3049 /* 3083 /*
3050 * ex2: iblock to iblock + maxblocks-1 : to be direct IO written, 3084 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
3051 * uninitialised still. 3085 * using direct I/O, uninitialised still.
3052 */ 3086 */
3053 ex2->ee_block = cpu_to_le32(iblock); 3087 ex2->ee_block = cpu_to_le32(map->m_lblk);
3054 ext4_ext_store_pblock(ex2, newblock); 3088 ext4_ext_store_pblock(ex2, newblock);
3055 ex2->ee_len = cpu_to_le16(allocated); 3089 ex2->ee_len = cpu_to_le16(allocated);
3056 ext4_ext_mark_uninitialized(ex2); 3090 ext4_ext_mark_uninitialized(ex2);
@@ -3062,7 +3096,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3062 goto out; 3096 goto out;
3063insert: 3097insert:
3064 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3098 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3065 if (err == -ENOSPC) { 3099 if (err == -ENOSPC && may_zeroout) {
3066 err = ext4_ext_zeroout(inode, &orig_ex); 3100 err = ext4_ext_zeroout(inode, &orig_ex);
3067 if (err) 3101 if (err)
3068 goto fix_extent_len; 3102 goto fix_extent_len;
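Both split paths now gate zeroout on the same test. As a reading aid, here is a minimal standalone sketch of that guard in plain C, using ordinary integer types as stand-ins for ext4_lblk_t and the inode fields (an assumption for illustration only):

#include <stdint.h>

/* Model of the may_zeroout test introduced above: an uninitialized
 * extent may be converted to initialized by zeroing it out only if it
 * lies entirely below EOF (rounded up to a block boundary), where the
 * region being written also counts as inside EOF. */
static int may_zeroout_extent(uint64_t i_size, unsigned int blkbits,
			      uint32_t m_lblk, uint32_t m_len,
			      uint32_t ee_block, uint16_t ee_len)
{
	uint64_t eof_block = (i_size + (1ULL << blkbits) - 1) >> blkbits;

	if (eof_block < (uint64_t)m_lblk + m_len)
		eof_block = (uint64_t)m_lblk + m_len;

	return (uint64_t)ee_block + ee_len <= eof_block;
}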
@@ -3152,10 +3186,9 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3152 3186
3153static int 3187static int
3154ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3188ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3155 ext4_lblk_t iblock, unsigned int max_blocks, 3189 struct ext4_map_blocks *map,
3156 struct ext4_ext_path *path, int flags, 3190 struct ext4_ext_path *path, int flags,
3157 unsigned int allocated, struct buffer_head *bh_result, 3191 unsigned int allocated, ext4_fsblk_t newblock)
3158 ext4_fsblk_t newblock)
3159{ 3192{
3160 int ret = 0; 3193 int ret = 0;
3161 int err = 0; 3194 int err = 0;
@@ -3163,15 +3196,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3163 3196
3164 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" 3197 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
3165 "block %llu, max_blocks %u, flags %d, allocated %u", 3198 "block %llu, max_blocks %u, flags %d, allocated %u",
3166 inode->i_ino, (unsigned long long)iblock, max_blocks, 3199 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3167 flags, allocated); 3200 flags, allocated);
3168 ext4_ext_show_leaf(inode, path); 3201 ext4_ext_show_leaf(inode, path);
3169 3202
3170 /* get_block() before submit the IO, split the extent */ 3203 /* get_block() before submit the IO, split the extent */
3171 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3204 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3172 ret = ext4_split_unwritten_extents(handle, 3205 ret = ext4_split_unwritten_extents(handle, inode, map,
3173 inode, path, iblock, 3206 path, flags);
3174 max_blocks, flags);
3175 /* 3207 /*
3176 * Flag the inode (non-AIO case) or the end_io struct (AIO case) 3208 * Flag the inode (non-AIO case) or the end_io struct (AIO case)
3177 * that this IO needs conversion to written when the IO is 3209 * that this IO needs conversion to written when the IO is
@@ -3182,7 +3214,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3182 else 3214 else
3183 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3215 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3184 if (ext4_should_dioread_nolock(inode)) 3216 if (ext4_should_dioread_nolock(inode))
3185 set_buffer_uninit(bh_result); 3217 map->m_flags |= EXT4_MAP_UNINIT;
3186 goto out; 3218 goto out;
3187 } 3219 }
3188 /* IO end_io complete, convert the filled extent to written */ 3220 /* IO end_io complete, convert the filled extent to written */
@@ -3210,14 +3242,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3210 * the buffer head will be unmapped so that 3242 * the buffer head will be unmapped so that
3211 * a read from the block returns 0s. 3243 * a read from the block returns 0s.
3212 */ 3244 */
3213 set_buffer_unwritten(bh_result); 3245 map->m_flags |= EXT4_MAP_UNWRITTEN;
3214 goto out1; 3246 goto out1;
3215 } 3247 }
3216 3248
3217 /* buffered write, writepage time, convert*/ 3249 /* buffered write, writepage time, convert*/
3218 ret = ext4_ext_convert_to_initialized(handle, inode, 3250 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3219 path, iblock,
3220 max_blocks);
3221 if (ret >= 0) 3251 if (ret >= 0)
3222 ext4_update_inode_fsync_trans(handle, inode, 1); 3252 ext4_update_inode_fsync_trans(handle, inode, 1);
3223out: 3253out:
@@ -3226,7 +3256,7 @@ out:
3226 goto out2; 3256 goto out2;
3227 } else 3257 } else
3228 allocated = ret; 3258 allocated = ret;
3229 set_buffer_new(bh_result); 3259 map->m_flags |= EXT4_MAP_NEW;
3230 /* 3260 /*
3231 * if we allocated more blocks than requested 3261 * if we allocated more blocks than requested
3232 * we need to make sure we unmap the extra block 3262 * we need to make sure we unmap the extra block
@@ -3234,11 +3264,11 @@ out:
3234 * unmapped later when we find the buffer_head marked 3264 * unmapped later when we find the buffer_head marked
3235 * new. 3265 * new.
3236 */ 3266 */
3237 if (allocated > max_blocks) { 3267 if (allocated > map->m_len) {
3238 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, 3268 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3239 newblock + max_blocks, 3269 newblock + map->m_len,
3240 allocated - max_blocks); 3270 allocated - map->m_len);
3241 allocated = max_blocks; 3271 allocated = map->m_len;
3242 } 3272 }
3243 3273
3244 /* 3274 /*
@@ -3252,13 +3282,13 @@ out:
3252 ext4_da_update_reserve_space(inode, allocated, 0); 3282 ext4_da_update_reserve_space(inode, allocated, 0);
3253 3283
3254map_out: 3284map_out:
3255 set_buffer_mapped(bh_result); 3285 map->m_flags |= EXT4_MAP_MAPPED;
3256out1: 3286out1:
3257 if (allocated > max_blocks) 3287 if (allocated > map->m_len)
3258 allocated = max_blocks; 3288 allocated = map->m_len;
3259 ext4_ext_show_leaf(inode, path); 3289 ext4_ext_show_leaf(inode, path);
3260 bh_result->b_bdev = inode->i_sb->s_bdev; 3290 map->m_pblk = newblock;
3261 bh_result->b_blocknr = newblock; 3291 map->m_len = allocated;
3262out2: 3292out2:
3263 if (path) { 3293 if (path) {
3264 ext4_ext_drop_refs(path); 3294 ext4_ext_drop_refs(path);
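The hunks above reduce ext4_ext_handle_uninitialized_extents() to four strategies keyed off the get_blocks flags. A hand-written model of that dispatch, with placeholder flag values standing in for the EXT4_GET_BLOCKS_* constants (the real values live in ext4.h):

/* Placeholder flag bits, not the real EXT4_GET_BLOCKS_* values. */
#define GB_PRE_IO  0x1	/* stands in for EXT4_GET_BLOCKS_PRE_IO */
#define GB_CONVERT 0x2	/* stands in for EXT4_GET_BLOCKS_CONVERT */
#define GB_CREATE  0x4	/* stands in for EXT4_GET_BLOCKS_CREATE */

enum uninit_action {
	SPLIT_BEFORE_IO,	/* DIO get_block(): split, flag DIO_UNWRITTEN */
	CONVERT_AFTER_IO,	/* DIO end_io: convert the filled extent to written */
	REPORT_UNWRITTEN,	/* read without create: report unwritten, map nothing */
	CONVERT_BUFFERED	/* buffered writepage: convert to initialized now */
};

static enum uninit_action classify(int flags)
{
	if (flags & GB_PRE_IO)
		return SPLIT_BEFORE_IO;
	if (flags & GB_CONVERT)
		return CONVERT_AFTER_IO;
	if (!(flags & GB_CREATE))
		return REPORT_UNWRITTEN;
	return CONVERT_BUFFERED;
}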
@@ -3284,26 +3314,23 @@ out2:
3284 * 3314 *
3285 * return < 0, error case. 3315 * return < 0, error case.
3286 */ 3316 */
3287int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 3317int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3288 ext4_lblk_t iblock, 3318 struct ext4_map_blocks *map, int flags)
3289 unsigned int max_blocks, struct buffer_head *bh_result,
3290 int flags)
3291{ 3319{
3292 struct ext4_ext_path *path = NULL; 3320 struct ext4_ext_path *path = NULL;
3293 struct ext4_extent_header *eh; 3321 struct ext4_extent_header *eh;
3294 struct ext4_extent newex, *ex, *last_ex; 3322 struct ext4_extent newex, *ex, *last_ex;
3295 ext4_fsblk_t newblock; 3323 ext4_fsblk_t newblock;
3296 int err = 0, depth, ret, cache_type; 3324 int i, err = 0, depth, ret, cache_type;
3297 unsigned int allocated = 0; 3325 unsigned int allocated = 0;
3298 struct ext4_allocation_request ar; 3326 struct ext4_allocation_request ar;
3299 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3327 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3300 3328
3301 __clear_bit(BH_New, &bh_result->b_state);
3302 ext_debug("blocks %u/%u requested for inode %lu\n", 3329 ext_debug("blocks %u/%u requested for inode %lu\n",
3303 iblock, max_blocks, inode->i_ino); 3330 map->m_lblk, map->m_len, inode->i_ino);
3304 3331
3305 /* check in cache */ 3332 /* check in cache */
3306 cache_type = ext4_ext_in_cache(inode, iblock, &newex); 3333 cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
3307 if (cache_type) { 3334 if (cache_type) {
3308 if (cache_type == EXT4_EXT_CACHE_GAP) { 3335 if (cache_type == EXT4_EXT_CACHE_GAP) {
3309 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3336 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -3316,12 +3343,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3316 /* we should allocate requested block */ 3343 /* we should allocate requested block */
3317 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { 3344 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
3318 /* block is already allocated */ 3345 /* block is already allocated */
3319 newblock = iblock 3346 newblock = map->m_lblk
3320 - le32_to_cpu(newex.ee_block) 3347 - le32_to_cpu(newex.ee_block)
3321 + ext_pblock(&newex); 3348 + ext_pblock(&newex);
3322 /* number of remaining blocks in the extent */ 3349 /* number of remaining blocks in the extent */
3323 allocated = ext4_ext_get_actual_len(&newex) - 3350 allocated = ext4_ext_get_actual_len(&newex) -
3324 (iblock - le32_to_cpu(newex.ee_block)); 3351 (map->m_lblk - le32_to_cpu(newex.ee_block));
3325 goto out; 3352 goto out;
3326 } else { 3353 } else {
3327 BUG(); 3354 BUG();
@@ -3329,7 +3356,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3329 } 3356 }
3330 3357
3331 /* find extent for this block */ 3358 /* find extent for this block */
3332 path = ext4_ext_find_extent(inode, iblock, NULL); 3359 path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
3333 if (IS_ERR(path)) { 3360 if (IS_ERR(path)) {
3334 err = PTR_ERR(path); 3361 err = PTR_ERR(path);
3335 path = NULL; 3362 path = NULL;
@@ -3345,8 +3372,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3345 */ 3372 */
3346 if (unlikely(path[depth].p_ext == NULL && depth != 0)) { 3373 if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3347 EXT4_ERROR_INODE(inode, "bad extent address " 3374 EXT4_ERROR_INODE(inode, "bad extent address "
3348 "iblock: %d, depth: %d pblock %lld", 3375 "lblock: %lu, depth: %d pblock %lld",
3349 iblock, depth, path[depth].p_block); 3376 (unsigned long) map->m_lblk, depth,
3377 path[depth].p_block);
3350 err = -EIO; 3378 err = -EIO;
3351 goto out2; 3379 goto out2;
3352 } 3380 }
@@ -3364,12 +3392,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3364 */ 3392 */
3365 ee_len = ext4_ext_get_actual_len(ex); 3393 ee_len = ext4_ext_get_actual_len(ex);
3366 /* if found extent covers block, simply return it */ 3394 /* if found extent covers block, simply return it */
3367 if (in_range(iblock, ee_block, ee_len)) { 3395 if (in_range(map->m_lblk, ee_block, ee_len)) {
3368 newblock = iblock - ee_block + ee_start; 3396 newblock = map->m_lblk - ee_block + ee_start;
3369 /* number of remaining blocks in the extent */ 3397 /* number of remaining blocks in the extent */
3370 allocated = ee_len - (iblock - ee_block); 3398 allocated = ee_len - (map->m_lblk - ee_block);
3371 ext_debug("%u fit into %u:%d -> %llu\n", iblock, 3399 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3372 ee_block, ee_len, newblock); 3400 ee_block, ee_len, newblock);
3373 3401
3374 /* Do not put uninitialized extent in the cache */ 3402 /* Do not put uninitialized extent in the cache */
3375 if (!ext4_ext_is_uninitialized(ex)) { 3403 if (!ext4_ext_is_uninitialized(ex)) {
@@ -3379,8 +3407,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3379 goto out; 3407 goto out;
3380 } 3408 }
3381 ret = ext4_ext_handle_uninitialized_extents(handle, 3409 ret = ext4_ext_handle_uninitialized_extents(handle,
3382 inode, iblock, max_blocks, path, 3410 inode, map, path, flags, allocated,
3383 flags, allocated, bh_result, newblock); 3411 newblock);
3384 return ret; 3412 return ret;
3385 } 3413 }
3386 } 3414 }
@@ -3394,7 +3422,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3394 * put just found gap into cache to speed up 3422 * put just found gap into cache to speed up
3395 * subsequent requests 3423 * subsequent requests
3396 */ 3424 */
3397 ext4_ext_put_gap_in_cache(inode, path, iblock); 3425 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3398 goto out2; 3426 goto out2;
3399 } 3427 }
3400 /* 3428 /*
@@ -3402,11 +3430,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3402 */ 3430 */
3403 3431
3404 /* find neighbour allocated blocks */ 3432 /* find neighbour allocated blocks */
3405 ar.lleft = iblock; 3433 ar.lleft = map->m_lblk;
3406 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); 3434 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3407 if (err) 3435 if (err)
3408 goto out2; 3436 goto out2;
3409 ar.lright = iblock; 3437 ar.lright = map->m_lblk;
3410 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); 3438 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
3411 if (err) 3439 if (err)
3412 goto out2; 3440 goto out2;
@@ -3417,26 +3445,26 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3417 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is 3445 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3418 * EXT_UNINIT_MAX_LEN. 3446 * EXT_UNINIT_MAX_LEN.
3419 */ 3447 */
3420 if (max_blocks > EXT_INIT_MAX_LEN && 3448 if (map->m_len > EXT_INIT_MAX_LEN &&
3421 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3449 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3422 max_blocks = EXT_INIT_MAX_LEN; 3450 map->m_len = EXT_INIT_MAX_LEN;
3423 else if (max_blocks > EXT_UNINIT_MAX_LEN && 3451 else if (map->m_len > EXT_UNINIT_MAX_LEN &&
3424 (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3452 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3425 max_blocks = EXT_UNINIT_MAX_LEN; 3453 map->m_len = EXT_UNINIT_MAX_LEN;
3426 3454
3427 /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ 3455 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3428 newex.ee_block = cpu_to_le32(iblock); 3456 newex.ee_block = cpu_to_le32(map->m_lblk);
3429 newex.ee_len = cpu_to_le16(max_blocks); 3457 newex.ee_len = cpu_to_le16(map->m_len);
3430 err = ext4_ext_check_overlap(inode, &newex, path); 3458 err = ext4_ext_check_overlap(inode, &newex, path);
3431 if (err) 3459 if (err)
3432 allocated = ext4_ext_get_actual_len(&newex); 3460 allocated = ext4_ext_get_actual_len(&newex);
3433 else 3461 else
3434 allocated = max_blocks; 3462 allocated = map->m_len;
3435 3463
3436 /* allocate new block */ 3464 /* allocate new block */
3437 ar.inode = inode; 3465 ar.inode = inode;
3438 ar.goal = ext4_ext_find_goal(inode, path, iblock); 3466 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3439 ar.logical = iblock; 3467 ar.logical = map->m_lblk;
3440 ar.len = allocated; 3468 ar.len = allocated;
3441 if (S_ISREG(inode->i_mode)) 3469 if (S_ISREG(inode->i_mode))
3442 ar.flags = EXT4_MB_HINT_DATA; 3470 ar.flags = EXT4_MB_HINT_DATA;
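Before calling the multiblock allocator, the function assembles an allocation request from the neighbour search above. A sketch of the fields being filled in, as a simplified local stand-in for struct ext4_allocation_request (this definition is illustrative, not the kernel's):

#include <stdint.h>

struct alloc_request {
	uint32_t logical;	/* ar.logical: first logical block (map->m_lblk) */
	uint32_t len;		/* ar.len: blocks wanted, clamped to EXT_*_MAX_LEN */
	uint64_t goal;		/* ar.goal: preferred physical block */
	uint32_t lleft, lright;	/* ar.lleft/ar.lright: neighbouring logical blocks */
	uint64_t pleft, pright;	/* ar.pleft/ar.pright: their physical blocks */
	unsigned int flags;	/* ar.flags: e.g. EXT4_MB_HINT_DATA for regular files */
};

The clamp to EXT_INIT_MAX_LEN / EXT_UNINIT_MAX_LEN seen above exists because an extent's on-disk length is a 16-bit field whose top bit marks the extent uninitialized, so a single extent cannot describe more blocks than those limits.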
@@ -3470,21 +3498,33 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3470 EXT4_STATE_DIO_UNWRITTEN); 3498 EXT4_STATE_DIO_UNWRITTEN);
3471 } 3499 }
3472 if (ext4_should_dioread_nolock(inode)) 3500 if (ext4_should_dioread_nolock(inode))
3473 set_buffer_uninit(bh_result); 3501 map->m_flags |= EXT4_MAP_UNINIT;
3474 } 3502 }
3475 3503
3476 if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) { 3504 if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
3477 if (unlikely(!eh->eh_entries)) { 3505 if (unlikely(!eh->eh_entries)) {
3478 EXT4_ERROR_INODE(inode, 3506 EXT4_ERROR_INODE(inode,
3479 "eh->eh_entries == 0 ee_block %d", 3507 "eh->eh_entries == 0 and "
3480 ex->ee_block); 3508 "EOFBLOCKS_FL set");
3481 err = -EIO; 3509 err = -EIO;
3482 goto out2; 3510 goto out2;
3483 } 3511 }
3484 last_ex = EXT_LAST_EXTENT(eh); 3512 last_ex = EXT_LAST_EXTENT(eh);
3485 if (iblock + ar.len > le32_to_cpu(last_ex->ee_block) 3513 /*
3486 + ext4_ext_get_actual_len(last_ex)) 3514 * If the current leaf block was reached by looking at
3487 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; 3515 * the last index block all the way down the tree, and
3516 * we are extending the inode beyond the last extent
3517 * in the current leaf block, then clear the
3518 * EOFBLOCKS_FL flag.
3519 */
3520 for (i = depth-1; i >= 0; i--) {
3521 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3522 break;
3523 }
3524 if ((i < 0) &&
3525 (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
3526 ext4_ext_get_actual_len(last_ex)))
3527 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3488 } 3528 }
3489 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3529 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3490 if (err) { 3530 if (err) {
@@ -3500,9 +3540,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3500 /* previous routine could use block we allocated */ 3540 /* previous routine could use block we allocated */
3501 newblock = ext_pblock(&newex); 3541 newblock = ext_pblock(&newex);
3502 allocated = ext4_ext_get_actual_len(&newex); 3542 allocated = ext4_ext_get_actual_len(&newex);
3503 if (allocated > max_blocks) 3543 if (allocated > map->m_len)
3504 allocated = max_blocks; 3544 allocated = map->m_len;
3505 set_buffer_new(bh_result); 3545 map->m_flags |= EXT4_MAP_NEW;
3506 3546
3507 /* 3547 /*
3508 * Update reserved blocks/metadata blocks after successful 3548 * Update reserved blocks/metadata blocks after successful
@@ -3516,18 +3556,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3516 * when it is _not_ an uninitialized extent. 3556 * when it is _not_ an uninitialized extent.
3517 */ 3557 */
3518 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3558 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3519 ext4_ext_put_in_cache(inode, iblock, allocated, newblock, 3559 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
3520 EXT4_EXT_CACHE_EXTENT); 3560 EXT4_EXT_CACHE_EXTENT);
3521 ext4_update_inode_fsync_trans(handle, inode, 1); 3561 ext4_update_inode_fsync_trans(handle, inode, 1);
3522 } else 3562 } else
3523 ext4_update_inode_fsync_trans(handle, inode, 0); 3563 ext4_update_inode_fsync_trans(handle, inode, 0);
3524out: 3564out:
3525 if (allocated > max_blocks) 3565 if (allocated > map->m_len)
3526 allocated = max_blocks; 3566 allocated = map->m_len;
3527 ext4_ext_show_leaf(inode, path); 3567 ext4_ext_show_leaf(inode, path);
3528 set_buffer_mapped(bh_result); 3568 map->m_flags |= EXT4_MAP_MAPPED;
3529 bh_result->b_bdev = inode->i_sb->s_bdev; 3569 map->m_pblk = newblock;
3530 bh_result->b_blocknr = newblock; 3570 map->m_len = allocated;
3531out2: 3571out2:
3532 if (path) { 3572 if (path) {
3533 ext4_ext_drop_refs(path); 3573 ext4_ext_drop_refs(path);
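From here on, results flow back through the mapping descriptor instead of a buffer_head. A sketch of that descriptor as this patch uses it (an illustrative definition; the real struct ext4_map_blocks and EXT4_MAP_* flags are declared in ext4.h):

#include <stdint.h>

struct map_blocks {
	uint64_t m_pblk;	/* out: first physical block */
	uint32_t m_lblk;	/* in:  first logical block requested */
	uint32_t m_len;		/* in:  blocks wanted; out: blocks mapped */
	uint32_t m_flags;	/* out: EXT4_MAP_{NEW,MAPPED,UNWRITTEN,UNINIT,BOUNDARY} */
};

#define MAP_MAPPED 0x1		/* placeholder for EXT4_MAP_MAPPED */

/* The shape of both 'out' paths above once a mapping exists. */
static void publish_mapping(struct map_blocks *map,
			    uint64_t newblock, uint32_t allocated)
{
	if (allocated > map->m_len)	/* never report more than asked for */
		allocated = map->m_len;
	map->m_flags |= MAP_MAPPED;
	map->m_pblk = newblock;
	map->m_len = allocated;
}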
@@ -3625,7 +3665,7 @@ static void ext4_falloc_update_inode(struct inode *inode,
3625 * can proceed even if the new size is the same as i_size. 3665 * can proceed even if the new size is the same as i_size.
3626 */ 3666 */
3627 if (new_size > i_size_read(inode)) 3667 if (new_size > i_size_read(inode))
3628 EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL; 3668 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3629 } 3669 }
3630 3670
3631} 3671}
@@ -3640,55 +3680,57 @@ static void ext4_falloc_update_inode(struct inode *inode,
3640long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) 3680long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3641{ 3681{
3642 handle_t *handle; 3682 handle_t *handle;
3643 ext4_lblk_t block;
3644 loff_t new_size; 3683 loff_t new_size;
3645 unsigned int max_blocks; 3684 unsigned int max_blocks;
3646 int ret = 0; 3685 int ret = 0;
3647 int ret2 = 0; 3686 int ret2 = 0;
3648 int retries = 0; 3687 int retries = 0;
3649 struct buffer_head map_bh; 3688 struct ext4_map_blocks map;
3650 unsigned int credits, blkbits = inode->i_blkbits; 3689 unsigned int credits, blkbits = inode->i_blkbits;
3651 3690
3652 /* 3691 /*
3653 * currently supporting (pre)allocate mode for extent-based 3692 * currently supporting (pre)allocate mode for extent-based
3654 * files _only_ 3693 * files _only_
3655 */ 3694 */
3656 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 3695 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3657 return -EOPNOTSUPP; 3696 return -EOPNOTSUPP;
3658 3697
3659 /* preallocation to directories is currently not supported */ 3698 /* preallocation to directories is currently not supported */
3660 if (S_ISDIR(inode->i_mode)) 3699 if (S_ISDIR(inode->i_mode))
3661 return -ENODEV; 3700 return -ENODEV;
3662 3701
3663 block = offset >> blkbits; 3702 map.m_lblk = offset >> blkbits;
3664 /* 3703 /*
3665 * We can't just convert len to max_blocks because 3704 * We can't just convert len to max_blocks because
3666 * If blocksize = 4096 offset = 3072 and len = 2048 3705 * If blocksize = 4096 offset = 3072 and len = 2048
3667 */ 3706 */
3668 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3707 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3669 - block; 3708 - map.m_lblk;
3670 /* 3709 /*
3671 * credits to insert 1 extent into extent tree 3710 * credits to insert 1 extent into extent tree
3672 */ 3711 */
3673 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3712 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3674 mutex_lock(&inode->i_mutex); 3713 mutex_lock(&inode->i_mutex);
3714 ret = inode_newsize_ok(inode, (len + offset));
3715 if (ret) {
3716 mutex_unlock(&inode->i_mutex);
3717 return ret;
3718 }
3675retry: 3719retry:
3676 while (ret >= 0 && ret < max_blocks) { 3720 while (ret >= 0 && ret < max_blocks) {
3677 block = block + ret; 3721 map.m_lblk = map.m_lblk + ret;
3678 max_blocks = max_blocks - ret; 3722 map.m_len = max_blocks = max_blocks - ret;
3679 handle = ext4_journal_start(inode, credits); 3723 handle = ext4_journal_start(inode, credits);
3680 if (IS_ERR(handle)) { 3724 if (IS_ERR(handle)) {
3681 ret = PTR_ERR(handle); 3725 ret = PTR_ERR(handle);
3682 break; 3726 break;
3683 } 3727 }
3684 map_bh.b_state = 0; 3728 ret = ext4_map_blocks(handle, inode, &map,
3685 ret = ext4_get_blocks(handle, inode, block,
3686 max_blocks, &map_bh,
3687 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3729 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
3688 if (ret <= 0) { 3730 if (ret <= 0) {
3689#ifdef EXT4FS_DEBUG 3731#ifdef EXT4FS_DEBUG
3690 WARN_ON(ret <= 0); 3732 WARN_ON(ret <= 0);
3691 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3733 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3692 "returned error inode#%lu, block=%u, " 3734 "returned error inode#%lu, block=%u, "
3693 "max_blocks=%u", __func__, 3735 "max_blocks=%u", __func__,
3694 inode->i_ino, block, max_blocks); 3736 inode->i_ino, map.m_lblk, max_blocks);
@@ -3697,14 +3739,14 @@ retry:
3697 ret2 = ext4_journal_stop(handle); 3739 ret2 = ext4_journal_stop(handle);
3698 break; 3740 break;
3699 } 3741 }
3700 if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len, 3742 if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
3701 blkbits) >> blkbits)) 3743 blkbits) >> blkbits))
3702 new_size = offset + len; 3744 new_size = offset + len;
3703 else 3745 else
3704 new_size = (block + ret) << blkbits; 3746 new_size = (map.m_lblk + ret) << blkbits;
3705 3747
3706 ext4_falloc_update_inode(inode, mode, new_size, 3748 ext4_falloc_update_inode(inode, mode, new_size,
3707 buffer_new(&map_bh)); 3749 (map.m_flags & EXT4_MAP_NEW));
3708 ext4_mark_inode_dirty(handle, inode); 3750 ext4_mark_inode_dirty(handle, inode);
3709 ret2 = ext4_journal_stop(handle); 3751 ret2 = ext4_journal_stop(handle);
3710 if (ret2) 3752 if (ret2)
@@ -3733,42 +3775,39 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3733 ssize_t len) 3775 ssize_t len)
3734{ 3776{
3735 handle_t *handle; 3777 handle_t *handle;
3736 ext4_lblk_t block;
3737 unsigned int max_blocks; 3778 unsigned int max_blocks;
3738 int ret = 0; 3779 int ret = 0;
3739 int ret2 = 0; 3780 int ret2 = 0;
3740 struct buffer_head map_bh; 3781 struct ext4_map_blocks map;
3741 unsigned int credits, blkbits = inode->i_blkbits; 3782 unsigned int credits, blkbits = inode->i_blkbits;
3742 3783
3743 block = offset >> blkbits; 3784 map.m_lblk = offset >> blkbits;
3744 /* 3785 /*
3745 * We can't just convert len to max_blocks because 3786 * We can't just convert len to max_blocks because
3746 * If blocksize = 4096 offset = 3072 and len = 2048 3787 * If blocksize = 4096 offset = 3072 and len = 2048
3747 */ 3788 */
3748 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3789 max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
3749 - block; 3790 map.m_lblk);
3750 /* 3791 /*
3751 * credits to insert 1 extent into extent tree 3792 * credits to insert 1 extent into extent tree
3752 */ 3793 */
3753 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3794 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3754 while (ret >= 0 && ret < max_blocks) { 3795 while (ret >= 0 && ret < max_blocks) {
3755 block = block + ret; 3796 map.m_lblk += ret;
3756 max_blocks = max_blocks - ret; 3797 map.m_len = (max_blocks -= ret);
3757 handle = ext4_journal_start(inode, credits); 3798 handle = ext4_journal_start(inode, credits);
3758 if (IS_ERR(handle)) { 3799 if (IS_ERR(handle)) {
3759 ret = PTR_ERR(handle); 3800 ret = PTR_ERR(handle);
3760 break; 3801 break;
3761 } 3802 }
3762 map_bh.b_state = 0; 3803 ret = ext4_map_blocks(handle, inode, &map,
3763 ret = ext4_get_blocks(handle, inode, block,
3764 max_blocks, &map_bh,
3765 EXT4_GET_BLOCKS_IO_CONVERT_EXT); 3804 EXT4_GET_BLOCKS_IO_CONVERT_EXT);
3766 if (ret <= 0) { 3805 if (ret <= 0) {
3767 WARN_ON(ret <= 0); 3806 WARN_ON(ret <= 0);
3768 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3807 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3769 "returned error inode#%lu, block=%u, " 3808 "returned error inode#%lu, block=%u, "
3770 "max_blocks=%u", __func__, 3809 "max_blocks=%u", __func__,
3771 inode->i_ino, block, max_blocks); 3810 inode->i_ino, map.m_lblk, map.m_len);
3772 } 3811 }
3773 ext4_mark_inode_dirty(handle, inode); 3812 ext4_mark_inode_dirty(handle, inode);
3774 ret2 = ext4_journal_stop(handle); 3813 ret2 = ext4_journal_stop(handle);
@@ -3898,7 +3937,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3898 int error = 0; 3937 int error = 0;
3899 3938
3900 /* fallback to generic here if not in extents fmt */ 3939 /* fallback to generic here if not in extents fmt */
3901 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 3940 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3902 return generic_block_fiemap(inode, fieinfo, start, len, 3941 return generic_block_fiemap(inode, fieinfo, start, len,
3903 ext4_get_block); 3942 ext4_get_block);
3904 3943
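The fallocate and convert-unwritten hunks above share one calling convention: the caller owns a struct ext4_map_blocks, advances m_lblk by however many blocks the previous call handled, and shrinks the remaining count. A user-space sketch of that loop, with a stub in place of ext4_map_blocks() (everything here is a model, not kernel code):

#include <stdint.h>

struct map_blocks { uint64_t m_pblk; uint32_t m_lblk, m_len, m_flags; };

/* Stub: pretend every request is satisfied in full. The real call
 * returns blocks mapped (> 0), 0 for a hole, or a negative errno. */
static int map_blocks_stub(struct map_blocks *map) { return (int)map->m_len; }

static int allocate_range(uint64_t offset, uint64_t len, unsigned int blkbits)
{
	struct map_blocks map;
	uint32_t max_blocks;
	int ret = 0;

	map.m_lblk = offset >> blkbits;
	/* Round the end up to a block boundary: with 4096-byte blocks,
	 * offset = 3072 and len = 2048 touch bytes 3072..5119, i.e.
	 * blocks 0 and 1, so max_blocks must be 2 even though
	 * len >> blkbits == 0 -- the case the comment above warns about. */
	max_blocks = ((offset + len + (1ULL << blkbits) - 1) >> blkbits)
			- map.m_lblk;

	while (ret >= 0 && (uint32_t)ret < max_blocks) {
		map.m_lblk += ret;		/* skip what is done already */
		map.m_len = max_blocks -= ret;	/* request what remains */
		ret = map_blocks_stub(&map);
		if (ret <= 0)
			break;			/* error or unexpected hole */
	}
	return ret < 0 ? ret : 0;
}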
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d0776e410f34..5313ae4cda2d 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -66,7 +66,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
66 * is smaller than s_maxbytes, which is for extent-mapped files. 66 * is smaller than s_maxbytes, which is for extent-mapped files.
67 */ 67 */
68 68
69 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 69 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
71 size_t length = iov_length(iov, nr_segs); 71 size_t length = iov_length(iov, nr_segs);
72 72
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index ef3d980e67cb..592adf2e546e 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -35,6 +35,29 @@
35#include <trace/events/ext4.h> 35#include <trace/events/ext4.h>
36 36
37/* 37/*
38 * If we're not journaling and this is a just-created file, we have to
39 * sync our parent directory (if it was freshly created) since
40 * otherwise it will only be written by writeback, leaving a huge
41 * window during which a crash may lose the file. This may apply for
42 * the parent directory's parent as well, and so on recursively, if
43 * they are also freshly created.
44 */
45static void ext4_sync_parent(struct inode *inode)
46{
47 struct dentry *dentry = NULL;
48
49 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
50 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
51 dentry = list_entry(inode->i_dentry.next,
52 struct dentry, d_alias);
53 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
54 break;
55 inode = dentry->d_parent->d_inode;
56 sync_mapping_buffers(inode->i_mapping);
57 }
58}
59
60/*
38 * akpm: A new design for ext4_sync_file(). 61 * akpm: A new design for ext4_sync_file().
39 * 62 *
40 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). 63 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
@@ -48,9 +71,9 @@
48 * i_mutex lock is held when entering and exiting this function 71 * i_mutex lock is held when entering and exiting this function
49 */ 72 */
50 73
51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 74int ext4_sync_file(struct file *file, int datasync)
52{ 75{
53 struct inode *inode = dentry->d_inode; 76 struct inode *inode = file->f_mapping->host;
54 struct ext4_inode_info *ei = EXT4_I(inode); 77 struct ext4_inode_info *ei = EXT4_I(inode);
55 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 78 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
56 int ret; 79 int ret;
@@ -58,7 +81,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
58 81
59 J_ASSERT(ext4_journal_current_handle() == NULL); 82 J_ASSERT(ext4_journal_current_handle() == NULL);
60 83
61 trace_ext4_sync_file(file, dentry, datasync); 84 trace_ext4_sync_file(file, datasync);
62 85
63 if (inode->i_sb->s_flags & MS_RDONLY) 86 if (inode->i_sb->s_flags & MS_RDONLY)
64 return 0; 87 return 0;
@@ -66,9 +89,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
66 ret = flush_completed_IO(inode); 89 ret = flush_completed_IO(inode);
67 if (ret < 0) 90 if (ret < 0)
68 return ret; 91 return ret;
69 92
70 if (!journal) 93 if (!journal) {
71 return simple_fsync(file, dentry, datasync); 94 ret = generic_file_fsync(file, datasync);
95 if (!ret && !list_empty(&inode->i_dentry))
96 ext4_sync_parent(inode);
97 return ret;
98 }
72 99
73 /* 100 /*
74 * data=writeback,ordered: 101 * data=writeback,ordered:
@@ -102,7 +129,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
102 (journal->j_flags & JBD2_BARRIER)) 129 (journal->j_flags & JBD2_BARRIER))
103 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, 130 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
104 NULL, BLKDEV_IFL_WAIT); 131 NULL, BLKDEV_IFL_WAIT);
105 jbd2_log_wait_commit(journal, commit_tid); 132 ret = jbd2_log_wait_commit(journal, commit_tid);
106 } else if (journal->j_flags & JBD2_BARRIER) 133 } else if (journal->j_flags & JBD2_BARRIER)
107 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 134 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
108 BLKDEV_IFL_WAIT); 135 BLKDEV_IFL_WAIT);
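The fsync changes split cleanly into a no-journal path and a journal path. A condensed model of the resulting control flow, with trivial stubs standing in for the kernel primitives named above (a reading aid under those assumptions, not the real ext4_sync_file()):

/* No-op stubs for flush_completed_IO(), generic_file_fsync(),
 * ext4_sync_parent() and jbd2_log_wait_commit(). */
static int  flush_completed_io(void)      { return 0; }
static int  generic_fsync(void)           { return 0; }
static void sync_parent_dirs(void)        { }
static int  journal_commit_and_wait(void) { return 0; }

static int fsync_model(int have_journal)
{
	int ret = flush_completed_io();
	if (ret < 0)
		return ret;

	if (!have_journal) {
		/* No journal: generic fsync, then sync freshly created
		 * parent directories so a crash cannot lose the new
		 * file's directory entry (ext4_sync_parent above). */
		ret = generic_fsync();
		if (!ret)
			sync_parent_dirs();
		return ret;
	}
	/* Journaled: wait for the commit covering this inode. Note the
	 * hunk above now propagates jbd2_log_wait_commit()'s return
	 * value instead of discarding it. */
	return journal_commit_and_wait();
}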
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1a0e183a2f04..25c4b3173fd9 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -240,56 +240,49 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
240 if (fatal) 240 if (fatal)
241 goto error_return; 241 goto error_return;
242 242
243 /* Ok, now we can actually update the inode bitmaps.. */ 243 fatal = -ESRCH;
244 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), 244 gdp = ext4_get_group_desc(sb, block_group, &bh2);
245 bit, bitmap_bh->b_data); 245 if (gdp) {
246 if (!cleared)
247 ext4_error(sb, "bit already cleared for inode %lu", ino);
248 else {
249 gdp = ext4_get_group_desc(sb, block_group, &bh2);
250
251 BUFFER_TRACE(bh2, "get_write_access"); 246 BUFFER_TRACE(bh2, "get_write_access");
252 fatal = ext4_journal_get_write_access(handle, bh2); 247 fatal = ext4_journal_get_write_access(handle, bh2);
253 if (fatal) goto error_return; 248 }
254 249 ext4_lock_group(sb, block_group);
255 if (gdp) { 250 cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
256 ext4_lock_group(sb, block_group); 251 if (fatal || !cleared) {
257 count = ext4_free_inodes_count(sb, gdp) + 1; 252 ext4_unlock_group(sb, block_group);
258 ext4_free_inodes_set(sb, gdp, count); 253 goto out;
259 if (is_directory) { 254 }
260 count = ext4_used_dirs_count(sb, gdp) - 1;
261 ext4_used_dirs_set(sb, gdp, count);
262 if (sbi->s_log_groups_per_flex) {
263 ext4_group_t f;
264
265 f = ext4_flex_group(sbi, block_group);
266 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
267 }
268 255
269 } 256 count = ext4_free_inodes_count(sb, gdp) + 1;
270 gdp->bg_checksum = ext4_group_desc_csum(sbi, 257 ext4_free_inodes_set(sb, gdp, count);
271 block_group, gdp); 258 if (is_directory) {
272 ext4_unlock_group(sb, block_group); 259 count = ext4_used_dirs_count(sb, gdp) - 1;
273 percpu_counter_inc(&sbi->s_freeinodes_counter); 260 ext4_used_dirs_set(sb, gdp, count);
274 if (is_directory) 261 percpu_counter_dec(&sbi->s_dirs_counter);
275 percpu_counter_dec(&sbi->s_dirs_counter);
276
277 if (sbi->s_log_groups_per_flex) {
278 ext4_group_t f;
279
280 f = ext4_flex_group(sbi, block_group);
281 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
282 }
283 }
284 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
285 err = ext4_handle_dirty_metadata(handle, NULL, bh2);
286 if (!fatal) fatal = err;
287 } 262 }
288 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); 263 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
289 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 264 ext4_unlock_group(sb, block_group);
290 if (!fatal) 265
291 fatal = err; 266 percpu_counter_inc(&sbi->s_freeinodes_counter);
292 sb->s_dirt = 1; 267 if (sbi->s_log_groups_per_flex) {
268 ext4_group_t f = ext4_flex_group(sbi, block_group);
269
270 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
271 if (is_directory)
272 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
273 }
274 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
275 fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
276out:
277 if (cleared) {
278 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
279 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
280 if (!fatal)
281 fatal = err;
282 sb->s_dirt = 1;
283 } else
284 ext4_error(sb, "bit already cleared for inode %lu", ino);
285
293error_return: 286error_return:
294 brelse(bitmap_bh); 287 brelse(bitmap_bh);
295 ext4_std_error(sb, fatal); 288 ext4_std_error(sb, fatal);
@@ -499,7 +492,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
499 492
500 if (S_ISDIR(mode) && 493 if (S_ISDIR(mode) &&
501 ((parent == sb->s_root->d_inode) || 494 ((parent == sb->s_root->d_inode) ||
502 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) { 495 (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
503 int best_ndir = inodes_per_group; 496 int best_ndir = inodes_per_group;
504 int ret = -1; 497 int ret = -1;
505 498
@@ -1041,7 +1034,7 @@ got:
1041 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 1034 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
1042 /* set extent flag only for directory, file and normal symlink*/ 1035 /* set extent flag only for directory, file and normal symlink*/
1043 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { 1036 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
1044 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; 1037 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
1045 ext4_ext_tree_init(handle, inode); 1038 ext4_ext_tree_init(handle, inode);
1046 } 1039 }
1047 } 1040 }
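The ext4_free_inode() rewrite is mostly about ordering: journal write access is obtained before the group lock, the bitmap bit is cleared with plain ext4_clear_bit() under ext4_lock_group() (the atomic variant is no longer needed), and both failure cases leave through a single unlock. A skeleton of that ordering with no-op stubs for the ext4 helpers (a sketch, not the kernel code):

static int  journal_write_access(void) { return 0; }  /* 0 = ok */
static int  clear_bitmap_bit(void)     { return 1; }  /* 1 = bit was set */
static void lock_group(void)           { }
static void unlock_group(void)         { }

static int free_inode_model(void)
{
	int fatal, cleared;

	fatal = journal_write_access();	/* before taking the group lock */
	lock_group();
	cleared = clear_bitmap_bit();	/* non-atomic clear, under the lock */
	if (fatal || !cleared) {
		unlock_group();		/* one exit for both failures */
		goto out;
	}
	/* ...update free-inode / used-dirs counts while still locked... */
	unlock_group();
	/* ...then the percpu and flex-group counters, unlocked... */
out:
	return !cleared ? -1 /* double free */ : fatal;
}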
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3e0f6af9d08d..42272d67955a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -149,7 +149,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
149 int ret; 149 int ret;
150 150
151 /* 151 /*
152 * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this 152 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
153 * moment, get_block can be called only for blocks inside i_size since 153 * moment, get_block can be called only for blocks inside i_size since
154 * page cache has been already dropped and writes are blocked by 154 * page cache has been already dropped and writes are blocked by
155 * i_mutex. So we can safely drop the i_data_sem here. 155 * i_mutex. So we can safely drop the i_data_sem here.
@@ -348,9 +348,8 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
348 if (blk && 348 if (blk &&
349 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 349 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
350 blk, 1))) { 350 blk, 1))) {
351 __ext4_error(inode->i_sb, function, 351 ext4_error_inode(function, inode,
352 "invalid block reference %u " 352 "invalid block reference %u", blk);
353 "in inode #%lu", blk, inode->i_ino);
354 return -EIO; 353 return -EIO;
355 } 354 }
356 } 355 }
@@ -785,7 +784,7 @@ failed:
785 /* Allocation failed, free what we already allocated */ 784 /* Allocation failed, free what we already allocated */
786 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); 785 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
787 for (i = 1; i <= n ; i++) { 786 for (i = 1; i <= n ; i++) {
788 /* 787 /*
789 * branch[i].bh is newly allocated, so there is no 788 * branch[i].bh is newly allocated, so there is no
790 * need to revoke the block, which is why we don't 789 * need to revoke the block, which is why we don't
791 * need to set EXT4_FREE_BLOCKS_METADATA. 790 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -875,7 +874,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
875 874
876err_out: 875err_out:
877 for (i = 1; i <= num; i++) { 876 for (i = 1; i <= num; i++) {
878 /* 877 /*
879 * branch[i].bh is newly allocated, so there is no 878 * branch[i].bh is newly allocated, so there is no
880 * need to revoke the block, which is why we don't 879 * need to revoke the block, which is why we don't
881 * need to set EXT4_FREE_BLOCKS_METADATA. 880 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -890,9 +889,9 @@ err_out:
890} 889}
891 890
892/* 891/*
893 * The ext4_ind_get_blocks() function handles non-extents inodes 892 * The ext4_ind_map_blocks() function handles non-extents inodes
894 * (i.e., using the traditional indirect/double-indirect i_blocks 893 * (i.e., using the traditional indirect/double-indirect i_blocks
895 * scheme) for ext4_get_blocks(). 894 * scheme) for ext4_map_blocks().
896 * 895 *
897 * Allocation strategy is simple: if we have to allocate something, we will 896 * Allocation strategy is simple: if we have to allocate something, we will
898 * have to go the whole way to leaf. So let's do it before attaching anything 897 * have to go the whole way to leaf. So let's do it before attaching anything
@@ -917,9 +916,8 @@ err_out:
917 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system 916 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
918 * blocks. 917 * blocks.
919 */ 918 */
920static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, 919static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
921 ext4_lblk_t iblock, unsigned int maxblocks, 920 struct ext4_map_blocks *map,
922 struct buffer_head *bh_result,
923 int flags) 921 int flags)
924{ 922{
925 int err = -EIO; 923 int err = -EIO;
@@ -933,9 +931,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
933 int count = 0; 931 int count = 0;
934 ext4_fsblk_t first_block = 0; 932 ext4_fsblk_t first_block = 0;
935 933
936 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 934 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
937 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 935 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
938 depth = ext4_block_to_path(inode, iblock, offsets, 936 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
939 &blocks_to_boundary); 937 &blocks_to_boundary);
940 938
941 if (depth == 0) 939 if (depth == 0)
@@ -946,10 +944,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
946 /* Simplest case - block found, no allocation needed */ 944 /* Simplest case - block found, no allocation needed */
947 if (!partial) { 945 if (!partial) {
948 first_block = le32_to_cpu(chain[depth - 1].key); 946 first_block = le32_to_cpu(chain[depth - 1].key);
949 clear_buffer_new(bh_result);
950 count++; 947 count++;
951 /*map more blocks*/ 948 /*map more blocks*/
952 while (count < maxblocks && count <= blocks_to_boundary) { 949 while (count < map->m_len && count <= blocks_to_boundary) {
953 ext4_fsblk_t blk; 950 ext4_fsblk_t blk;
954 951
955 blk = le32_to_cpu(*(chain[depth-1].p + count)); 952 blk = le32_to_cpu(*(chain[depth-1].p + count));
@@ -969,7 +966,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
969 /* 966 /*
970 * Okay, we need to do block allocation. 967 * Okay, we need to do block allocation.
971 */ 968 */
972 goal = ext4_find_goal(inode, iblock, partial); 969 goal = ext4_find_goal(inode, map->m_lblk, partial);
973 970
974 /* the number of blocks need to allocate for [d,t]indirect blocks */ 971 /* the number of blocks need to allocate for [d,t]indirect blocks */
975 indirect_blks = (chain + depth) - partial - 1; 972 indirect_blks = (chain + depth) - partial - 1;
@@ -979,11 +976,11 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
979 * direct blocks to allocate for this branch. 976 * direct blocks to allocate for this branch.
980 */ 977 */
981 count = ext4_blks_to_allocate(partial, indirect_blks, 978 count = ext4_blks_to_allocate(partial, indirect_blks,
982 maxblocks, blocks_to_boundary); 979 map->m_len, blocks_to_boundary);
983 /* 980 /*
984 * Block out ext4_truncate while we alter the tree 981 * Block out ext4_truncate while we alter the tree
985 */ 982 */
986 err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, 983 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
987 &count, goal, 984 &count, goal,
988 offsets + (partial - chain), partial); 985 offsets + (partial - chain), partial);
989 986
@@ -995,18 +992,20 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
995 * may need to return -EAGAIN upwards in the worst case. --sct 992 * may need to return -EAGAIN upwards in the worst case. --sct
996 */ 993 */
997 if (!err) 994 if (!err)
998 err = ext4_splice_branch(handle, inode, iblock, 995 err = ext4_splice_branch(handle, inode, map->m_lblk,
999 partial, indirect_blks, count); 996 partial, indirect_blks, count);
1000 if (err) 997 if (err)
1001 goto cleanup; 998 goto cleanup;
1002 999
1003 set_buffer_new(bh_result); 1000 map->m_flags |= EXT4_MAP_NEW;
1004 1001
1005 ext4_update_inode_fsync_trans(handle, inode, 1); 1002 ext4_update_inode_fsync_trans(handle, inode, 1);
1006got_it: 1003got_it:
1007 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 1004 map->m_flags |= EXT4_MAP_MAPPED;
1005 map->m_pblk = le32_to_cpu(chain[depth-1].key);
1006 map->m_len = count;
1008 if (count > blocks_to_boundary) 1007 if (count > blocks_to_boundary)
1009 set_buffer_boundary(bh_result); 1008 map->m_flags |= EXT4_MAP_BOUNDARY;
1010 err = count; 1009 err = count;
1011 /* Clean up and exit */ 1010 /* Clean up and exit */
1012 partial = chain + depth - 1; /* the whole chain */ 1011 partial = chain + depth - 1; /* the whole chain */
@@ -1016,7 +1015,6 @@ cleanup:
1016 brelse(partial->bh); 1015 brelse(partial->bh);
1017 partial--; 1016 partial--;
1018 } 1017 }
1019 BUFFER_TRACE(bh_result, "returned");
1020out: 1018out:
1021 return err; 1019 return err;
1022} 1020}
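The indirect path completes the buffer_head-to-map conversion: each set_buffer_*() call from the old code has a corresponding bit in map->m_flags. A sketch of the correspondence and of the got_it path above (flag values are placeholders; the real EXT4_MAP_* constants are defined alongside struct ext4_map_blocks):

#include <stdint.h>

/* Placeholder bits mirroring the substitutions made in this patch. */
#define MAP_NEW       0x01	/* was set_buffer_new(bh)       */
#define MAP_MAPPED    0x02	/* was map_bh()/set_buffer_mapped(bh) */
#define MAP_UNWRITTEN 0x04	/* was set_buffer_unwritten(bh) */
#define MAP_BOUNDARY  0x08	/* was set_buffer_boundary(bh)  */
#define MAP_UNINIT    0x10	/* was set_buffer_uninit(bh)    */

struct map_blocks { uint64_t m_pblk; uint32_t m_lblk, m_len, m_flags; };

/* The got_it path above, restated. */
static void got_it_model(struct map_blocks *map, uint64_t pblk,
			 uint32_t count, uint32_t blocks_to_boundary)
{
	map->m_flags |= MAP_MAPPED;
	map->m_pblk = pblk;
	map->m_len = count;
	if (count > blocks_to_boundary)
		map->m_flags |= MAP_BOUNDARY;
}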
@@ -1061,7 +1059,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1061 */ 1059 */
1062static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) 1060static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
1063{ 1061{
1064 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 1062 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1065 return ext4_ext_calc_metadata_amount(inode, lblock); 1063 return ext4_ext_calc_metadata_amount(inode, lblock);
1066 1064
1067 return ext4_indirect_calc_metadata_amount(inode, lblock); 1065 return ext4_indirect_calc_metadata_amount(inode, lblock);
@@ -1076,7 +1074,6 @@ void ext4_da_update_reserve_space(struct inode *inode,
1076{ 1074{
1077 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1075 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1078 struct ext4_inode_info *ei = EXT4_I(inode); 1076 struct ext4_inode_info *ei = EXT4_I(inode);
1079 int mdb_free = 0, allocated_meta_blocks = 0;
1080 1077
1081 spin_lock(&ei->i_block_reservation_lock); 1078 spin_lock(&ei->i_block_reservation_lock);
1082 trace_ext4_da_update_reserve_space(inode, used); 1079 trace_ext4_da_update_reserve_space(inode, used);
@@ -1091,11 +1088,10 @@ void ext4_da_update_reserve_space(struct inode *inode,
1091 1088
1092 /* Update per-inode reservations */ 1089 /* Update per-inode reservations */
1093 ei->i_reserved_data_blocks -= used; 1090 ei->i_reserved_data_blocks -= used;
1094 used += ei->i_allocated_meta_blocks;
1095 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; 1091 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
1096 allocated_meta_blocks = ei->i_allocated_meta_blocks; 1092 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1093 used + ei->i_allocated_meta_blocks);
1097 ei->i_allocated_meta_blocks = 0; 1094 ei->i_allocated_meta_blocks = 0;
1098 percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
1099 1095
1100 if (ei->i_reserved_data_blocks == 0) { 1096 if (ei->i_reserved_data_blocks == 0) {
1101 /* 1097 /*
@@ -1103,30 +1099,23 @@ void ext4_da_update_reserve_space(struct inode *inode,
1103 * only when we have written all of the delayed 1099 * only when we have written all of the delayed
1104 * allocation blocks. 1100 * allocation blocks.
1105 */ 1101 */
1106 mdb_free = ei->i_reserved_meta_blocks; 1102 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1103 ei->i_reserved_meta_blocks);
1107 ei->i_reserved_meta_blocks = 0; 1104 ei->i_reserved_meta_blocks = 0;
1108 ei->i_da_metadata_calc_len = 0; 1105 ei->i_da_metadata_calc_len = 0;
1109 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
1110 } 1106 }
1111 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1107 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1112 1108
1113 /* Update quota subsystem */ 1109 /* Update quota subsystem for data blocks */
1114 if (quota_claim) { 1110 if (quota_claim)
1115 dquot_claim_block(inode, used); 1111 dquot_claim_block(inode, used);
1116 if (mdb_free) 1112 else {
1117 dquot_release_reservation_block(inode, mdb_free);
1118 } else {
1119 /* 1113 /*
1120 * We did fallocate with an offset that was already delayed 1114 * We did fallocate with an offset that was already delayed
1121 * allocated. So on delayed-allocation writeback we should 1115 * allocated. So on delayed-allocation writeback we should
1122 * not update the quota for allocated blocks. But then 1116 * not re-claim the quota for fallocated blocks.
1123 * converting an fallocate region to initialized region would
1124 * have caused a metadata allocation. So claim quota for
1125 * that
1126 */ 1117 */
1127 if (allocated_meta_blocks) 1118 dquot_release_reservation_block(inode, used);
1128 dquot_claim_block(inode, allocated_meta_blocks);
1129 dquot_release_reservation_block(inode, mdb_free + used);
1130 } 1119 }
1131 1120
1132 /* 1121 /*
@@ -1139,15 +1128,15 @@ void ext4_da_update_reserve_space(struct inode *inode,
1139 ext4_discard_preallocations(inode); 1128 ext4_discard_preallocations(inode);
1140} 1129}
1141 1130
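The rework above collapses the old mdb_free / allocated_meta_blocks bookkeeping into two cases keyed off quota_claim. The quota lifecycle of a delalloc data block now runs, schematically (a hedged sketch of the call sequence implied by this hunk, not literal kernel code):

        /* at buffered-write time, in ext4_da_reserve_space() */
        dquot_reserve_block(inode, 1);          /* reserve quota for one data block */

        /* at writeback, once the blocks are physically allocated */
        if (quota_claim)
                dquot_claim_block(inode, used); /* convert the reservation into usage */
        else
                dquot_release_reservation_block(inode, used); /* fallocate was already charged */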
1142static int check_block_validity(struct inode *inode, const char *msg, 1131static int check_block_validity(struct inode *inode, const char *func,
1143 sector_t logical, sector_t phys, int len) 1132 struct ext4_map_blocks *map)
1144{ 1133{
1145 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1134 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
1146 __ext4_error(inode->i_sb, msg, 1135 map->m_len)) {
1147 "inode #%lu logical block %llu mapped to %llu " 1136 ext4_error_inode(func, inode,
1148 "(size %d)", inode->i_ino, 1137 "lblock %lu mapped to illegal pblock %llu "
1149 (unsigned long long) logical, 1138 "(length %d)", (unsigned long) map->m_lblk,
1150 (unsigned long long) phys, len); 1139 map->m_pblk, map->m_len);
1151 return -EIO; 1140 return -EIO;
1152 } 1141 }
1153 return 0; 1142 return 0;
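check_block_validity() now receives the whole mapping descriptor instead of loose (logical, physical, length) arguments. For orientation, the ext4_map_blocks structure this series threads through inode.c carries roughly these fields (sketch; the authoritative definition is added to fs/ext4/ext4.h elsewhere in this merge):

        struct ext4_map_blocks {
                ext4_fsblk_t m_pblk;    /* first physical block of the mapping */
                ext4_lblk_t m_lblk;     /* first logical block requested */
                unsigned int m_len;     /* number of blocks requested/mapped */
                unsigned int m_flags;   /* EXT4_MAP_NEW, _MAPPED, _UNWRITTEN, ... */
        };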
@@ -1212,15 +1201,15 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1212} 1201}
1213 1202
1214/* 1203/*
1215 * The ext4_get_blocks() function tries to look up the requested blocks, 1204 * The ext4_map_blocks() function tries to look up the requested blocks,
1216 * and returns if the blocks are already mapped. 1205 * and returns if the blocks are already mapped.
1217 * 1206 *
1218 * Otherwise it takes the write lock of the i_data_sem and allocates blocks, 1207 * Otherwise it takes the write lock of the i_data_sem and allocates blocks,
1219 * stores the allocated blocks in the result buffer head and marks it 1208 * stores the allocated blocks in the result buffer head and marks it
1220 * mapped. 1209 * mapped.
1221 * 1210 *
1222 * If the file type is extents based, it will call ext4_ext_get_blocks(); 1211 * If the file type is extents based, it will call ext4_ext_map_blocks();
1223 * otherwise it calls ext4_ind_get_blocks() to handle indirect mapping 1212 * otherwise it calls ext4_ind_map_blocks() to handle indirect mapping
1224 * based files 1213 * based files
1225 * 1214 *
1226 * On success, it returns the number of blocks being mapped or allocated. 1215 * On success, it returns the number of blocks being mapped or allocated.
@@ -1233,35 +1222,29 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1233 * 1222 *
1234 * It returns the error in case of allocation failure. 1223 * It returns the error in case of allocation failure.
1235 */ 1224 */
1236int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, 1225int ext4_map_blocks(handle_t *handle, struct inode *inode,
1237 unsigned int max_blocks, struct buffer_head *bh, 1226 struct ext4_map_blocks *map, int flags)
1238 int flags)
1239{ 1227{
1240 int retval; 1228 int retval;
1241 1229
1242 clear_buffer_mapped(bh); 1230 map->m_flags = 0;
1243 clear_buffer_unwritten(bh); 1231 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
1244 1232 "logical block %lu\n", inode->i_ino, flags, map->m_len,
1245 ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u," 1233 (unsigned long) map->m_lblk);
1246 "logical block %lu\n", inode->i_ino, flags, max_blocks,
1247 (unsigned long)block);
1248 /* 1234 /*
1249 * Try to see if we can get the block without requesting a new 1235 * Try to see if we can get the block without requesting a new
1250 * file system block. 1236 * file system block.
1251 */ 1237 */
1252 down_read((&EXT4_I(inode)->i_data_sem)); 1238 down_read((&EXT4_I(inode)->i_data_sem));
1253 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1239 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1254 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1240 retval = ext4_ext_map_blocks(handle, inode, map, 0);
1255 bh, 0);
1256 } else { 1241 } else {
1257 retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, 1242 retval = ext4_ind_map_blocks(handle, inode, map, 0);
1258 bh, 0);
1259 } 1243 }
1260 up_read((&EXT4_I(inode)->i_data_sem)); 1244 up_read((&EXT4_I(inode)->i_data_sem));
1261 1245
1262 if (retval > 0 && buffer_mapped(bh)) { 1246 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1263 int ret = check_block_validity(inode, "file system corruption", 1247 int ret = check_block_validity(inode, __func__, map);
1264 block, bh->b_blocknr, retval);
1265 if (ret != 0) 1248 if (ret != 0)
1266 return ret; 1249 return ret;
1267 } 1250 }
@@ -1277,7 +1260,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1277 * In the create == 0 case, ext4_ext_get_block() returns 1260 * In the create == 0 case, ext4_ext_get_block() returns
1278 * with the buffer head unmapped. 1261 * with the buffer head unmapped.
1279 */ 1262 */
1280 if (retval > 0 && buffer_mapped(bh)) 1263 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
1281 return retval; 1264 return retval;
1282 1265
1283 /* 1266 /*
@@ -1290,7 +1273,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1290 * of BH_Unwritten and BH_Mapped flags being simultaneously 1273 * of BH_Unwritten and BH_Mapped flags being simultaneously
1291 * set on the buffer_head. 1274 * set on the buffer_head.
1292 */ 1275 */
1293 clear_buffer_unwritten(bh); 1276 map->m_flags &= ~EXT4_MAP_UNWRITTEN;
1294 1277
1295 /* 1278 /*
1296 * New block allocation and/or writing to an uninitialized extent 1279 * New block allocation and/or writing to an uninitialized extent
@@ -1312,14 +1295,12 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1312 * We need to check for EXT4 here because migrate 1295 * We need to check for EXT4 here because migrate
1313 * could have changed the inode type in between 1296 * could have changed the inode type in between
1314 */ 1297 */
1315 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1298 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1316 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1299 retval = ext4_ext_map_blocks(handle, inode, map, flags);
1317 bh, flags);
1318 } else { 1300 } else {
1319 retval = ext4_ind_get_blocks(handle, inode, block, 1301 retval = ext4_ind_map_blocks(handle, inode, map, flags);
1320 max_blocks, bh, flags);
1321 1302
1322 if (retval > 0 && buffer_new(bh)) { 1303 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
1323 /* 1304 /*
1324 * We allocated new blocks which will result in 1305 * We allocated new blocks which will result in
1325 * i_data's format changing. Force the migrate 1306 * i_data's format changing. Force the migrate
@@ -1342,10 +1323,10 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1342 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1323 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1343 1324
1344 up_write((&EXT4_I(inode)->i_data_sem)); 1325 up_write((&EXT4_I(inode)->i_data_sem));
1345 if (retval > 0 && buffer_mapped(bh)) { 1326 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1346 int ret = check_block_validity(inode, "file system " 1327 int ret = check_block_validity(inode,
1347 "corruption after allocation", 1328 "ext4_map_blocks_after_alloc",
1348 block, bh->b_blocknr, retval); 1329 map);
1349 if (ret != 0) 1330 if (ret != 0)
1350 return ret; 1331 return ret;
1351 } 1332 }
@@ -1355,109 +1336,109 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1355/* Maximum number of blocks we map for direct IO at once. */ 1336/* Maximum number of blocks we map for direct IO at once. */
1356#define DIO_MAX_BLOCKS 4096 1337#define DIO_MAX_BLOCKS 4096
1357 1338
1358int ext4_get_block(struct inode *inode, sector_t iblock, 1339static int _ext4_get_block(struct inode *inode, sector_t iblock,
1359 struct buffer_head *bh_result, int create) 1340 struct buffer_head *bh, int flags)
1360{ 1341{
1361 handle_t *handle = ext4_journal_current_handle(); 1342 handle_t *handle = ext4_journal_current_handle();
1343 struct ext4_map_blocks map;
1362 int ret = 0, started = 0; 1344 int ret = 0, started = 0;
1363 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1364 int dio_credits; 1345 int dio_credits;
1365 1346
1366 if (create && !handle) { 1347 map.m_lblk = iblock;
1348 map.m_len = bh->b_size >> inode->i_blkbits;
1349
1350 if (flags && !handle) {
1367 /* Direct IO write... */ 1351 /* Direct IO write... */
1368 if (max_blocks > DIO_MAX_BLOCKS) 1352 if (map.m_len > DIO_MAX_BLOCKS)
1369 max_blocks = DIO_MAX_BLOCKS; 1353 map.m_len = DIO_MAX_BLOCKS;
1370 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); 1354 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
1371 handle = ext4_journal_start(inode, dio_credits); 1355 handle = ext4_journal_start(inode, dio_credits);
1372 if (IS_ERR(handle)) { 1356 if (IS_ERR(handle)) {
1373 ret = PTR_ERR(handle); 1357 ret = PTR_ERR(handle);
1374 goto out; 1358 return ret;
1375 } 1359 }
1376 started = 1; 1360 started = 1;
1377 } 1361 }
1378 1362
1379 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, 1363 ret = ext4_map_blocks(handle, inode, &map, flags);
1380 create ? EXT4_GET_BLOCKS_CREATE : 0);
1381 if (ret > 0) { 1364 if (ret > 0) {
1382 bh_result->b_size = (ret << inode->i_blkbits); 1365 map_bh(bh, inode->i_sb, map.m_pblk);
1366 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
1367 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
1383 ret = 0; 1368 ret = 0;
1384 } 1369 }
1385 if (started) 1370 if (started)
1386 ext4_journal_stop(handle); 1371 ext4_journal_stop(handle);
1387out:
1388 return ret; 1372 return ret;
1389} 1373}
1390 1374
1375int ext4_get_block(struct inode *inode, sector_t iblock,
1376 struct buffer_head *bh, int create)
1377{
1378 return _ext4_get_block(inode, iblock, bh,
1379 create ? EXT4_GET_BLOCKS_CREATE : 0);
1380}
1381
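The thin ext4_get_block() wrapper preserves the get_block_t signature that generic buffer-cache code expects, while _ext4_get_block() copies the map result back into the buffer_head. A hypothetical caller, shown only to illustrate the contract (use_block() is made up):

        struct buffer_head bh = { .b_size = inode->i_sb->s_blocksize };
        int err;

        err = ext4_get_block(inode, iblock, &bh, 1);    /* create = 1: allocate on hole */
        if (!err && buffer_mapped(&bh))
                use_block(bh.b_blocknr);        /* physical block backing iblock */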
1391/* 1382/*
1392 * `handle' can be NULL if create is zero 1383 * `handle' can be NULL if create is zero
1393 */ 1384 */
1394struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 1385struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1395 ext4_lblk_t block, int create, int *errp) 1386 ext4_lblk_t block, int create, int *errp)
1396{ 1387{
1397 struct buffer_head dummy; 1388 struct ext4_map_blocks map;
1389 struct buffer_head *bh;
1398 int fatal = 0, err; 1390 int fatal = 0, err;
1399 int flags = 0;
1400 1391
1401 J_ASSERT(handle != NULL || create == 0); 1392 J_ASSERT(handle != NULL || create == 0);
1402 1393
1403 dummy.b_state = 0; 1394 map.m_lblk = block;
1404 dummy.b_blocknr = -1000; 1395 map.m_len = 1;
1405 buffer_trace_init(&dummy.b_history); 1396 err = ext4_map_blocks(handle, inode, &map,
1406 if (create) 1397 create ? EXT4_GET_BLOCKS_CREATE : 0);
1407 flags |= EXT4_GET_BLOCKS_CREATE; 1398
1408 err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags); 1399 if (err < 0)
1409 /* 1400 *errp = err;
1410 * ext4_get_blocks() returns number of blocks mapped. 0 in 1401 if (err <= 0)
1411 * case of a HOLE. 1402 return NULL;
1412 */ 1403 *errp = 0;
1413 if (err > 0) { 1404
1414 if (err > 1) 1405 bh = sb_getblk(inode->i_sb, map.m_pblk);
1415 WARN_ON(1); 1406 if (!bh) {
1416 err = 0; 1407 *errp = -EIO;
1408 return NULL;
1417 } 1409 }
1418 *errp = err; 1410 if (map.m_flags & EXT4_MAP_NEW) {
1419 if (!err && buffer_mapped(&dummy)) { 1411 J_ASSERT(create != 0);
1420 struct buffer_head *bh; 1412 J_ASSERT(handle != NULL);
1421 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1422 if (!bh) {
1423 *errp = -EIO;
1424 goto err;
1425 }
1426 if (buffer_new(&dummy)) {
1427 J_ASSERT(create != 0);
1428 J_ASSERT(handle != NULL);
1429 1413
1430 /* 1414 /*
1431 * Now that we do not always journal data, we should 1415 * Now that we do not always journal data, we should
1432 * keep in mind whether this should always journal the 1416 * keep in mind whether this should always journal the
1433 * new buffer as metadata. For now, regular file 1417 * new buffer as metadata. For now, regular file
1434 * writes use ext4_get_block instead, so it's not a 1418 * writes use ext4_get_block instead, so it's not a
1435 * problem. 1419 * problem.
1436 */ 1420 */
1437 lock_buffer(bh); 1421 lock_buffer(bh);
1438 BUFFER_TRACE(bh, "call get_create_access"); 1422 BUFFER_TRACE(bh, "call get_create_access");
1439 fatal = ext4_journal_get_create_access(handle, bh); 1423 fatal = ext4_journal_get_create_access(handle, bh);
1440 if (!fatal && !buffer_uptodate(bh)) { 1424 if (!fatal && !buffer_uptodate(bh)) {
1441 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 1425 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1442 set_buffer_uptodate(bh); 1426 set_buffer_uptodate(bh);
1443 }
1444 unlock_buffer(bh);
1445 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1446 err = ext4_handle_dirty_metadata(handle, inode, bh);
1447 if (!fatal)
1448 fatal = err;
1449 } else {
1450 BUFFER_TRACE(bh, "not a new buffer");
1451 }
1452 if (fatal) {
1453 *errp = fatal;
1454 brelse(bh);
1455 bh = NULL;
1456 } 1427 }
1457 return bh; 1428 unlock_buffer(bh);
1429 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1430 err = ext4_handle_dirty_metadata(handle, inode, bh);
1431 if (!fatal)
1432 fatal = err;
1433 } else {
1434 BUFFER_TRACE(bh, "not a new buffer");
1458 } 1435 }
1459err: 1436 if (fatal) {
1460 return NULL; 1437 *errp = fatal;
1438 brelse(bh);
1439 bh = NULL;
1440 }
1441 return bh;
1461} 1442}
1462 1443
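Note the new return convention: ext4_getblk() returns NULL both on error (with *errp < 0) and on a hole (*errp left untouched), so callers must pre-initialize the error variable. An illustrative caller:

        int err = 0;
        struct buffer_head *bh = ext4_getblk(handle, inode, block, 0, &err);

        if (!bh && err)
                return err;     /* mapping failed */
        if (!bh)
                return 0;       /* hole: nothing mapped at `block' */
        /* ... use bh ... */
        brelse(bh);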
1463struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1444struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
@@ -1860,7 +1841,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1860 int retries = 0; 1841 int retries = 0;
1861 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1842 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1862 struct ext4_inode_info *ei = EXT4_I(inode); 1843 struct ext4_inode_info *ei = EXT4_I(inode);
1863 unsigned long md_needed, md_reserved; 1844 unsigned long md_needed;
1864 int ret; 1845 int ret;
1865 1846
1866 /* 1847 /*
@@ -1870,22 +1851,24 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1870 */ 1851 */
1871repeat: 1852repeat:
1872 spin_lock(&ei->i_block_reservation_lock); 1853 spin_lock(&ei->i_block_reservation_lock);
1873 md_reserved = ei->i_reserved_meta_blocks;
1874 md_needed = ext4_calc_metadata_amount(inode, lblock); 1854 md_needed = ext4_calc_metadata_amount(inode, lblock);
1875 trace_ext4_da_reserve_space(inode, md_needed); 1855 trace_ext4_da_reserve_space(inode, md_needed);
1876 spin_unlock(&ei->i_block_reservation_lock); 1856 spin_unlock(&ei->i_block_reservation_lock);
1877 1857
1878 /* 1858 /*
1879 * Make quota reservation here to prevent quota overflow 1859 * We will charge metadata quota at writeout time; this saves
1880 * later. Real quota accounting is done at pages writeout 1860 * us from metadata over-estimation, though we may go over by
1881 * time. 1861 * a small amount in the end. Here we just reserve for data.
1882 */ 1862 */
1883 ret = dquot_reserve_block(inode, md_needed + 1); 1863 ret = dquot_reserve_block(inode, 1);
1884 if (ret) 1864 if (ret)
1885 return ret; 1865 return ret;
1886 1866 /*
1867 * We do still charge estimated metadata to the sb though;
1868 * we cannot afford to run out of free blocks.
1869 */
1887 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1870 if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
1888 dquot_release_reservation_block(inode, md_needed + 1); 1871 dquot_release_reservation_block(inode, 1);
1889 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1872 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1890 yield(); 1873 yield();
1891 goto repeat; 1874 goto repeat;
@@ -1910,6 +1893,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1910 1893
1911 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1894 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1912 1895
1896 trace_ext4_da_release_space(inode, to_free);
1913 if (unlikely(to_free > ei->i_reserved_data_blocks)) { 1897 if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1914 /* 1898 /*
1915 * if there aren't enough reserved blocks, then the 1899 * if there aren't enough reserved blocks, then the
@@ -1932,12 +1916,13 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1932 * only when we have written all of the delayed 1916 * only when we have written all of the delayed
1933 * allocation blocks. 1917 * allocation blocks.
1934 */ 1918 */
1935 to_free += ei->i_reserved_meta_blocks; 1919 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1920 ei->i_reserved_meta_blocks);
1936 ei->i_reserved_meta_blocks = 0; 1921 ei->i_reserved_meta_blocks = 0;
1937 ei->i_da_metadata_calc_len = 0; 1922 ei->i_da_metadata_calc_len = 0;
1938 } 1923 }
1939 1924
1940 /* update fs dirty blocks counter */ 1925 /* update fs dirty data blocks counter */
1941 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); 1926 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
1942 1927
1943 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1928 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2042,28 +2027,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
2042/* 2027/*
2043 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers 2028 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
2044 * 2029 *
2045 * @mpd->inode - inode to walk through
2046 * @exbh->b_blocknr - first block on a disk
2047 * @exbh->b_size - amount of space in bytes
2048 * @logical - first logical block to start assignment with
2049 *
2050 * the function goes through all passed space and puts actual disk 2030 * the function goes through all passed space and puts actual disk
2051 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten 2031 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
2052 */ 2032 */
2053static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, 2033static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
2054 struct buffer_head *exbh) 2034 struct ext4_map_blocks *map)
2055{ 2035{
2056 struct inode *inode = mpd->inode; 2036 struct inode *inode = mpd->inode;
2057 struct address_space *mapping = inode->i_mapping; 2037 struct address_space *mapping = inode->i_mapping;
2058 int blocks = exbh->b_size >> inode->i_blkbits; 2038 int blocks = map->m_len;
2059 sector_t pblock = exbh->b_blocknr, cur_logical; 2039 sector_t pblock = map->m_pblk, cur_logical;
2060 struct buffer_head *head, *bh; 2040 struct buffer_head *head, *bh;
2061 pgoff_t index, end; 2041 pgoff_t index, end;
2062 struct pagevec pvec; 2042 struct pagevec pvec;
2063 int nr_pages, i; 2043 int nr_pages, i;
2064 2044
2065 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2045 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2066 end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2046 end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2067 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2047 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2068 2048
2069 pagevec_init(&pvec, 0); 2049 pagevec_init(&pvec, 0);
@@ -2090,17 +2070,16 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2090 2070
2091 /* skip blocks out of the range */ 2071 /* skip blocks out of the range */
2092 do { 2072 do {
2093 if (cur_logical >= logical) 2073 if (cur_logical >= map->m_lblk)
2094 break; 2074 break;
2095 cur_logical++; 2075 cur_logical++;
2096 } while ((bh = bh->b_this_page) != head); 2076 } while ((bh = bh->b_this_page) != head);
2097 2077
2098 do { 2078 do {
2099 if (cur_logical >= logical + blocks) 2079 if (cur_logical >= map->m_lblk + blocks)
2100 break; 2080 break;
2101 2081
2102 if (buffer_delay(bh) || 2082 if (buffer_delay(bh) || buffer_unwritten(bh)) {
2103 buffer_unwritten(bh)) {
2104 2083
2105 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); 2084 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
2106 2085
@@ -2119,7 +2098,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2119 } else if (buffer_mapped(bh)) 2098 } else if (buffer_mapped(bh))
2120 BUG_ON(bh->b_blocknr != pblock); 2099 BUG_ON(bh->b_blocknr != pblock);
2121 2100
2122 if (buffer_uninit(exbh)) 2101 if (map->m_flags & EXT4_MAP_UNINIT)
2123 set_buffer_uninit(bh); 2102 set_buffer_uninit(bh);
2124 cur_logical++; 2103 cur_logical++;
2125 pblock++; 2104 pblock++;
@@ -2130,21 +2109,6 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2130} 2109}
2131 2110
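The index arithmetic above divides logical blocks by blocks-per-page. A quick worked example with 4K pages (PAGE_CACHE_SHIFT = 12) and 1K filesystem blocks (i_blkbits = 10):

        pgoff_t index = 13 >> (12 - 10);                /* logical block 13 lives in page 3 */
        sector_t cur_logical = index << (12 - 10);      /* page 3 starts at logical block 12 */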
2132 2111
2133/*
2134 * __unmap_underlying_blocks - just a helper function to unmap
2135 * set of blocks described by @bh
2136 */
2137static inline void __unmap_underlying_blocks(struct inode *inode,
2138 struct buffer_head *bh)
2139{
2140 struct block_device *bdev = inode->i_sb->s_bdev;
2141 int blocks, i;
2142
2143 blocks = bh->b_size >> inode->i_blkbits;
2144 for (i = 0; i < blocks; i++)
2145 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
2146}
2147
2148static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, 2112static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2149 sector_t logical, long blk_cnt) 2113 sector_t logical, long blk_cnt)
2150{ 2114{
@@ -2206,7 +2170,7 @@ static void ext4_print_free_blocks(struct inode *inode)
2206static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2170static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2207{ 2171{
2208 int err, blks, get_blocks_flags; 2172 int err, blks, get_blocks_flags;
2209 struct buffer_head new; 2173 struct ext4_map_blocks map;
2210 sector_t next = mpd->b_blocknr; 2174 sector_t next = mpd->b_blocknr;
2211 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2175 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2212 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2176 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
@@ -2247,15 +2211,15 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2247 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting 2211 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
2248 * variables are updated after the blocks have been allocated. 2212 * variables are updated after the blocks have been allocated.
2249 */ 2213 */
2250 new.b_state = 0; 2214 map.m_lblk = next;
2215 map.m_len = max_blocks;
2251 get_blocks_flags = EXT4_GET_BLOCKS_CREATE; 2216 get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2252 if (ext4_should_dioread_nolock(mpd->inode)) 2217 if (ext4_should_dioread_nolock(mpd->inode))
2253 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 2218 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2254 if (mpd->b_state & (1 << BH_Delay)) 2219 if (mpd->b_state & (1 << BH_Delay))
2255 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2220 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2256 2221
2257 blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, 2222 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
2258 &new, get_blocks_flags);
2259 if (blks < 0) { 2223 if (blks < 0) {
2260 err = blks; 2224 err = blks;
2261 /* 2225 /*
@@ -2282,7 +2246,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2282 ext4_msg(mpd->inode->i_sb, KERN_CRIT, 2246 ext4_msg(mpd->inode->i_sb, KERN_CRIT,
2283 "delayed block allocation failed for inode %lu at " 2247 "delayed block allocation failed for inode %lu at "
2284 "logical offset %llu with max blocks %zd with " 2248 "logical offset %llu with max blocks %zd with "
2285 "error %d\n", mpd->inode->i_ino, 2249 "error %d", mpd->inode->i_ino,
2286 (unsigned long long) next, 2250 (unsigned long long) next,
2287 mpd->b_size >> mpd->inode->i_blkbits, err); 2251 mpd->b_size >> mpd->inode->i_blkbits, err);
2288 printk(KERN_CRIT "This should not happen!! " 2252 printk(KERN_CRIT "This should not happen!! "
@@ -2297,10 +2261,13 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2297 } 2261 }
2298 BUG_ON(blks == 0); 2262 BUG_ON(blks == 0);
2299 2263
2300 new.b_size = (blks << mpd->inode->i_blkbits); 2264 if (map.m_flags & EXT4_MAP_NEW) {
2265 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2266 int i;
2301 2267
2302 if (buffer_new(&new)) 2268 for (i = 0; i < map.m_len; i++)
2303 __unmap_underlying_blocks(mpd->inode, &new); 2269 unmap_underlying_metadata(bdev, map.m_pblk + i);
2270 }
2304 2271
2305 /* 2272 /*
2306 * If blocks are delayed marked, we need to 2273 * If blocks are delayed marked, we need to
@@ -2308,7 +2275,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2308 */ 2275 */
2309 if ((mpd->b_state & (1 << BH_Delay)) || 2276 if ((mpd->b_state & (1 << BH_Delay)) ||
2310 (mpd->b_state & (1 << BH_Unwritten))) 2277 (mpd->b_state & (1 << BH_Unwritten)))
2311 mpage_put_bnr_to_bhs(mpd, next, &new); 2278 mpage_put_bnr_to_bhs(mpd, &map);
2312 2279
2313 if (ext4_should_order_data(mpd->inode)) { 2280 if (ext4_should_order_data(mpd->inode)) {
2314 err = ext4_jbd2_file_inode(handle, mpd->inode); 2281 err = ext4_jbd2_file_inode(handle, mpd->inode);
@@ -2349,8 +2316,17 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2349 sector_t next; 2316 sector_t next;
2350 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; 2317 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
2351 2318
2319 /*
2320 * XXX Don't go larger than mballoc is willing to allocate
2321 * This is a stopgap solution. We eventually need to fold
2322 * mpage_da_submit_io() into this function and then call
2323 * ext4_get_blocks() multiple times in a loop
2324 */
2325 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
2326 goto flush_it;
2327
2352 /* check if the reserved journal credits might overflow */ 2328 /* check if the reserved journal credits might overflow */
2353 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 2329 if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
2354 if (nrblocks >= EXT4_MAX_TRANS_DATA) { 2330 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
2355 /* 2331 /*
2356 * With non-extent format we are limited by the journal 2332 * With non-extent format we are limited by the journal
@@ -2423,17 +2399,6 @@ static int __mpage_da_writepage(struct page *page,
2423 struct buffer_head *bh, *head; 2399 struct buffer_head *bh, *head;
2424 sector_t logical; 2400 sector_t logical;
2425 2401
2426 if (mpd->io_done) {
2427 /*
2428 * Rest of the page in the page_vec
2429 * redirty then and skip then. We will
2430 * try to write them again after
2431 * starting a new transaction
2432 */
2433 redirty_page_for_writepage(wbc, page);
2434 unlock_page(page);
2435 return MPAGE_DA_EXTENT_TAIL;
2436 }
2437 /* 2402 /*
2438 * Can we merge this page to current extent? 2403 * Can we merge this page to current extent?
2439 */ 2404 */
@@ -2528,8 +2493,9 @@ static int __mpage_da_writepage(struct page *page,
2528 * initialized properly. 2493 * initialized properly.
2529 */ 2494 */
2530static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2495static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2531 struct buffer_head *bh_result, int create) 2496 struct buffer_head *bh, int create)
2532{ 2497{
2498 struct ext4_map_blocks map;
2533 int ret = 0; 2499 int ret = 0;
2534 sector_t invalid_block = ~((sector_t) 0xffff); 2500 sector_t invalid_block = ~((sector_t) 0xffff);
2535 2501
@@ -2537,16 +2503,22 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2537 invalid_block = ~0; 2503 invalid_block = ~0;
2538 2504
2539 BUG_ON(create == 0); 2505 BUG_ON(create == 0);
2540 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2506 BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
2507
2508 map.m_lblk = iblock;
2509 map.m_len = 1;
2541 2510
2542 /* 2511 /*
2543 * first, we need to know whether the block is allocated already 2512 * first, we need to know whether the block is allocated already
2544 * preallocated blocks are unmapped but should be treated 2513 * preallocated blocks are unmapped but should be treated
2545 * the same as allocated blocks. 2514 * the same as allocated blocks.
2546 */ 2515 */
2547 ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0); 2516 ret = ext4_map_blocks(NULL, inode, &map, 0);
2548 if ((ret == 0) && !buffer_delay(bh_result)) { 2517 if (ret < 0)
2549 /* the block isn't (pre)allocated yet, let's reserve space */ 2518 return ret;
2519 if (ret == 0) {
2520 if (buffer_delay(bh))
2521 return 0; /* Not sure this could or should happen */
2550 /* 2522 /*
2551 * XXX: __block_prepare_write() unmaps passed block, 2523 * XXX: __block_prepare_write() unmaps passed block,
2552 * is it OK? 2524 * is it OK?
@@ -2556,26 +2528,26 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2556 /* not enough space to reserve */ 2528 /* not enough space to reserve */
2557 return ret; 2529 return ret;
2558 2530
2559 map_bh(bh_result, inode->i_sb, invalid_block); 2531 map_bh(bh, inode->i_sb, invalid_block);
2560 set_buffer_new(bh_result); 2532 set_buffer_new(bh);
2561 set_buffer_delay(bh_result); 2533 set_buffer_delay(bh);
2562 } else if (ret > 0) { 2534 return 0;
2563 bh_result->b_size = (ret << inode->i_blkbits);
2564 if (buffer_unwritten(bh_result)) {
2565 /* A delayed write to unwritten bh should
2566 * be marked new and mapped. Mapped ensures
2567 * that we don't do get_block multiple times
2568 * when we write to the same offset and new
2569 * ensures that we do proper zero out for
2570 * partial write.
2571 */
2572 set_buffer_new(bh_result);
2573 set_buffer_mapped(bh_result);
2574 }
2575 ret = 0;
2576 } 2535 }
2577 2536
2578 return ret; 2537 map_bh(bh, inode->i_sb, map.m_pblk);
2538 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
2539
2540 if (buffer_unwritten(bh)) {
2541 /* A delayed write to unwritten bh should be marked
2542 * new and mapped. Mapped ensures that we don't do
2543 * get_block multiple times when we write to the same
2544 * offset and new ensures that we do proper zero out
2545 * for partial write.
2546 */
2547 set_buffer_new(bh);
2548 set_buffer_mapped(bh);
2549 }
2550 return 0;
2579} 2551}
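For the hole-with-reservation case, ext4_da_get_block_prep() maps the buffer to a sentinel block number; the flag combination is what writeback later keys on. The convention in isolation (sketch; invalid_block is the placeholder defined above):

        map_bh(bh, inode->i_sb, invalid_block); /* placeholder pblk, never submitted for I/O */
        set_buffer_new(bh);     /* forces zero-out on partial writes */
        set_buffer_delay(bh);   /* real allocation is deferred to writeback */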
2580 2552
2581/* 2553/*
@@ -2597,21 +2569,8 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2597static int noalloc_get_block_write(struct inode *inode, sector_t iblock, 2569static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2598 struct buffer_head *bh_result, int create) 2570 struct buffer_head *bh_result, int create)
2599{ 2571{
2600 int ret = 0;
2601 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2602
2603 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2572 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2604 2573 return _ext4_get_block(inode, iblock, bh_result, 0);
2605 /*
2606 * we don't want to do block allocation in writepage
2607 * so call get_block_wrap with create = 0
2608 */
2609 ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
2610 if (ret > 0) {
2611 bh_result->b_size = (ret << inode->i_blkbits);
2612 ret = 0;
2613 }
2614 return ret;
2615} 2574}
2616 2575
2617static int bget_one(handle_t *handle, struct buffer_head *bh) 2576static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -2821,13 +2780,131 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2821 * number of contiguous block. So we will limit 2780 * number of contiguous block. So we will limit
2822 * number of contiguous block to a sane value 2781 * number of contiguous block to a sane value
2823 */ 2782 */
2824 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && 2783 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
2825 (max_blocks > EXT4_MAX_TRANS_DATA)) 2784 (max_blocks > EXT4_MAX_TRANS_DATA))
2826 max_blocks = EXT4_MAX_TRANS_DATA; 2785 max_blocks = EXT4_MAX_TRANS_DATA;
2827 2786
2828 return ext4_chunk_trans_blocks(inode, max_blocks); 2787 return ext4_chunk_trans_blocks(inode, max_blocks);
2829} 2788}
2830 2789
2790/*
2791 * write_cache_pages_da - walk the list of dirty pages of the given
2792 * address space and call the callback function (which usually writes
2793 * the pages).
2794 *
2795 * This is a forked version of write_cache_pages(). Differences:
2796 * Range cyclic is ignored.
2797 * no_nrwrite_index_update is always presumed true
2798 */
2799static int write_cache_pages_da(struct address_space *mapping,
2800 struct writeback_control *wbc,
2801 struct mpage_da_data *mpd)
2802{
2803 int ret = 0;
2804 int done = 0;
2805 struct pagevec pvec;
2806 int nr_pages;
2807 pgoff_t index;
2808 pgoff_t end; /* Inclusive */
2809 long nr_to_write = wbc->nr_to_write;
2810
2811 pagevec_init(&pvec, 0);
2812 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2813 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2814
2815 while (!done && (index <= end)) {
2816 int i;
2817
2818 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2819 PAGECACHE_TAG_DIRTY,
2820 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2821 if (nr_pages == 0)
2822 break;
2823
2824 for (i = 0; i < nr_pages; i++) {
2825 struct page *page = pvec.pages[i];
2826
2827 /*
2828 * At this point, the page may be truncated or
2829 * invalidated (changing page->mapping to NULL), or
2830 * even swizzled back from swapper_space to tmpfs file
2831 * mapping. However, page->index will not change
2832 * because we have a reference on the page.
2833 */
2834 if (page->index > end) {
2835 done = 1;
2836 break;
2837 }
2838
2839 lock_page(page);
2840
2841 /*
2842 * Page truncated or invalidated. We can freely skip it
2843 * then, even for data integrity operations: the page
2844 * has disappeared concurrently, so there could be no
2845 * real expectation of this data integrity operation
2846 * even if there is now a new, dirty page at the same
2847 * pagecache address.
2848 */
2849 if (unlikely(page->mapping != mapping)) {
2850continue_unlock:
2851 unlock_page(page);
2852 continue;
2853 }
2854
2855 if (!PageDirty(page)) {
2856 /* someone wrote it for us */
2857 goto continue_unlock;
2858 }
2859
2860 if (PageWriteback(page)) {
2861 if (wbc->sync_mode != WB_SYNC_NONE)
2862 wait_on_page_writeback(page);
2863 else
2864 goto continue_unlock;
2865 }
2866
2867 BUG_ON(PageWriteback(page));
2868 if (!clear_page_dirty_for_io(page))
2869 goto continue_unlock;
2870
2871 ret = __mpage_da_writepage(page, wbc, mpd);
2872 if (unlikely(ret)) {
2873 if (ret == AOP_WRITEPAGE_ACTIVATE) {
2874 unlock_page(page);
2875 ret = 0;
2876 } else {
2877 done = 1;
2878 break;
2879 }
2880 }
2881
2882 if (nr_to_write > 0) {
2883 nr_to_write--;
2884 if (nr_to_write == 0 &&
2885 wbc->sync_mode == WB_SYNC_NONE) {
2886 /*
2887 * We stop writing back only if we are
2888 * not doing integrity sync. In case of
2889 * integrity sync we have to keep going
2890 * because someone may be concurrently
2891 * dirtying pages, and we might have
2892 * synced a lot of newly appeared dirty
2893 * pages, but have not synced all of the
2894 * old dirty pages.
2895 */
2896 done = 1;
2897 break;
2898 }
2899 }
2900 }
2901 pagevec_release(&pvec);
2902 cond_resched();
2903 }
2904 return ret;
2905}
2906
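Stripped of the skip-and-retry details, write_cache_pages_da() is the standard tagged-pagevec walk; its skeleton looks like this (sketch, per-page work elided):

        struct pagevec pvec;
        pgoff_t index = wbc->range_start >> PAGE_CACHE_SHIFT;
        pgoff_t end = wbc->range_end >> PAGE_CACHE_SHIFT;
        int i, nr_pages;

        pagevec_init(&pvec, 0);
        while (index <= end) {
                nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
                                              PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE);
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
                        /* lock pvec.pages[i], re-check mapping and dirtiness,
                         * then hand the page to __mpage_da_writepage() */
                }
                pagevec_release(&pvec);
                cond_resched();
        }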
2907
2831static int ext4_da_writepages(struct address_space *mapping, 2908static int ext4_da_writepages(struct address_space *mapping,
2832 struct writeback_control *wbc) 2909 struct writeback_control *wbc)
2833{ 2910{
@@ -2836,7 +2913,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2836 handle_t *handle = NULL; 2913 handle_t *handle = NULL;
2837 struct mpage_da_data mpd; 2914 struct mpage_da_data mpd;
2838 struct inode *inode = mapping->host; 2915 struct inode *inode = mapping->host;
2839 int no_nrwrite_index_update;
2840 int pages_written = 0; 2916 int pages_written = 0;
2841 long pages_skipped; 2917 long pages_skipped;
2842 unsigned int max_pages; 2918 unsigned int max_pages;
@@ -2916,12 +2992,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2916 mpd.wbc = wbc; 2992 mpd.wbc = wbc;
2917 mpd.inode = mapping->host; 2993 mpd.inode = mapping->host;
2918 2994
2919 /*
2920 * we don't want write_cache_pages to update
2921 * nr_to_write and writeback_index
2922 */
2923 no_nrwrite_index_update = wbc->no_nrwrite_index_update;
2924 wbc->no_nrwrite_index_update = 1;
2925 pages_skipped = wbc->pages_skipped; 2995 pages_skipped = wbc->pages_skipped;
2926 2996
2927retry: 2997retry:
@@ -2941,7 +3011,7 @@ retry:
2941 if (IS_ERR(handle)) { 3011 if (IS_ERR(handle)) {
2942 ret = PTR_ERR(handle); 3012 ret = PTR_ERR(handle);
2943 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 3013 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2944 "%ld pages, ino %lu; err %d\n", __func__, 3014 "%ld pages, ino %lu; err %d", __func__,
2945 wbc->nr_to_write, inode->i_ino, ret); 3015 wbc->nr_to_write, inode->i_ino, ret);
2946 goto out_writepages; 3016 goto out_writepages;
2947 } 3017 }
@@ -2963,8 +3033,7 @@ retry:
2963 mpd.io_done = 0; 3033 mpd.io_done = 0;
2964 mpd.pages_written = 0; 3034 mpd.pages_written = 0;
2965 mpd.retval = 0; 3035 mpd.retval = 0;
2966 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, 3036 ret = write_cache_pages_da(mapping, wbc, &mpd);
2967 &mpd);
2968 /* 3037 /*
2969 * If we have a contiguous extent of pages and we 3038 * If we have a contiguous extent of pages and we
2970 * haven't done the I/O yet, map the blocks and submit 3039 * haven't done the I/O yet, map the blocks and submit
@@ -3016,7 +3085,7 @@ retry:
3016 if (pages_skipped != wbc->pages_skipped) 3085 if (pages_skipped != wbc->pages_skipped)
3017 ext4_msg(inode->i_sb, KERN_CRIT, 3086 ext4_msg(inode->i_sb, KERN_CRIT,
3018 "This should not happen leaving %s " 3087 "This should not happen leaving %s "
3019 "with nr_to_write = %ld ret = %d\n", 3088 "with nr_to_write = %ld ret = %d",
3020 __func__, wbc->nr_to_write, ret); 3089 __func__, wbc->nr_to_write, ret);
3021 3090
3022 /* Update index */ 3091 /* Update index */
@@ -3030,8 +3099,6 @@ retry:
3030 mapping->writeback_index = index; 3099 mapping->writeback_index = index;
3031 3100
3032out_writepages: 3101out_writepages:
3033 if (!no_nrwrite_index_update)
3034 wbc->no_nrwrite_index_update = 0;
3035 wbc->nr_to_write -= nr_to_writebump; 3102 wbc->nr_to_write -= nr_to_writebump;
3036 wbc->range_start = range_start; 3103 wbc->range_start = range_start;
3037 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3104 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
@@ -3076,7 +3143,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3076 loff_t pos, unsigned len, unsigned flags, 3143 loff_t pos, unsigned len, unsigned flags,
3077 struct page **pagep, void **fsdata) 3144 struct page **pagep, void **fsdata)
3078{ 3145{
3079 int ret, retries = 0, quota_retries = 0; 3146 int ret, retries = 0;
3080 struct page *page; 3147 struct page *page;
3081 pgoff_t index; 3148 pgoff_t index;
3082 unsigned from, to; 3149 unsigned from, to;
@@ -3135,22 +3202,6 @@ retry:
3135 3202
3136 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3203 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3137 goto retry; 3204 goto retry;
3138
3139 if ((ret == -EDQUOT) &&
3140 EXT4_I(inode)->i_reserved_meta_blocks &&
3141 (quota_retries++ < 3)) {
3142 /*
3143 * Since we often over-estimate the number of meta
3144 * data blocks required, we may sometimes get a
3145 * spurios out of quota error even though there would
3146 * be enough space once we write the data blocks and
3147 * find out how many meta data blocks were _really_
3148 * required. So try forcing the inode write to see if
3149 * that helps.
3150 */
3151 write_inode_now(inode, (quota_retries == 3));
3152 goto retry;
3153 }
3154out: 3205out:
3155 return ret; 3206 return ret;
3156} 3207}
@@ -3546,46 +3597,18 @@ out:
3546 return ret; 3597 return ret;
3547} 3598}
3548 3599
3600/*
3601 * ext4_get_block used when preparing for a DIO write or buffer write.
3602 * We allocate an uninitialized extent if blocks haven't been allocated.
3603 * The extent will be converted to initialized after the IO is complete.
3604 */
3549static int ext4_get_block_write(struct inode *inode, sector_t iblock, 3605static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3550 struct buffer_head *bh_result, int create) 3606 struct buffer_head *bh_result, int create)
3551{ 3607{
3552 handle_t *handle = ext4_journal_current_handle();
3553 int ret = 0;
3554 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3555 int dio_credits;
3556 int started = 0;
3557
3558 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", 3608 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3559 inode->i_ino, create); 3609 inode->i_ino, create);
3560 /* 3610 return _ext4_get_block(inode, iblock, bh_result,
3561 * ext4_get_block in prepare for a DIO write or buffer write. 3611 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3562 * We allocate an uinitialized extent if blocks haven't been allocated.
3563 * The extent will be converted to initialized after IO complete.
3564 */
3565 create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
3566
3567 if (!handle) {
3568 if (max_blocks > DIO_MAX_BLOCKS)
3569 max_blocks = DIO_MAX_BLOCKS;
3570 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3571 handle = ext4_journal_start(inode, dio_credits);
3572 if (IS_ERR(handle)) {
3573 ret = PTR_ERR(handle);
3574 goto out;
3575 }
3576 started = 1;
3577 }
3578
3579 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3580 create);
3581 if (ret > 0) {
3582 bh_result->b_size = (ret << inode->i_blkbits);
3583 ret = 0;
3584 }
3585 if (started)
3586 ext4_journal_stop(handle);
3587out:
3588 return ret;
3589} 3612}
3590 3613
3591static void dump_completed_IO(struct inode * inode) 3614static void dump_completed_IO(struct inode * inode)
@@ -3973,7 +3996,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3973 struct file *file = iocb->ki_filp; 3996 struct file *file = iocb->ki_filp;
3974 struct inode *inode = file->f_mapping->host; 3997 struct inode *inode = file->f_mapping->host;
3975 3998
3976 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 3999 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3977 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 4000 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3978 4001
3979 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 4002 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -4302,10 +4325,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4302 4325
4303 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, 4326 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4304 count)) { 4327 count)) {
4305 ext4_error(inode->i_sb, "inode #%lu: " 4328 EXT4_ERROR_INODE(inode, "attempt to clear invalid "
4306 "attempt to clear blocks %llu len %lu, invalid", 4329 "blocks %llu len %lu",
4307 inode->i_ino, (unsigned long long) block_to_free, 4330 (unsigned long long) block_to_free, count);
4308 count);
4309 return 1; 4331 return 1;
4310 } 4332 }
4311 4333
@@ -4410,11 +4432,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4410 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 4432 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4411 ext4_handle_dirty_metadata(handle, inode, this_bh); 4433 ext4_handle_dirty_metadata(handle, inode, this_bh);
4412 else 4434 else
4413 ext4_error(inode->i_sb, 4435 EXT4_ERROR_INODE(inode,
4414 "circular indirect block detected, " 4436 "circular indirect block detected at "
4415 "inode=%lu, block=%llu", 4437 "block %llu",
4416 inode->i_ino, 4438 (unsigned long long) this_bh->b_blocknr);
4417 (unsigned long long) this_bh->b_blocknr);
4418 } 4439 }
4419} 4440}
4420 4441
@@ -4452,11 +4473,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4452 4473
4453 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), 4474 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4454 nr, 1)) { 4475 nr, 1)) {
4455 ext4_error(inode->i_sb, 4476 EXT4_ERROR_INODE(inode,
4456 "indirect mapped block in inode " 4477 "invalid indirect mapped "
4457 "#%lu invalid (level %d, blk #%lu)", 4478 "block %lu (level %d)",
4458 inode->i_ino, depth, 4479 (unsigned long) nr, depth);
4459 (unsigned long) nr);
4460 break; 4480 break;
4461 } 4481 }
4462 4482
@@ -4468,9 +4488,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4468 * (should be rare). 4488 * (should be rare).
4469 */ 4489 */
4470 if (!bh) { 4490 if (!bh) {
4471 ext4_error(inode->i_sb, 4491 EXT4_ERROR_INODE(inode,
4472 "Read failure, inode=%lu, block=%llu", 4492 "Read failure block=%llu",
4473 inode->i_ino, nr); 4493 (unsigned long long) nr);
4474 continue; 4494 continue;
4475 } 4495 }
4476 4496
@@ -4612,12 +4632,12 @@ void ext4_truncate(struct inode *inode)
4612 if (!ext4_can_truncate(inode)) 4632 if (!ext4_can_truncate(inode))
4613 return; 4633 return;
4614 4634
4615 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; 4635 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4616 4636
4617 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4637 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4618 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 4638 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4619 4639
4620 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 4640 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4621 ext4_ext_truncate(inode); 4641 ext4_ext_truncate(inode);
4622 return; 4642 return;
4623 } 4643 }
@@ -4785,8 +4805,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
4785 4805
4786 bh = sb_getblk(sb, block); 4806 bh = sb_getblk(sb, block);
4787 if (!bh) { 4807 if (!bh) {
4788 ext4_error(sb, "unable to read inode block - " 4808 EXT4_ERROR_INODE(inode, "unable to read inode block - "
4789 "inode=%lu, block=%llu", inode->i_ino, block); 4809 "block %llu", block);
4790 return -EIO; 4810 return -EIO;
4791 } 4811 }
4792 if (!buffer_uptodate(bh)) { 4812 if (!buffer_uptodate(bh)) {
@@ -4884,8 +4904,8 @@ make_io:
4884 submit_bh(READ_META, bh); 4904 submit_bh(READ_META, bh);
4885 wait_on_buffer(bh); 4905 wait_on_buffer(bh);
4886 if (!buffer_uptodate(bh)) { 4906 if (!buffer_uptodate(bh)) {
4887 ext4_error(sb, "unable to read inode block - inode=%lu," 4907 EXT4_ERROR_INODE(inode, "unable to read inode "
4888 " block=%llu", inode->i_ino, block); 4908 "block %llu", block);
4889 brelse(bh); 4909 brelse(bh);
4890 return -EIO; 4910 return -EIO;
4891 } 4911 }
@@ -4922,20 +4942,26 @@ void ext4_set_inode_flags(struct inode *inode)
4922/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 4942/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
4923void ext4_get_inode_flags(struct ext4_inode_info *ei) 4943void ext4_get_inode_flags(struct ext4_inode_info *ei)
4924{ 4944{
4925 unsigned int flags = ei->vfs_inode.i_flags; 4945 unsigned int vfs_fl;
4926 4946 unsigned long old_fl, new_fl;
4927 ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL| 4947
4928 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); 4948 do {
4929 if (flags & S_SYNC) 4949 vfs_fl = ei->vfs_inode.i_flags;
4930 ei->i_flags |= EXT4_SYNC_FL; 4950 old_fl = ei->i_flags;
4931 if (flags & S_APPEND) 4951 new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
4932 ei->i_flags |= EXT4_APPEND_FL; 4952 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
4933 if (flags & S_IMMUTABLE) 4953 EXT4_DIRSYNC_FL);
4934 ei->i_flags |= EXT4_IMMUTABLE_FL; 4954 if (vfs_fl & S_SYNC)
4935 if (flags & S_NOATIME) 4955 new_fl |= EXT4_SYNC_FL;
4936 ei->i_flags |= EXT4_NOATIME_FL; 4956 if (vfs_fl & S_APPEND)
4937 if (flags & S_DIRSYNC) 4957 new_fl |= EXT4_APPEND_FL;
4938 ei->i_flags |= EXT4_DIRSYNC_FL; 4958 if (vfs_fl & S_IMMUTABLE)
4959 new_fl |= EXT4_IMMUTABLE_FL;
4960 if (vfs_fl & S_NOATIME)
4961 new_fl |= EXT4_NOATIME_FL;
4962 if (vfs_fl & S_DIRSYNC)
4963 new_fl |= EXT4_DIRSYNC_FL;
4964 } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
4939} 4965}
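The cmpxchg() loop replaces a non-atomic read-modify-write, so a racing i_flags update can no longer be lost: each pass recomputes new_fl from a fresh snapshot and commits only if nobody changed the word in between. The idiom in isolation (sketch; compute_new() is a hypothetical helper):

        unsigned long old, new;

        do {
                old = ei->i_flags;      /* snapshot the current value */
                new = compute_new(old); /* derive the desired value from it */
        } while (cmpxchg(&ei->i_flags, old, new) != old);       /* retry on race */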
4940 4966
4941static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 4967static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
@@ -5096,8 +5122,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5096 ret = 0; 5122 ret = 0;
5097 if (ei->i_file_acl && 5123 if (ei->i_file_acl &&
5098 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 5124 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
5099 ext4_error(sb, "bad extended attribute block %llu inode #%lu", 5125 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
5100 ei->i_file_acl, inode->i_ino); 5126 ei->i_file_acl);
5101 ret = -EIO; 5127 ret = -EIO;
5102 goto bad_inode; 5128 goto bad_inode;
5103 } else if (ei->i_flags & EXT4_EXTENTS_FL) { 5129 } else if (ei->i_flags & EXT4_EXTENTS_FL) {
@@ -5142,8 +5168,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5142 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5168 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
5143 } else { 5169 } else {
5144 ret = -EIO; 5170 ret = -EIO;
5145 ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu", 5171 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
5146 inode->i_mode, inode->i_ino);
5147 goto bad_inode; 5172 goto bad_inode;
5148 } 5173 }
5149 brelse(iloc.bh); 5174 brelse(iloc.bh);
@@ -5172,7 +5197,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
5172 */ 5197 */
5173 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5198 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5174 raw_inode->i_blocks_high = 0; 5199 raw_inode->i_blocks_high = 0;
5175 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 5200 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5176 return 0; 5201 return 0;
5177 } 5202 }
5178 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) 5203 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
@@ -5185,9 +5210,9 @@ static int ext4_inode_blocks_set(handle_t *handle,
5185 */ 5210 */
5186 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5211 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5187 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 5212 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
5188 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 5213 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5189 } else { 5214 } else {
5190 ei->i_flags |= EXT4_HUGE_FILE_FL; 5215 ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5191 /* i_block is stored in file system block size */ 5216 /* i_block is stored in file system block size */
5192 i_blocks = i_blocks >> (inode->i_blkbits - 9); 5217 i_blocks = i_blocks >> (inode->i_blkbits - 9);
5193 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5218 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
@@ -5381,9 +5406,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5381 if (wbc->sync_mode == WB_SYNC_ALL) 5406 if (wbc->sync_mode == WB_SYNC_ALL)
5382 sync_dirty_buffer(iloc.bh); 5407 sync_dirty_buffer(iloc.bh);
5383 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5408 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5384 ext4_error(inode->i_sb, "IO error syncing inode, " 5409 EXT4_ERROR_INODE(inode,
5385 "inode=%lu, block=%llu", inode->i_ino, 5410 "IO error syncing inode (block=%llu)",
5386 (unsigned long long)iloc.bh->b_blocknr); 5411 (unsigned long long) iloc.bh->b_blocknr);
5387 err = -EIO; 5412 err = -EIO;
5388 } 5413 }
5389 brelse(iloc.bh); 5414 brelse(iloc.bh);
@@ -5455,7 +5480,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5455 } 5480 }
5456 5481
5457 if (attr->ia_valid & ATTR_SIZE) { 5482 if (attr->ia_valid & ATTR_SIZE) {
5458 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 5483 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
5459 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5484 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5460 5485
5461 if (attr->ia_size > sbi->s_bitmap_maxbytes) { 5486 if (attr->ia_size > sbi->s_bitmap_maxbytes) {
@@ -5468,7 +5493,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5468 if (S_ISREG(inode->i_mode) && 5493 if (S_ISREG(inode->i_mode) &&
5469 attr->ia_valid & ATTR_SIZE && 5494 attr->ia_valid & ATTR_SIZE &&
5470 (attr->ia_size < inode->i_size || 5495 (attr->ia_size < inode->i_size ||
5471 (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) { 5496 (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5472 handle_t *handle; 5497 handle_t *handle;
5473 5498
5474 handle = ext4_journal_start(inode, 3); 5499 handle = ext4_journal_start(inode, 3);
@@ -5500,7 +5525,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5500 } 5525 }
5501 } 5526 }
5502 /* ext4_truncate will clear the flag */ 5527 /* ext4_truncate will clear the flag */
5503 if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) 5528 if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5504 ext4_truncate(inode); 5529 ext4_truncate(inode);
5505 } 5530 }
5506 5531
@@ -5576,7 +5601,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5576 5601
5577static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5602static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5578{ 5603{
5579 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 5604 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5580 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 5605 return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
5581 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 5606 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
5582} 5607}
@@ -5911,9 +5936,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
5911 */ 5936 */
5912 5937
5913 if (val) 5938 if (val)
5914 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; 5939 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5915 else 5940 else
5916 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; 5941 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5917 ext4_set_aops(inode); 5942 ext4_set_aops(inode);
5918 5943
5919 jbd2_journal_unlock_updates(journal); 5944 jbd2_journal_unlock_updates(journal);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 016d0249294f..bf5ae883b1bd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -258,7 +258,7 @@ setversion_out:
258 if (me.moved_len > 0) 258 if (me.moved_len > 0)
259 file_remove_suid(donor_filp); 259 file_remove_suid(donor_filp);
260 260
261 if (copy_to_user((struct move_extent __user *)arg, 261 if (copy_to_user((struct move_extent __user *)arg,
262 &me, sizeof(me))) 262 &me, sizeof(me)))
263 err = -EFAULT; 263 err = -EFAULT;
264mext_out: 264mext_out:
@@ -373,7 +373,30 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
373 case EXT4_IOC32_SETRSVSZ: 373 case EXT4_IOC32_SETRSVSZ:
374 cmd = EXT4_IOC_SETRSVSZ; 374 cmd = EXT4_IOC_SETRSVSZ;
375 break; 375 break;
376 case EXT4_IOC_GROUP_ADD: 376 case EXT4_IOC32_GROUP_ADD: {
377 struct compat_ext4_new_group_input __user *uinput;
378 struct ext4_new_group_input input;
379 mm_segment_t old_fs;
380 int err;
381
382 uinput = compat_ptr(arg);
383 err = get_user(input.group, &uinput->group);
384 err |= get_user(input.block_bitmap, &uinput->block_bitmap);
385 err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
386 err |= get_user(input.inode_table, &uinput->inode_table);
387 err |= get_user(input.blocks_count, &uinput->blocks_count);
388 err |= get_user(input.reserved_blocks,
389 &uinput->reserved_blocks);
390 if (err)
391 return -EFAULT;
392 old_fs = get_fs();
393 set_fs(KERNEL_DS);
394 err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
395 (unsigned long) &input);
396 set_fs(old_fs);
397 return err;
398 }
399 case EXT4_IOC_MOVE_EXT:
377 break; 400 break;
378 default: 401 default:
379 return -ENOIOCTLCMD; 402 return -ENOIOCTLCMD;
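The new EXT4_IOC32_GROUP_ADD case copies the 32-bit layout field by field into a native struct and then re-enters the regular ioctl handler with a kernel pointer, using the classic set_fs(KERNEL_DS) trick. Reduced to its shape (sketch):

        mm_segment_t old_fs = get_fs();

        set_fs(KERNEL_DS);      /* let copy_from_user() accept the kernel pointer */
        err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD, (unsigned long) &input);
        set_fs(old_fs);         /* always restore the old segment limit */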
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b423a364dca3..12b3bc026a68 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
658 } 658 }
659} 659}
660 660
661/*
662 * Cache the order of the largest free extent we have available in this block
663 * group.
664 */
665static void
666mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
667{
668 int i;
669 int bits;
670
671 grp->bb_largest_free_order = -1; /* uninit */
672
673 bits = sb->s_blocksize_bits + 1;
674 for (i = bits; i >= 0; i--) {
675 if (grp->bb_counters[i] > 0) {
676 grp->bb_largest_free_order = i;
677 break;
678 }
679 }
680}
681
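Caching the largest free-extent order lets the allocator reject a block group in O(1) instead of rescanning bb_counters[]; conceptually the scan loop gains a check of this shape (hedged sketch; the actual criteria live in the group-selection logic later in this patch):

        /* can this group possibly hold a free extent of order `order'? */
        if (grp->bb_largest_free_order < order)
                continue;       /* skip the group without touching its buddy bitmap */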
661static noinline_for_stack 682static noinline_for_stack
662void ext4_mb_generate_buddy(struct super_block *sb, 683void ext4_mb_generate_buddy(struct super_block *sb,
663 void *buddy, void *bitmap, ext4_group_t group) 684 void *buddy, void *bitmap, ext4_group_t group)
@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
700 */ 721 */
701 grp->bb_free = free; 722 grp->bb_free = free;
702 } 723 }
724 mb_set_largest_free_order(sb, grp);
703 725
704 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 726 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
705 727
@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
  * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
  * So it can have information regarding groups_per_page which
  * is blocks_per_page/2
+ *
+ * Locking note: This routine takes the block group lock of all groups
+ * for this page; do not hold this lock when calling this routine!
  */

 static int ext4_mb_init_cache(struct page *page, char *incore)
@@ -865,6 +890,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			BUG_ON(incore == NULL);
 			mb_debug(1, "put buddy for group %u in page %lu/%x\n",
 				group, page->index, i * blocksize);
+			trace_ext4_mb_buddy_bitmap_load(sb, group);
 			grinfo = ext4_get_group_info(sb, group);
 			grinfo->bb_fragments = 0;
 			memset(grinfo->bb_counters, 0,
@@ -882,6 +908,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			BUG_ON(incore != NULL);
 			mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
 				group, page->index, i * blocksize);
+			trace_ext4_mb_bitmap_load(sb, group);

 			/* see comments in ext4_mb_put_pa() */
 			ext4_lock_group(sb, group);
@@ -910,6 +937,11 @@ out:
 	return err;
 }

+/*
+ * Locking note: This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
 static noinline_for_stack
 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 {
@@ -1004,6 +1036,11 @@ err:
 	return ret;
 }

+/*
+ * Locking note: This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
 static noinline_for_stack int
 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 					struct ext4_buddy *e4b)
@@ -1150,7 +1187,7 @@ err:
 	return ret;
 }

-static void ext4_mb_release_desc(struct ext4_buddy *e4b)
+static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
 {
 	if (e4b->bd_bitmap_page)
 		page_cache_release(e4b->bd_bitmap_page);
@@ -1299,6 +1336,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 			buddy = buddy2;
 		} while (1);
 	}
+	mb_set_largest_free_order(sb, e4b->bd_info);
 	mb_check_buddy(e4b);
 }

@@ -1427,6 +1465,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
 		e4b->bd_info->bb_counters[ord]++;
 		e4b->bd_info->bb_counters[ord]++;
 	}
+	mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);

 	mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
 	mb_check_buddy(e4b);
@@ -1617,7 +1656,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
 	}

 	ext4_unlock_group(ac->ac_sb, group);
-	ext4_mb_release_desc(e4b);
+	ext4_mb_unload_buddy(e4b);

 	return 0;
 }
@@ -1672,7 +1711,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
 		ext4_mb_use_best_found(ac, e4b);
 	}
 	ext4_unlock_group(ac->ac_sb, group);
-	ext4_mb_release_desc(e4b);
+	ext4_mb_unload_buddy(e4b);

 	return 0;
 }
@@ -1821,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
 	}
 }

+/* This is now called BEFORE we load the buddy bitmap. */
 static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 				ext4_group_t group, int cr)
 {
 	unsigned free, fragments;
-	unsigned i, bits;
 	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
 	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);

 	BUG_ON(cr < 0 || cr >= 4);
-	BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
+
+	/* We only do this if the grp has never been initialized */
+	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+		int ret = ext4_mb_init_group(ac->ac_sb, group);
+		if (ret)
+			return 0;
+	}

 	free = grp->bb_free;
 	fragments = grp->bb_fragments;
@@ -1843,17 +1888,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	case 0:
 		BUG_ON(ac->ac_2order == 0);

+		if (grp->bb_largest_free_order < ac->ac_2order)
+			return 0;
+
 		/* Avoid using the first bg of a flexgroup for data files */
 		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
 		    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
 		    ((group % flex_size) == 0))
 			return 0;

-		bits = ac->ac_sb->s_blocksize_bits + 1;
-		for (i = ac->ac_2order; i <= bits; i++)
-			if (grp->bb_counters[i] > 0)
-				return 1;
-		break;
+		return 1;
 	case 1:
 		if ((free / fragments) >= ac->ac_g_ex.fe_len)
 			return 1;
@@ -1964,7 +2008,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	sbi = EXT4_SB(sb);
 	ngroups = ext4_get_groups_count(sb);
 	/* non-extent files are limited to low blocks/groups */
-	if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
+	if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
 		ngroups = sbi->s_blockfile_groups;

 	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
@@ -2024,15 +2068,11 @@ repeat:
 		group = ac->ac_g_ex.fe_group;

 		for (i = 0; i < ngroups; group++, i++) {
-			struct ext4_group_info *grp;
-			struct ext4_group_desc *desc;
-
 			if (group == ngroups)
 				group = 0;

-			/* quick check to skip empty groups */
-			grp = ext4_get_group_info(sb, group);
-			if (grp->bb_free == 0)
+			/* This now checks without needing the buddy page */
+			if (!ext4_mb_good_group(ac, group, cr))
 				continue;

 			err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2040,15 +2080,18 @@ repeat:
 				goto out;

 			ext4_lock_group(sb, group);
+
+			/*
+			 * We need to check again after locking the
+			 * block group
+			 */
 			if (!ext4_mb_good_group(ac, group, cr)) {
-				/* someone did allocation from this group */
 				ext4_unlock_group(sb, group);
-				ext4_mb_release_desc(&e4b);
+				ext4_mb_unload_buddy(&e4b);
 				continue;
 			}

 			ac->ac_groups_scanned++;
-			desc = ext4_get_group_desc(sb, group, NULL);
 			if (cr == 0)
 				ext4_mb_simple_scan_group(ac, &e4b);
 			else if (cr == 1 &&
@@ -2058,7 +2101,7 @@ repeat:
 				ext4_mb_complex_scan_group(ac, &e4b);

 			ext4_unlock_group(sb, group);
-			ext4_mb_release_desc(&e4b);
+			ext4_mb_unload_buddy(&e4b);

 			if (ac->ac_status != AC_STATUS_CONTINUE)
 				break;
@@ -2148,7 +2191,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 	ext4_lock_group(sb, group);
 	memcpy(&sg, ext4_get_group_info(sb, group), i);
 	ext4_unlock_group(sb, group);
-	ext4_mb_release_desc(&e4b);
+	ext4_mb_unload_buddy(&e4b);

 	seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
 			sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2255,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
 	init_rwsem(&meta_group_info[i]->alloc_sem);
 	meta_group_info[i]->bb_free_root = RB_ROOT;
+	meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */

 #ifdef DOUBLE_CHECK
 	{
@@ -2536,6 +2580,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 			 entry->count, entry->group, entry);

 		if (test_opt(sb, DISCARD)) {
+			int ret;
 			ext4_fsblk_t discard_block;

 			discard_block = entry->start_blk +
@@ -2543,7 +2588,12 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 			trace_ext4_discard_blocks(sb,
 					(unsigned long long)discard_block,
 					entry->count);
-			sb_issue_discard(sb, discard_block, entry->count);
+			ret = sb_issue_discard(sb, discard_block, entry->count);
+			if (ret == EOPNOTSUPP) {
+				ext4_warning(sb,
+					"discard not supported, disabling");
+				clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
+			}
 		}

 		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2568,7 +2618,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 		}
 		ext4_unlock_group(sb, entry->group);
 		kmem_cache_free(ext4_free_ext_cachep, entry);
-		ext4_mb_release_desc(&e4b);
+		ext4_mb_unload_buddy(&e4b);
 	}

 	mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
@@ -2641,7 +2691,7 @@ int __init init_ext4_mballoc(void)

 void exit_ext4_mballoc(void)
 {
 	/*
 	 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
 	 * before destroying the slab cache.
 	 */
@@ -2981,7 +3031,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
 	if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
 		atomic_inc(&sbi->s_bal_reqs);
 		atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
-		if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len)
+		if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
 			atomic_inc(&sbi->s_bal_success);
 		atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
 		if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
@@ -3123,7 +3173,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 			continue;

 		/* non-extent files can't have physical blocks past 2^32 */
-		if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
+		if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
 		    pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
 			continue;

@@ -3280,7 +3330,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
 	spin_unlock(&pa->pa_lock);

 	grp_blk = pa->pa_pstart;
 	/*
 	 * If doing group-based preallocation, pa_pstart may be in the
 	 * next group when pa is used up
 	 */
@@ -3697,7 +3747,7 @@ out:
 	ext4_unlock_group(sb, group);
 	if (ac)
 		kmem_cache_free(ext4_ac_cachep, ac);
-	ext4_mb_release_desc(&e4b);
+	ext4_mb_unload_buddy(&e4b);
 	put_bh(bitmap_bh);
 	return free;
 }
@@ -3801,7 +3851,7 @@ repeat:
 		if (bitmap_bh == NULL) {
 			ext4_error(sb, "Error reading block bitmap for %u",
 				   group);
-			ext4_mb_release_desc(&e4b);
+			ext4_mb_unload_buddy(&e4b);
 			continue;
 		}

@@ -3810,7 +3860,7 @@ repeat:
 			ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
 		ext4_unlock_group(sb, group);

-		ext4_mb_release_desc(&e4b);
+		ext4_mb_unload_buddy(&e4b);
 		put_bh(bitmap_bh);

 		list_del(&pa->u.pa_tmp_list);
@@ -4074,7 +4124,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 		ext4_mb_release_group_pa(&e4b, pa, ac);
 		ext4_unlock_group(sb, group);

-		ext4_mb_release_desc(&e4b);
+		ext4_mb_unload_buddy(&e4b);
 		list_del(&pa->u.pa_tmp_list);
 		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
 	}
@@ -4484,12 +4534,12 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			if (!bh)
 				tbh = sb_find_get_block(inode->i_sb,
 							block + i);
 			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
 				    inode, tbh, block + i);
 		}
 	}

 	/*
 	 * We need to make sure we don't reuse the freed block until
 	 * after the transaction is committed, which we can do by
 	 * treating the block as metadata, below.  We make an
@@ -4610,7 +4660,7 @@ do_more:
 		atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
 	}

-	ext4_mb_release_desc(&e4b);
+	ext4_mb_unload_buddy(&e4b);

 	freed += count;

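The recurring mballoc change above is the new per-group cache bb_largest_free_order, which lets ext4_mb_good_group() reject a group for a power-of-two request in O(1) instead of scanning bb_counters[] each time, and which in turn makes it safe to call the check before the buddy bitmap is loaded. A standalone model of the idea (simplified, not the kernel code; bb_counters[i] counts free extents of order i, i.e. 2^i blocks):

#define MAX_ORDER_SLOTS 16

struct group_info {
	int bb_counters[MAX_ORDER_SLOTS];	/* free extents per order */
	int bb_largest_free_order;		/* cached; -1 = none/uninit */
};

/* Recompute the cache whenever the counters change
 * (cf. mb_set_largest_free_order above). */
static void set_largest_free_order(struct group_info *grp, int max_order)
{
	int i;

	grp->bb_largest_free_order = -1;
	for (i = max_order; i >= 0; i--) {
		if (grp->bb_counters[i] > 0) {
			grp->bb_largest_free_order = i;
			break;
		}
	}
}

/* O(1) rejection test while scanning groups for a 2^order allocation. */
static int group_can_satisfy(const struct group_info *grp, int order)
{
	return grp->bb_largest_free_order >= order;
}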
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 34dcfc52ef44..6f3a27ec30bf 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode)
 	 */
 	if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
 				       EXT4_FEATURE_INCOMPAT_EXTENTS) ||
-	    (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+	    (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EINVAL;

 	if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
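This hunk is one of many in the merge that convert open-coded tests of EXT4_I(inode)->i_flags into ext4_test_inode_flag() and friends, indexed by bit number (EXT4_INODE_EXTENTS) rather than by mask (EXT4_EXTENTS_FL). The helpers themselves are outside this diff; presumably they are thin wrappers over the generic bitops, roughly along these lines (a sketch, not the actual ext4 header code):

/* Hedged sketch of bit-numbered inode flag helpers. */
static inline int ext4_test_inode_flag(struct inode *inode, int bit)
{
	return test_bit(bit, &EXT4_I(inode)->i_flags);
}

static inline void ext4_set_inode_flag(struct inode *inode, int bit)
{
	set_bit(bit, &EXT4_I(inode)->i_flags);
}

static inline void ext4_clear_inode_flag(struct inode *inode, int bit)
{
	clear_bit(bit, &EXT4_I(inode)->i_flags);
}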
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index d1fc662cc311..52abfa12762a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -482,6 +482,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 	int depth = ext_depth(orig_inode);
 	int ret;

+	start_ext.ee_block = end_ext.ee_block = 0;
 	o_start = o_end = oext = orig_path[depth].p_ext;
 	oext_alen = ext4_ext_get_actual_len(oext);
 	start_ext.ee_len = end_ext.ee_len = 0;
@@ -529,7 +530,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 	 *   new_ext       |-------|
 	 */
 	if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
-		ext4_error(orig_inode->i_sb,
+		EXT4_ERROR_INODE(orig_inode,
 			"new_ext_end(%u) should be less than or equal to "
 			"oext->ee_block(%u) + oext_alen(%d) - 1",
 			new_ext_end, le32_to_cpu(oext->ee_block),
@@ -692,12 +693,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 	while (1) {
 		/* The extent for donor must be found. */
 		if (!dext) {
-			ext4_error(donor_inode->i_sb,
+			EXT4_ERROR_INODE(donor_inode,
 				"The extent for donor must be found");
 			*err = -EIO;
 			goto out;
 		} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
-			ext4_error(donor_inode->i_sb,
+			EXT4_ERROR_INODE(donor_inode,
 				"Donor offset(%u) and the first block of donor "
 				"extent(%u) should be equal",
 				donor_off,
@@ -959,6 +960,9 @@ mext_check_arguments(struct inode *orig_inode,
 		return -EINVAL;
 	}

+	if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
+		return -EPERM;
+
 	/* Ext4 move extent does not support swapfile */
 	if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
 		ext4_debug("ext4 move extent: The argument files should "
@@ -976,11 +980,11 @@ mext_check_arguments(struct inode *orig_inode,
 	}

 	/* Ext4 move extent supports only extent based file */
-	if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
+	if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
 		ext4_debug("ext4 move extent: orig file is not extents "
 			"based file [ino:orig %lu]\n", orig_inode->i_ino);
 		return -EOPNOTSUPP;
-	} else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) {
+	} else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
 		ext4_debug("ext4 move extent: donor file is not extents "
 			"based file [ino:donor %lu]\n", donor_inode->i_ino);
 		return -EOPNOTSUPP;
@@ -1354,7 +1358,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		if (ret1 < 0)
 			break;
 		if (*moved_len > len) {
-			ext4_error(orig_inode->i_sb,
+			EXT4_ERROR_INODE(orig_inode,
 				"We replaced blocks too much! "
 				"sum of replaced: %llu requested: %llu",
 				*moved_len, len);
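The other recurring conversion, here and in namei.c and xattr.c below, replaces ext4_error(sb, "inode %lu: ...", inode->i_ino, ...) with EXT4_ERROR_INODE(inode, ...), which folds the inode number into the report automatically. The macro's definition is not part of this diff; it is presumably a small forwarding wrapper of this shape (a sketch, assuming an ext4_error_inode() helper):

/* Sketch only: the real definition lives in ext4.h, outside this diff. */
#define EXT4_ERROR_INODE(inode, fmt, a...) \
	ext4_error_inode(__func__, (inode), (fmt), ## a)

extern void ext4_error_inode(const char *function, struct inode *inode,
			     const char *fmt, ...);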
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 0c070fabd108..a43e6617b351 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -187,7 +187,7 @@ unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
 		return blocksize;
 	return (len & 65532) | ((len & 3) << 16);
 }

 __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
 {
 	if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
@@ -197,7 +197,7 @@ __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
 	if (len == blocksize) {
 		if (blocksize == 65536)
 			return cpu_to_le16(EXT4_MAX_REC_LEN);
 		else
 			return cpu_to_le16(0);
 	}
 	return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
@@ -349,7 +349,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
 		brelse(bh);
 	}
 	if (bcount)
 		printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
 		       levels ? "" : " ", names, space/bcount,
 		       (space/bcount)*100/blocksize);
 	return (struct stats) { names, space, bcount};
@@ -653,10 +653,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 	int ret, err;
 	__u32 hashval;

 	dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
 		       start_hash, start_minor_hash));
 	dir = dir_file->f_path.dentry->d_inode;
-	if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
+	if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
 		hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
 		if (hinfo.hash_version <= DX_HASH_TEA)
 			hinfo.hash_version +=
@@ -801,7 +801,7 @@ static void ext4_update_dx_flag(struct inode *inode)
 {
 	if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
 				     EXT4_FEATURE_COMPAT_DIR_INDEX))
-		EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL;
+		ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
 }

 /*
@@ -943,8 +943,8 @@ restart:
 			wait_on_buffer(bh);
 			if (!buffer_uptodate(bh)) {
 				/* read error, skip block & hope for the best */
-				ext4_error(sb, "reading directory #%lu offset %lu",
-					   dir->i_ino, (unsigned long)block);
+				EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
+						 (unsigned long) block);
 				brelse(bh);
 				goto next;
 			}
@@ -1066,15 +1066,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
 		__u32 ino = le32_to_cpu(de->inode);
 		brelse(bh);
 		if (!ext4_valid_inum(dir->i_sb, ino)) {
-			ext4_error(dir->i_sb, "bad inode number: %u", ino);
+			EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
 			return ERR_PTR(-EIO);
 		}
 		inode = ext4_iget(dir->i_sb, ino);
 		if (unlikely(IS_ERR(inode))) {
 			if (PTR_ERR(inode) == -ESTALE) {
-				ext4_error(dir->i_sb,
+				EXT4_ERROR_INODE(dir,
 					"deleted inode referenced: %u",
 					ino);
 				return ERR_PTR(-EIO);
 			} else {
 				return ERR_CAST(inode);
@@ -1104,8 +1104,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
 	brelse(bh);

 	if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
-		ext4_error(child->d_inode->i_sb,
-			   "bad inode number: %u", ino);
+		EXT4_ERROR_INODE(child->d_inode,
+				 "bad parent inode number: %u", ino);
 		return ERR_PTR(-EIO);
 	}

@@ -1141,7 +1141,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
 	unsigned rec_len = 0;

 	while (count--) {
 		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
 						(from + (map->offs<<2));
 		rec_len = EXT4_DIR_REC_LEN(de->name_len);
 		memcpy (to, de, rec_len);
@@ -1404,9 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	de = (struct ext4_dir_entry_2 *)((char *)fde +
 		ext4_rec_len_from_disk(fde->rec_len, blocksize));
 	if ((char *) de >= (((char *) root) + blocksize)) {
-		ext4_error(dir->i_sb,
-			   "invalid rec_len for '..' in inode %lu",
-			   dir->i_ino);
+		EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
 		brelse(bh);
 		return -EIO;
 	}
@@ -1418,7 +1416,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 		brelse(bh);
 		return retval;
 	}
-	EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
+	ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
 	data1 = bh2->b_data;

 	memcpy (data1, de, len);
@@ -1491,7 +1489,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 		retval = ext4_dx_add_entry(handle, dentry, inode);
 		if (!retval || (retval != ERR_BAD_DX_DIR))
 			return retval;
-		EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL;
+		ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
 		dx_fallback++;
 		ext4_mark_inode_dirty(handle, dir);
 	}
@@ -1519,6 +1517,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 	de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
 	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
 	brelse(bh);
+	if (retval == 0)
+		ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
 	return retval;
 }

@@ -1915,9 +1915,8 @@ static int empty_dir(struct inode *inode)
 	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
 	    !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
 		if (err)
-			ext4_error(inode->i_sb,
-				   "error %d reading directory #%lu offset 0",
-				   err, inode->i_ino);
+			EXT4_ERROR_INODE(inode,
+					 "error %d reading directory lblock 0", err);
 		else
 			ext4_warning(inode->i_sb,
 				     "bad directory (dir #%lu) - no data block",
@@ -1941,17 +1940,17 @@ static int empty_dir(struct inode *inode)
 	de = ext4_next_entry(de1, sb->s_blocksize);
 	while (offset < inode->i_size) {
 		if (!bh ||
 		    (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+			unsigned int lblock;
 			err = 0;
 			brelse(bh);
-			bh = ext4_bread(NULL, inode,
-					offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
+			lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+			bh = ext4_bread(NULL, inode, lblock, 0, &err);
 			if (!bh) {
 				if (err)
-					ext4_error(sb,
-						   "error %d reading directory"
-						   " #%lu offset %u",
-						   err, inode->i_ino, offset);
+					EXT4_ERROR_INODE(inode,
+						"error %d reading directory "
+						"lblock %u", err, lblock);
 				offset += sb->s_blocksize;
 				continue;
 			}
@@ -2297,7 +2296,7 @@ retry:
 		}
 	} else {
 		/* clear the extent format for fast symlink */
-		EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
+		ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
 		inode->i_op = &ext4_fast_symlink_inode_operations;
 		memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
 		inode->i_size = l-1;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 5692c48754a0..6df797eb9aeb 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -911,7 +911,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	percpu_counter_add(&sbi->s_freeinodes_counter,
 			   EXT4_INODES_PER_GROUP(sb));

-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+	    sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group;
 		flex_group = ext4_flex_group(sbi, input->group);
 		atomic_add(input->free_blocks_count,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e14d22c170d5..4e8983a9811b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -241,6 +241,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 	if (sb->s_flags & MS_RDONLY)
 		return ERR_PTR(-EROFS);

+	vfs_check_frozen(sb, SB_FREEZE_WRITE);
 	/* Special case here: if the journal has aborted behind our
 	 * backs (eg. EIO in the commit thread), then we still need to
 	 * take the FS itself readonly cleanly. */
@@ -645,6 +646,8 @@ static void ext4_put_super(struct super_block *sb)
 	struct ext4_super_block *es = sbi->s_es;
 	int i, err;

+	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
 	flush_workqueue(sbi->dio_unwritten_wq);
 	destroy_workqueue(sbi->dio_unwritten_wq);

@@ -941,6 +944,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
 	if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
 		seq_puts(seq, ",journal_async_commit");
+	else if (test_opt(sb, JOURNAL_CHECKSUM))
+		seq_puts(seq, ",journal_checksum");
 	if (test_opt(sb, NOBH))
 		seq_puts(seq, ",nobh");
 	if (test_opt(sb, I_VERSION))
@@ -1059,7 +1064,7 @@ static int ext4_release_dquot(struct dquot *dquot);
 static int ext4_mark_dquot_dirty(struct dquot *dquot);
 static int ext4_write_info(struct super_block *sb, int type);
 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
-			 char *path, int remount);
+			 char *path);
 static int ext4_quota_on_mount(struct super_block *sb, int type);
 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
 			       size_t len, loff_t off);
@@ -1081,12 +1086,12 @@ static const struct dquot_operations ext4_quota_operations = {

 static const struct quotactl_ops ext4_qctl_operations = {
 	.quota_on	= ext4_quota_on,
-	.quota_off	= vfs_quota_off,
-	.quota_sync	= vfs_quota_sync,
-	.get_info	= vfs_get_dqinfo,
-	.set_info	= vfs_set_dqinfo,
-	.get_dqblk	= vfs_get_dqblk,
-	.set_dqblk	= vfs_set_dqblk
+	.quota_off	= dquot_quota_off,
+	.quota_sync	= dquot_quota_sync,
+	.get_info	= dquot_get_dqinfo,
+	.set_info	= dquot_set_dqinfo,
+	.get_dqblk	= dquot_get_dqblk,
+	.set_dqblk	= dquot_set_dqblk
 };
 #endif

@@ -2051,7 +2056,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 	/* Turn quotas off */
 	for (i = 0; i < MAXQUOTAS; i++) {
 		if (sb_dqopt(sb)->files[i])
-			vfs_quota_off(sb, i, 0);
+			dquot_quota_off(sb, i);
 	}
 #endif
 	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -2213,7 +2218,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
 struct ext4_attr {
 	struct attribute attr;
 	ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
 	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
 			 const char *, size_t);
 	int offset;
 };
@@ -2430,6 +2435,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	__releases(kernel_lock)
 	__acquires(kernel_lock)
 {
+	char *orig_data = kstrdup(data, GFP_KERNEL);
 	struct buffer_head *bh;
 	struct ext4_super_block *es = NULL;
 	struct ext4_sb_info *sbi;
@@ -2793,24 +2799,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);

-	err = percpu_counter_init(&sbi->s_freeblocks_counter,
-			ext4_count_free_blocks(sb));
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_freeinodes_counter,
-				ext4_count_free_inodes(sb));
-	}
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_dirs_counter,
-				ext4_count_dirs(sb));
-	}
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
-	}
-	if (err) {
-		ext4_msg(sb, KERN_ERR, "insufficient memory");
-		goto failed_mount3;
-	}
-
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 	sbi->s_max_writeback_mb_bump = 128;

@@ -2910,6 +2898,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);

 no_journal:
+	err = percpu_counter_init(&sbi->s_freeblocks_counter,
+				  ext4_count_free_blocks(sb));
+	if (!err)
+		err = percpu_counter_init(&sbi->s_freeinodes_counter,
+					  ext4_count_free_inodes(sb));
+	if (!err)
+		err = percpu_counter_init(&sbi->s_dirs_counter,
+					  ext4_count_dirs(sb));
+	if (!err)
+		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+	if (err) {
+		ext4_msg(sb, KERN_ERR, "insufficient memory");
+		goto failed_mount_wq;
+	}
 	if (test_opt(sb, NOBH)) {
 		if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
 			ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
@@ -3001,7 +3003,7 @@ no_journal:
 	err = ext4_setup_system_zone(sb);
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "failed to initialize system "
-			 "zone (%d)\n", err);
+			 "zone (%d)", err);
 		goto failed_mount4;
 	}

@@ -3040,9 +3042,11 @@ no_journal:
 	} else
 		descr = "out journal";

-	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr);
+	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
+		 "Opts: %s", descr, orig_data);

 	lock_kernel();
+	kfree(orig_data);
 	return 0;

 cantfind_ext4:
@@ -3059,6 +3063,10 @@ failed_mount_wq:
 		jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
 	}
+	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount3:
 	if (sbi->s_flex_groups) {
 		if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3066,10 +3074,6 @@ failed_mount3:
 		else
 			kfree(sbi->s_flex_groups);
 	}
-	percpu_counter_destroy(&sbi->s_freeblocks_counter);
-	percpu_counter_destroy(&sbi->s_freeinodes_counter);
-	percpu_counter_destroy(&sbi->s_dirs_counter);
-	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -3089,6 +3093,7 @@ out_fail:
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	lock_kernel();
+	kfree(orig_data);
 	return ret;
 }

@@ -3380,7 +3385,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 	if (!(sb->s_flags & MS_RDONLY))
 		es->s_wtime = cpu_to_le32(get_seconds());
 	es->s_kbytes_written =
 		cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
 			    ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
 			      EXT4_SB(sb)->s_sectors_written_start) >> 1));
 	ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
@@ -3485,8 +3490,10 @@ int ext4_force_commit(struct super_block *sb)
 		return 0;

 	journal = EXT4_SB(sb)->s_journal;
-	if (journal)
+	if (journal) {
+		vfs_check_frozen(sb, SB_FREEZE_WRITE);
 		ret = ext4_journal_force_commit(journal);
+	}

 	return ret;
 }
@@ -3535,18 +3542,16 @@ static int ext4_freeze(struct super_block *sb)
 	 * the journal.
 	 */
 	error = jbd2_journal_flush(journal);
-	if (error < 0) {
- out:
-		jbd2_journal_unlock_updates(journal);
-		return error;
-	}
+	if (error < 0)
+		goto out;

 	/* Journal blocked and flushed, clear needs_recovery flag. */
 	EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 	error = ext4_commit_super(sb, 1);
-	if (error)
-		goto out;
-	return 0;
+out:
+	/* we rely on s_frozen to stop further updates */
+	jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+	return error;
 }

 /*
@@ -3563,7 +3568,6 @@ static int ext4_unfreeze(struct super_block *sb)
 	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 	ext4_commit_super(sb, 1);
 	unlock_super(sb);
-	jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
 	return 0;
 }

@@ -3574,12 +3578,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	ext4_fsblk_t n_blocks_count = 0;
 	unsigned long old_sb_flags;
 	struct ext4_mount_options old_opts;
+	int enable_quota = 0;
 	ext4_group_t g;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
 	int err;
 #ifdef CONFIG_QUOTA
 	int i;
 #endif
+	char *orig_data = kstrdup(data, GFP_KERNEL);

 	lock_kernel();

@@ -3630,6 +3636,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	}

 	if (*flags & MS_RDONLY) {
+		err = dquot_suspend(sb, -1);
+		if (err < 0)
+			goto restore_opts;
+
 		/*
 		 * First of all, the unconditional stuff we have to do
 		 * to disable replay of the journal when we next remount
@@ -3698,6 +3708,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 				goto restore_opts;
 			if (!ext4_setup_super(sb, es, 0))
 				sb->s_flags &= ~MS_RDONLY;
+			enable_quota = 1;
 		}
 	}
 	ext4_setup_system_zone(sb);
@@ -3713,6 +3724,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 #endif
 	unlock_super(sb);
 	unlock_kernel();
+	if (enable_quota)
+		dquot_resume(sb, -1);
+
+	ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
+	kfree(orig_data);
 	return 0;

 restore_opts:
@@ -3734,6 +3750,7 @@ restore_opts:
 #endif
 	unlock_super(sb);
 	unlock_kernel();
+	kfree(orig_data);
 	return err;
 }

@@ -3906,24 +3923,21 @@ static int ext4_write_info(struct super_block *sb, int type)
  */
 static int ext4_quota_on_mount(struct super_block *sb, int type)
 {
-	return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
+	return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
 				  EXT4_SB(sb)->s_jquota_fmt, type);
 }

 /*
  * Standard function to be called on quota_on
  */
 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
-			 char *name, int remount)
+			 char *name)
 {
 	int err;
 	struct path path;

 	if (!test_opt(sb, QUOTA))
 		return -EINVAL;
-	/* When remounting, no checks are needed and in fact, name is NULL */
-	if (remount)
-		return vfs_quota_on(sb, type, format_id, name, remount);

 	err = kern_path(name, LOOKUP_FOLLOW, &path);
 	if (err)
@@ -3962,7 +3976,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 		}
 	}

-	err = vfs_quota_on_path(sb, type, format_id, &path);
+	err = dquot_quota_on_path(sb, type, format_id, &path);
 	path_put(&path);
 	return err;
 }
@@ -4141,6 +4155,7 @@ static int __init init_ext4_fs(void)
 {
 	int err;

+	ext4_check_flag_values();
 	err = init_ext4_system_zone();
 	if (err)
 		return err;
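Taken together, the super.c freeze changes make ext4_freeze() always drop the journal-updates lock (the old code leaked it on the ext4_commit_super() error path and otherwise held it for the whole frozen window), and instead fence writers with sb->s_frozen via the vfs_check_frozen() calls added to ext4_journal_start_sb() and ext4_force_commit(). Roughly, the write side now cooperates like this (a sketch using the 2.6.34-era APIs, not code from the patch):

/* Sketch: a write path blocking on a frozen filesystem before starting
 * a journal handle, so ext4_freeze() need not keep journal updates locked. */
handle_t *journal_start_sketch(struct super_block *sb, int nblocks)
{
	/* Sleeps while sb->s_frozen >= SB_FREEZE_WRITE; new handles can
	 * only start once the filesystem has been thawed. */
	vfs_check_frozen(sb, SB_FREEZE_WRITE);
	return jbd2_journal_start(EXT4_SB(sb)->s_journal, nblocks);
}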
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 00740cb32be3..ed9354aff279 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
+	.setattr	= ext4_setattr,
 #ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
 const struct inode_operations ext4_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= ext4_follow_link,
+	.setattr	= ext4_setattr,
 #ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 2de0e9515089..04338009793a 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -228,9 +228,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext4_xattr_check_block(bh)) {
 bad_block:
-		ext4_error(inode->i_sb,
-			   "inode %lu: bad block %llu", inode->i_ino,
-			   EXT4_I(inode)->i_file_acl);
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
 	}
@@ -372,9 +371,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext4_xattr_check_block(bh)) {
-		ext4_error(inode->i_sb,
-			   "inode %lu: bad block %llu", inode->i_ino,
-			   EXT4_I(inode)->i_file_acl);
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
 	}
@@ -666,8 +664,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
 			atomic_read(&(bs->bh->b_count)),
 			le32_to_cpu(BHDR(bs->bh)->h_refcount));
 		if (ext4_xattr_check_block(bs->bh)) {
-			ext4_error(sb, "inode %lu: bad block %llu",
-				   inode->i_ino, EXT4_I(inode)->i_file_acl);
+			EXT4_ERROR_INODE(inode, "bad block %llu",
+					 EXT4_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
 		}
@@ -820,7 +818,7 @@ inserted:
 				EXT4_I(inode)->i_block_group);

 			/* non-extent files can't have physical blocks past 2^32 */
-			if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+			if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 				goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;

 			block = ext4_new_meta_blocks(handle, inode,
@@ -828,7 +826,7 @@ inserted:
 			if (error)
 				goto cleanup;

-			if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+			if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 				BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);

 			ea_idebug(inode, "creating block %d", block);
@@ -880,8 +878,8 @@ cleanup_dquot:
 	goto cleanup;

 bad_block:
-	ext4_error(inode->i_sb, "inode %lu: bad block %llu",
-		   inode->i_ino, EXT4_I(inode)->i_file_acl);
+	EXT4_ERROR_INODE(inode, "bad block %llu",
+			 EXT4_I(inode)->i_file_acl);
 	goto cleanup;

 #undef header
@@ -1194,8 +1192,8 @@ retry:
 		if (!bh)
 			goto cleanup;
 		if (ext4_xattr_check_block(bh)) {
-			ext4_error(inode->i_sb, "inode %lu: bad block %llu",
-				   inode->i_ino, EXT4_I(inode)->i_file_acl);
+			EXT4_ERROR_INODE(inode, "bad block %llu",
+					 EXT4_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
 		}
@@ -1372,14 +1370,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
 		goto cleanup;
 	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
 	if (!bh) {
-		ext4_error(inode->i_sb, "inode %lu: block %llu read error",
-			   inode->i_ino, EXT4_I(inode)->i_file_acl);
+		EXT4_ERROR_INODE(inode, "block %llu read error",
+				 EXT4_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
 	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
-		ext4_error(inode->i_sb, "inode %lu: bad block %llu",
-			   inode->i_ino, EXT4_I(inode)->i_file_acl);
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	ext4_xattr_release_block(handle, inode, bh);
@@ -1504,9 +1502,8 @@ again:
 		}
 		bh = sb_bread(inode->i_sb, ce->e_block);
 		if (!bh) {
-			ext4_error(inode->i_sb,
-				   "inode %lu: block %lu read error",
-				   inode->i_ino, (unsigned long) ce->e_block);
+			EXT4_ERROR_INODE(inode, "block %lu read error",
+					 (unsigned long) ce->e_block);
 		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
 				EXT4_XATTR_REFCOUNT_MAX) {
 			ea_idebug(inode, "block %lu refcount %d>=%d",
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 53dba57b49a1..27ac25725954 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -306,11 +306,11 @@ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
 extern const struct file_operations fat_file_operations;
 extern const struct inode_operations fat_file_inode_operations;
 extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
-extern void fat_truncate(struct inode *inode);
+extern int fat_setsize(struct inode *inode, loff_t offset);
+extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
 extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		       struct kstat *stat);
-extern int fat_file_fsync(struct file *file, struct dentry *dentry,
-			  int datasync);
+extern int fat_file_fsync(struct file *file, int datasync);

 /* fat/inode.c */
 extern void fat_attach(struct inode *inode, loff_t i_pos);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index a14c2f6a489e..990dfae022e5 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -149,12 +149,12 @@ static int fat_file_release(struct inode *inode, struct file *filp)
149 return 0; 149 return 0;
150} 150}
151 151
152int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync) 152int fat_file_fsync(struct file *filp, int datasync)
153{ 153{
154 struct inode *inode = dentry->d_inode; 154 struct inode *inode = filp->f_mapping->host;
155 int res, err; 155 int res, err;
156 156
157 res = simple_fsync(filp, dentry, datasync); 157 res = generic_file_fsync(filp, datasync);
158 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); 158 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
159 159
160 return res ? res : err; 160 return res ? res : err;
@@ -283,7 +283,7 @@ static int fat_free(struct inode *inode, int skip)
283 return fat_free_clusters(inode, free_start); 283 return fat_free_clusters(inode, free_start);
284} 284}
285 285
286void fat_truncate(struct inode *inode) 286void fat_truncate_blocks(struct inode *inode, loff_t offset)
287{ 287{
288 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); 288 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
289 const unsigned int cluster_size = sbi->cluster_size; 289 const unsigned int cluster_size = sbi->cluster_size;
@@ -293,10 +293,10 @@ void fat_truncate(struct inode *inode)
 293 * This protects against truncating a file bigger than it was, then 293 * This protects against truncating a file bigger than it was, then
294 * trying to write into the hole. 294 * trying to write into the hole.
295 */ 295 */
296 if (MSDOS_I(inode)->mmu_private > inode->i_size) 296 if (MSDOS_I(inode)->mmu_private > offset)
297 MSDOS_I(inode)->mmu_private = inode->i_size; 297 MSDOS_I(inode)->mmu_private = offset;
298 298
299 nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits; 299 nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits;
300 300
301 fat_free(inode, nr_clusters); 301 fat_free(inode, nr_clusters);
302 fat_flush_inodes(inode->i_sb, inode, NULL); 302 fat_flush_inodes(inode->i_sb, inode, NULL);
@@ -364,6 +364,18 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
364 return 0; 364 return 0;
365} 365}
366 366
367int fat_setsize(struct inode *inode, loff_t offset)
368{
369 int error;
370
371 error = simple_setsize(inode, offset);
372 if (error)
373 return error;
374 fat_truncate_blocks(inode, offset);
375
376 return error;
377}
378
367#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) 379#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
368/* valid file mode bits */ 380/* valid file mode bits */
369#define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO) 381#define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO)
@@ -378,7 +390,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
378 /* 390 /*
379 * Expand the file. Since inode_setattr() updates ->i_size 391 * Expand the file. Since inode_setattr() updates ->i_size
380 * before calling the ->truncate(), but FAT needs to fill the 392 * before calling the ->truncate(), but FAT needs to fill the
381 * hole before it. 393 * hole before it. XXX: this is no longer true with new truncate
394 * sequence.
382 */ 395 */
383 if (attr->ia_valid & ATTR_SIZE) { 396 if (attr->ia_valid & ATTR_SIZE) {
384 if (attr->ia_size > inode->i_size) { 397 if (attr->ia_size > inode->i_size) {
@@ -427,15 +440,20 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
427 attr->ia_valid &= ~ATTR_MODE; 440 attr->ia_valid &= ~ATTR_MODE;
428 } 441 }
429 442
430 if (attr->ia_valid) 443 if (attr->ia_valid & ATTR_SIZE) {
431 error = inode_setattr(inode, attr); 444 error = fat_setsize(inode, attr->ia_size);
445 if (error)
446 goto out;
447 }
448
449 generic_setattr(inode, attr);
450 mark_inode_dirty(inode);
432out: 451out:
433 return error; 452 return error;
434} 453}
435EXPORT_SYMBOL_GPL(fat_setattr); 454EXPORT_SYMBOL_GPL(fat_setattr);
436 455
437const struct inode_operations fat_file_inode_operations = { 456const struct inode_operations fat_file_inode_operations = {
438 .truncate = fat_truncate,
439 .setattr = fat_setattr, 457 .setattr = fat_setattr,
440 .getattr = fat_getattr, 458 .getattr = fat_getattr,
441}; 459};
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index ed33904926ee..7bf45aee56d7 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -142,14 +142,29 @@ static int fat_readpages(struct file *file, struct address_space *mapping,
142 return mpage_readpages(mapping, pages, nr_pages, fat_get_block); 142 return mpage_readpages(mapping, pages, nr_pages, fat_get_block);
143} 143}
144 144
145static void fat_write_failed(struct address_space *mapping, loff_t to)
146{
147 struct inode *inode = mapping->host;
148
149 if (to > inode->i_size) {
150 truncate_pagecache(inode, to, inode->i_size);
151 fat_truncate_blocks(inode, inode->i_size);
152 }
153}
154
145static int fat_write_begin(struct file *file, struct address_space *mapping, 155static int fat_write_begin(struct file *file, struct address_space *mapping,
146 loff_t pos, unsigned len, unsigned flags, 156 loff_t pos, unsigned len, unsigned flags,
147 struct page **pagep, void **fsdata) 157 struct page **pagep, void **fsdata)
148{ 158{
159 int err;
160
149 *pagep = NULL; 161 *pagep = NULL;
150 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 162 err = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
151 fat_get_block, 163 pagep, fsdata, fat_get_block,
152 &MSDOS_I(mapping->host)->mmu_private); 164 &MSDOS_I(mapping->host)->mmu_private);
165 if (err < 0)
166 fat_write_failed(mapping, pos + len);
167 return err;
153} 168}
154 169
155static int fat_write_end(struct file *file, struct address_space *mapping, 170static int fat_write_end(struct file *file, struct address_space *mapping,
@@ -159,6 +174,8 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
159 struct inode *inode = mapping->host; 174 struct inode *inode = mapping->host;
160 int err; 175 int err;
161 err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); 176 err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
177 if (err < len)
178 fat_write_failed(mapping, pos + len);
162 if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { 179 if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
163 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 180 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
164 MSDOS_I(inode)->i_attrs |= ATTR_ARCH; 181 MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
@@ -172,7 +189,9 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
172 loff_t offset, unsigned long nr_segs) 189 loff_t offset, unsigned long nr_segs)
173{ 190{
174 struct file *file = iocb->ki_filp; 191 struct file *file = iocb->ki_filp;
175 struct inode *inode = file->f_mapping->host; 192 struct address_space *mapping = file->f_mapping;
193 struct inode *inode = mapping->host;
194 ssize_t ret;
176 195
177 if (rw == WRITE) { 196 if (rw == WRITE) {
178 /* 197 /*
@@ -193,8 +212,12 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
 193 * FAT needs to use DIO_LOCKING to avoid the race 212 * FAT needs to use DIO_LOCKING to avoid the race
194 * condition of fat_get_block() and ->truncate(). 213 * condition of fat_get_block() and ->truncate().
195 */ 214 */
196 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 215 ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
197 offset, nr_segs, fat_get_block, NULL); 216 iov, offset, nr_segs, fat_get_block, NULL);
217 if (ret < 0 && (rw & WRITE))
218 fat_write_failed(mapping, offset + iov_length(iov, nr_segs));
219
220 return ret;
198} 221}
199 222
200static sector_t _fat_bmap(struct address_space *mapping, sector_t block) 223static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
@@ -429,7 +452,7 @@ static void fat_delete_inode(struct inode *inode)
429{ 452{
430 truncate_inode_pages(&inode->i_data, 0); 453 truncate_inode_pages(&inode->i_data, 0);
431 inode->i_size = 0; 454 inode->i_size = 0;
432 fat_truncate(inode); 455 fat_truncate_blocks(inode, 0);
433 clear_inode(inode); 456 clear_inode(inode);
434} 457}
435 458
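Note: taken together, the FAT hunks above implement the new truncate sequence, where simple_setsize() first updates i_size and truncates the page cache, and only then are on-disk clusters released via fat_truncate_blocks(); failed writes are unwound through fat_write_failed(). A minimal sketch of the same ordering in a hypothetical filesystem's ->setattr() (the demo_* names are illustrative, not part of the patch):

static int demo_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if (attr->ia_valid & ATTR_SIZE) {
		/* step 1: update i_size and truncate the page cache */
		error = simple_setsize(inode, attr->ia_size);
		if (error)
			return error;
		/* step 2: only now free the on-disk blocks */
		demo_truncate_blocks(inode, attr->ia_size);
	}

	generic_setattr(inode, attr);	/* copy the remaining attributes */
	mark_inode_dirty(inode);
	return 0;
}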
diff --git a/fs/fcntl.c b/fs/fcntl.c
index f74d270ba155..51e11bf5708f 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -274,7 +274,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg)
274 274
275 ret = copy_from_user(&owner, owner_p, sizeof(owner)); 275 ret = copy_from_user(&owner, owner_p, sizeof(owner));
276 if (ret) 276 if (ret)
277 return ret; 277 return -EFAULT;
278 278
279 switch (owner.type) { 279 switch (owner.type) {
280 case F_OWNER_TID: 280 case F_OWNER_TID:
@@ -332,8 +332,11 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
332 } 332 }
333 read_unlock(&filp->f_owner.lock); 333 read_unlock(&filp->f_owner.lock);
334 334
335 if (!ret) 335 if (!ret) {
336 ret = copy_to_user(owner_p, &owner, sizeof(owner)); 336 ret = copy_to_user(owner_p, &owner, sizeof(owner));
337 if (ret)
338 ret = -EFAULT;
339 }
337 return ret; 340 return ret;
338} 341}
339 342
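Note: the fcntl.c fix matters because copy_from_user() and copy_to_user() return the number of bytes left uncopied, not an errno, so the raw return value must never reach userspace. The corrected pattern in isolation (owner_p as in f_setown_ex() above):

	struct f_owner_ex owner;

	if (copy_from_user(&owner, owner_p, sizeof(owner)))
		return -EFAULT;		/* nonzero == bytes not copied */
	/* ... modify owner ... */
	if (copy_to_user(owner_p, &owner, sizeof(owner)))
		return -EFAULT;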
diff --git a/fs/file_table.c b/fs/file_table.c
index 32d12b78bac8..5c7d10ead4ad 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -194,14 +194,6 @@ struct file *alloc_file(struct path *path, fmode_t mode,
194} 194}
195EXPORT_SYMBOL(alloc_file); 195EXPORT_SYMBOL(alloc_file);
196 196
197void fput(struct file *file)
198{
199 if (atomic_long_dec_and_test(&file->f_count))
200 __fput(file);
201}
202
203EXPORT_SYMBOL(fput);
204
205/** 197/**
206 * drop_file_write_access - give up ability to write to a file 198 * drop_file_write_access - give up ability to write to a file
207 * @file: the file to which we will stop writing 199 * @file: the file to which we will stop writing
@@ -227,10 +219,9 @@ void drop_file_write_access(struct file *file)
227} 219}
228EXPORT_SYMBOL_GPL(drop_file_write_access); 220EXPORT_SYMBOL_GPL(drop_file_write_access);
229 221
230/* __fput is called from task context when aio completion releases the last 222/* the real guts of fput() - releasing the last reference to file
231 * last use of a struct file *. Do not use otherwise.
232 */ 223 */
233void __fput(struct file *file) 224static void __fput(struct file *file)
234{ 225{
235 struct dentry *dentry = file->f_path.dentry; 226 struct dentry *dentry = file->f_path.dentry;
236 struct vfsmount *mnt = file->f_path.mnt; 227 struct vfsmount *mnt = file->f_path.mnt;
@@ -268,6 +259,14 @@ void __fput(struct file *file)
268 mntput(mnt); 259 mntput(mnt);
269} 260}
270 261
262void fput(struct file *file)
263{
264 if (atomic_long_dec_and_test(&file->f_count))
265 __fput(file);
266}
267
268EXPORT_SYMBOL(fput);
269
271struct file *fget(unsigned int fd) 270struct file *fget(unsigned int fd)
272{ 271{
273 struct file *file; 272 struct file *file;
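Note: with __fput() now static, fput() is the only way to drop a file reference, and whichever caller drops the last one runs the full teardown. The usual pairing, as a sketch:

	struct file *file = fget(fd);	/* take a reference, NULL if fd is bad */
	if (!file)
		return -EBADF;
	/* ... use file ... */
	fput(file);	/* the final reference triggers __fput() teardown */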
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index aee049cb9f84..0ec7bb2c95c6 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -57,6 +57,8 @@ const struct inode_operations vxfs_dir_inode_ops = {
57}; 57};
58 58
59const struct file_operations vxfs_dir_operations = { 59const struct file_operations vxfs_dir_operations = {
60 .llseek = generic_file_llseek,
61 .read = generic_read_dir,
60 .readdir = vxfs_readdir, 62 .readdir = vxfs_readdir,
61}; 63};
62 64
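Note: vxfs (and isofs further down) now give directory file_operations explicit generic handlers instead of relying on the defaults. The resulting shape, with a hypothetical readdir:

const struct file_operations demo_dir_operations = {
	.llseek		= generic_file_llseek,	/* well-defined directory offsets */
	.read		= generic_read_dir,	/* read(2) on a directory returns -EISDIR */
	.readdir	= demo_readdir,		/* illustrative name */
};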
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ea8592b90696..1d1088f48bc2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -45,7 +45,6 @@ struct wb_writeback_args {
45 unsigned int for_kupdate:1; 45 unsigned int for_kupdate:1;
46 unsigned int range_cyclic:1; 46 unsigned int range_cyclic:1;
47 unsigned int for_background:1; 47 unsigned int for_background:1;
48 unsigned int sb_pinned:1;
49}; 48};
50 49
51/* 50/*
@@ -193,8 +192,7 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
193} 192}
194 193
195static void bdi_alloc_queue_work(struct backing_dev_info *bdi, 194static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
196 struct wb_writeback_args *args, 195 struct wb_writeback_args *args)
197 int wait)
198{ 196{
199 struct bdi_work *work; 197 struct bdi_work *work;
200 198
@@ -206,8 +204,6 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
206 if (work) { 204 if (work) {
207 bdi_work_init(work, args); 205 bdi_work_init(work, args);
208 bdi_queue_work(bdi, work); 206 bdi_queue_work(bdi, work);
209 if (wait)
210 bdi_wait_on_work_clear(work);
211 } else { 207 } else {
212 struct bdi_writeback *wb = &bdi->wb; 208 struct bdi_writeback *wb = &bdi->wb;
213 209
@@ -234,11 +230,6 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
234 .sync_mode = WB_SYNC_ALL, 230 .sync_mode = WB_SYNC_ALL,
235 .nr_pages = LONG_MAX, 231 .nr_pages = LONG_MAX,
236 .range_cyclic = 0, 232 .range_cyclic = 0,
237 /*
238 * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
239 * lets make it explicitly clear.
240 */
241 .sb_pinned = 1,
242 }; 233 };
243 struct bdi_work work; 234 struct bdi_work work;
244 235
@@ -254,23 +245,21 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
254 * @bdi: the backing device to write from 245 * @bdi: the backing device to write from
255 * @sb: write inodes from this super_block 246 * @sb: write inodes from this super_block
256 * @nr_pages: the number of pages to write 247 * @nr_pages: the number of pages to write
257 * @sb_locked: caller already holds sb umount sem.
258 * 248 *
259 * Description: 249 * Description:
260 * This does WB_SYNC_NONE opportunistic writeback. The IO is only 250 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
 261 started when this function returns, we make no guarantees on 251 started when this function returns, we make no guarantees on
262 * completion. Caller specifies whether sb umount sem is held already or not. 252 * completion. Caller need not hold sb s_umount semaphore.
263 * 253 *
264 */ 254 */
265void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, 255void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
266 long nr_pages, int sb_locked) 256 long nr_pages)
267{ 257{
268 struct wb_writeback_args args = { 258 struct wb_writeback_args args = {
269 .sb = sb, 259 .sb = sb,
270 .sync_mode = WB_SYNC_NONE, 260 .sync_mode = WB_SYNC_NONE,
271 .nr_pages = nr_pages, 261 .nr_pages = nr_pages,
272 .range_cyclic = 1, 262 .range_cyclic = 1,
273 .sb_pinned = sb_locked,
274 }; 263 };
275 264
276 /* 265 /*
@@ -282,7 +271,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
282 args.for_background = 1; 271 args.for_background = 1;
283 } 272 }
284 273
285 bdi_alloc_queue_work(bdi, &args, sb_locked); 274 bdi_alloc_queue_work(bdi, &args);
286} 275}
287 276
288/* 277/*
@@ -595,7 +584,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
595 /* 584 /*
596 * Caller must already hold the ref for this 585 * Caller must already hold the ref for this
597 */ 586 */
598 if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) { 587 if (wbc->sync_mode == WB_SYNC_ALL) {
599 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 588 WARN_ON(!rwsem_is_locked(&sb->s_umount));
600 return SB_NOT_PINNED; 589 return SB_NOT_PINNED;
601 } 590 }
@@ -769,7 +758,6 @@ static long wb_writeback(struct bdi_writeback *wb,
769 .for_kupdate = args->for_kupdate, 758 .for_kupdate = args->for_kupdate,
770 .for_background = args->for_background, 759 .for_background = args->for_background,
771 .range_cyclic = args->range_cyclic, 760 .range_cyclic = args->range_cyclic,
772 .sb_pinned = args->sb_pinned,
773 }; 761 };
774 unsigned long oldest_jif; 762 unsigned long oldest_jif;
775 long wrote = 0; 763 long wrote = 0;
@@ -912,7 +900,6 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
912 900
913 while ((work = get_next_work_item(bdi, wb)) != NULL) { 901 while ((work = get_next_work_item(bdi, wb)) != NULL) {
914 struct wb_writeback_args args = work->args; 902 struct wb_writeback_args args = work->args;
915 int post_clear;
916 903
917 /* 904 /*
918 * Override sync mode, in case we must wait for completion 905 * Override sync mode, in case we must wait for completion
@@ -920,13 +907,11 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
920 if (force_wait) 907 if (force_wait)
921 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; 908 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
922 909
923 post_clear = WB_SYNC_ALL || args.sb_pinned;
924
925 /* 910 /*
926 * If this isn't a data integrity operation, just notify 911 * If this isn't a data integrity operation, just notify
927 * that we have seen this work and we are now starting it. 912 * that we have seen this work and we are now starting it.
928 */ 913 */
929 if (!post_clear) 914 if (args.sync_mode == WB_SYNC_NONE)
930 wb_clear_pending(wb, work); 915 wb_clear_pending(wb, work);
931 916
932 wrote += wb_writeback(wb, &args); 917 wrote += wb_writeback(wb, &args);
@@ -935,7 +920,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
935 * This is a data integrity writeback, so only do the 920 * This is a data integrity writeback, so only do the
936 * notification when we have completed the work. 921 * notification when we have completed the work.
937 */ 922 */
938 if (post_clear) 923 if (args.sync_mode == WB_SYNC_ALL)
939 wb_clear_pending(wb, work); 924 wb_clear_pending(wb, work);
940 } 925 }
941 926
@@ -1011,7 +996,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages)
1011 if (!bdi_has_dirty_io(bdi)) 996 if (!bdi_has_dirty_io(bdi))
1012 continue; 997 continue;
1013 998
1014 bdi_alloc_queue_work(bdi, &args, 0); 999 bdi_alloc_queue_work(bdi, &args);
1015 } 1000 }
1016 1001
1017 rcu_read_unlock(); 1002 rcu_read_unlock();
@@ -1220,18 +1205,6 @@ static void wait_sb_inodes(struct super_block *sb)
1220 iput(old_inode); 1205 iput(old_inode);
1221} 1206}
1222 1207
1223static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
1224{
1225 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1226 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1227 long nr_to_write;
1228
1229 nr_to_write = nr_dirty + nr_unstable +
1230 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1231
1232 bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
1233}
1234
1235/** 1208/**
1236 * writeback_inodes_sb - writeback dirty inodes from given super_block 1209 * writeback_inodes_sb - writeback dirty inodes from given super_block
1237 * @sb: the superblock 1210 * @sb: the superblock
@@ -1243,21 +1216,16 @@ static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
1243 */ 1216 */
1244void writeback_inodes_sb(struct super_block *sb) 1217void writeback_inodes_sb(struct super_block *sb)
1245{ 1218{
1246 __writeback_inodes_sb(sb, 0); 1219 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1247} 1220 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1248EXPORT_SYMBOL(writeback_inodes_sb); 1221 long nr_to_write;
1249 1222
1250/** 1223 nr_to_write = nr_dirty + nr_unstable +
1251 * writeback_inodes_sb_locked - writeback dirty inodes from given super_block 1224 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1252 * @sb: the superblock 1225
1253 * 1226 bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
1254 * Like writeback_inodes_sb(), except the caller already holds the
1255 * sb umount sem.
1256 */
1257void writeback_inodes_sb_locked(struct super_block *sb)
1258{
1259 __writeback_inodes_sb(sb, 1);
1260} 1227}
1228EXPORT_SYMBOL(writeback_inodes_sb);
1261 1229
1262/** 1230/**
1263 * writeback_inodes_sb_if_idle - start writeback if none underway 1231 * writeback_inodes_sb_if_idle - start writeback if none underway
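Note: with sb_pinned gone, the writeback rule is simply that WB_SYNC_ALL callers must already hold sb->s_umount (hence the WARN_ON in pin_sb_for_writeback()), while WB_SYNC_NONE callers need no pin at all. Typical call sites after this change, as a sketch:

	/* opportunistic writeback of one superblock; IO is only started,
	 * not waited for, by the time this returns */
	writeback_inodes_sb(sb);

	/* or push an explicit page budget at a backing device */
	bdi_start_writeback(sb->s_bdi, sb, nr_to_write);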
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 1e1f286dd70e..4a8eb31c5338 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -103,7 +103,7 @@ static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
103 /* banners (can't represent line 0 by pos 0 as that would involve 103 /* banners (can't represent line 0 by pos 0 as that would involve
104 * returning a NULL pointer) */ 104 * returning a NULL pointer) */
105 if (pos == 0) 105 if (pos == 0)
106 return (struct fscache_object *) ++(*_pos); 106 return (struct fscache_object *)(long)++(*_pos);
107 if (pos < 3) 107 if (pos < 3)
108 return (struct fscache_object *)pos; 108 return (struct fscache_object *)pos;
109 109
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 47aefd376e54..723b889fd219 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -710,30 +710,26 @@ static void fscache_write_op(struct fscache_operation *_op)
710 goto superseded; 710 goto superseded;
711 } 711 }
712 712
713 if (page) { 713 radix_tree_tag_set(&cookie->stores, page->index,
714 radix_tree_tag_set(&cookie->stores, page->index, 714 FSCACHE_COOKIE_STORING_TAG);
715 FSCACHE_COOKIE_STORING_TAG); 715 radix_tree_tag_clear(&cookie->stores, page->index,
716 radix_tree_tag_clear(&cookie->stores, page->index, 716 FSCACHE_COOKIE_PENDING_TAG);
717 FSCACHE_COOKIE_PENDING_TAG);
718 }
719 717
720 spin_unlock(&cookie->stores_lock); 718 spin_unlock(&cookie->stores_lock);
721 spin_unlock(&object->lock); 719 spin_unlock(&object->lock);
722 720
723 if (page) { 721 fscache_set_op_state(&op->op, "Store");
724 fscache_set_op_state(&op->op, "Store"); 722 fscache_stat(&fscache_n_store_pages);
725 fscache_stat(&fscache_n_store_pages); 723 fscache_stat(&fscache_n_cop_write_page);
726 fscache_stat(&fscache_n_cop_write_page); 724 ret = object->cache->ops->write_page(op, page);
727 ret = object->cache->ops->write_page(op, page); 725 fscache_stat_d(&fscache_n_cop_write_page);
728 fscache_stat_d(&fscache_n_cop_write_page); 726 fscache_set_op_state(&op->op, "EndWrite");
729 fscache_set_op_state(&op->op, "EndWrite"); 727 fscache_end_page_write(object, page);
730 fscache_end_page_write(object, page); 728 if (ret < 0) {
731 if (ret < 0) { 729 fscache_set_op_state(&op->op, "Abort");
732 fscache_set_op_state(&op->op, "Abort"); 730 fscache_abort_object(object);
733 fscache_abort_object(object); 731 } else {
734 } else { 732 fscache_enqueue_operation(&op->op);
735 fscache_enqueue_operation(&op->op);
736 }
737 } 733 }
738 734
739 _leave(""); 735 _leave("");
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e53df5ebb2b8..9424796d6634 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -16,6 +16,9 @@
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/file.h> 17#include <linux/file.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/pipe_fs_i.h>
20#include <linux/swap.h>
21#include <linux/splice.h>
19 22
20MODULE_ALIAS_MISCDEV(FUSE_MINOR); 23MODULE_ALIAS_MISCDEV(FUSE_MINOR);
21MODULE_ALIAS("devname:fuse"); 24MODULE_ALIAS("devname:fuse");
@@ -499,6 +502,9 @@ struct fuse_copy_state {
499 int write; 502 int write;
500 struct fuse_req *req; 503 struct fuse_req *req;
501 const struct iovec *iov; 504 const struct iovec *iov;
505 struct pipe_buffer *pipebufs;
506 struct pipe_buffer *currbuf;
507 struct pipe_inode_info *pipe;
502 unsigned long nr_segs; 508 unsigned long nr_segs;
503 unsigned long seglen; 509 unsigned long seglen;
504 unsigned long addr; 510 unsigned long addr;
@@ -506,16 +512,16 @@ struct fuse_copy_state {
506 void *mapaddr; 512 void *mapaddr;
507 void *buf; 513 void *buf;
508 unsigned len; 514 unsigned len;
515 unsigned move_pages:1;
509}; 516};
510 517
511static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, 518static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
512 int write, struct fuse_req *req, 519 int write,
513 const struct iovec *iov, unsigned long nr_segs) 520 const struct iovec *iov, unsigned long nr_segs)
514{ 521{
515 memset(cs, 0, sizeof(*cs)); 522 memset(cs, 0, sizeof(*cs));
516 cs->fc = fc; 523 cs->fc = fc;
517 cs->write = write; 524 cs->write = write;
518 cs->req = req;
519 cs->iov = iov; 525 cs->iov = iov;
520 cs->nr_segs = nr_segs; 526 cs->nr_segs = nr_segs;
521} 527}
@@ -523,7 +529,18 @@ static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
523/* Unmap and put previous page of userspace buffer */ 529/* Unmap and put previous page of userspace buffer */
524static void fuse_copy_finish(struct fuse_copy_state *cs) 530static void fuse_copy_finish(struct fuse_copy_state *cs)
525{ 531{
526 if (cs->mapaddr) { 532 if (cs->currbuf) {
533 struct pipe_buffer *buf = cs->currbuf;
534
535 if (!cs->write) {
536 buf->ops->unmap(cs->pipe, buf, cs->mapaddr);
537 } else {
538 kunmap_atomic(cs->mapaddr, KM_USER0);
539 buf->len = PAGE_SIZE - cs->len;
540 }
541 cs->currbuf = NULL;
542 cs->mapaddr = NULL;
543 } else if (cs->mapaddr) {
527 kunmap_atomic(cs->mapaddr, KM_USER0); 544 kunmap_atomic(cs->mapaddr, KM_USER0);
528 if (cs->write) { 545 if (cs->write) {
529 flush_dcache_page(cs->pg); 546 flush_dcache_page(cs->pg);
@@ -545,26 +562,61 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
545 562
546 unlock_request(cs->fc, cs->req); 563 unlock_request(cs->fc, cs->req);
547 fuse_copy_finish(cs); 564 fuse_copy_finish(cs);
548 if (!cs->seglen) { 565 if (cs->pipebufs) {
549 BUG_ON(!cs->nr_segs); 566 struct pipe_buffer *buf = cs->pipebufs;
550 cs->seglen = cs->iov[0].iov_len; 567
551 cs->addr = (unsigned long) cs->iov[0].iov_base; 568 if (!cs->write) {
552 cs->iov++; 569 err = buf->ops->confirm(cs->pipe, buf);
553 cs->nr_segs--; 570 if (err)
571 return err;
572
573 BUG_ON(!cs->nr_segs);
574 cs->currbuf = buf;
575 cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
576 cs->len = buf->len;
577 cs->buf = cs->mapaddr + buf->offset;
578 cs->pipebufs++;
579 cs->nr_segs--;
580 } else {
581 struct page *page;
582
583 if (cs->nr_segs == cs->pipe->buffers)
584 return -EIO;
585
586 page = alloc_page(GFP_HIGHUSER);
587 if (!page)
588 return -ENOMEM;
589
590 buf->page = page;
591 buf->offset = 0;
592 buf->len = 0;
593
594 cs->currbuf = buf;
595 cs->mapaddr = kmap_atomic(page, KM_USER0);
596 cs->buf = cs->mapaddr;
597 cs->len = PAGE_SIZE;
598 cs->pipebufs++;
599 cs->nr_segs++;
600 }
601 } else {
602 if (!cs->seglen) {
603 BUG_ON(!cs->nr_segs);
604 cs->seglen = cs->iov[0].iov_len;
605 cs->addr = (unsigned long) cs->iov[0].iov_base;
606 cs->iov++;
607 cs->nr_segs--;
608 }
609 err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg);
610 if (err < 0)
611 return err;
612 BUG_ON(err != 1);
613 offset = cs->addr % PAGE_SIZE;
614 cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
615 cs->buf = cs->mapaddr + offset;
616 cs->len = min(PAGE_SIZE - offset, cs->seglen);
617 cs->seglen -= cs->len;
618 cs->addr += cs->len;
554 } 619 }
555 down_read(&current->mm->mmap_sem);
556 err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
557 &cs->pg, NULL);
558 up_read(&current->mm->mmap_sem);
559 if (err < 0)
560 return err;
561 BUG_ON(err != 1);
562 offset = cs->addr % PAGE_SIZE;
563 cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
564 cs->buf = cs->mapaddr + offset;
565 cs->len = min(PAGE_SIZE - offset, cs->seglen);
566 cs->seglen -= cs->len;
567 cs->addr += cs->len;
568 620
569 return lock_request(cs->fc, cs->req); 621 return lock_request(cs->fc, cs->req);
570} 622}
@@ -586,23 +638,178 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
586 return ncpy; 638 return ncpy;
587} 639}
588 640
641static int fuse_check_page(struct page *page)
642{
643 if (page_mapcount(page) ||
644 page->mapping != NULL ||
645 page_count(page) != 1 ||
646 (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
647 ~(1 << PG_locked |
648 1 << PG_referenced |
649 1 << PG_uptodate |
650 1 << PG_lru |
651 1 << PG_active |
652 1 << PG_reclaim))) {
653 printk(KERN_WARNING "fuse: trying to steal weird page\n");
654 printk(KERN_WARNING " page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
655 return 1;
656 }
657 return 0;
658}
659
660static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
661{
662 int err;
663 struct page *oldpage = *pagep;
664 struct page *newpage;
665 struct pipe_buffer *buf = cs->pipebufs;
666 struct address_space *mapping;
667 pgoff_t index;
668
669 unlock_request(cs->fc, cs->req);
670 fuse_copy_finish(cs);
671
672 err = buf->ops->confirm(cs->pipe, buf);
673 if (err)
674 return err;
675
676 BUG_ON(!cs->nr_segs);
677 cs->currbuf = buf;
678 cs->len = buf->len;
679 cs->pipebufs++;
680 cs->nr_segs--;
681
682 if (cs->len != PAGE_SIZE)
683 goto out_fallback;
684
685 if (buf->ops->steal(cs->pipe, buf) != 0)
686 goto out_fallback;
687
688 newpage = buf->page;
689
690 if (WARN_ON(!PageUptodate(newpage)))
691 return -EIO;
692
693 ClearPageMappedToDisk(newpage);
694
695 if (fuse_check_page(newpage) != 0)
696 goto out_fallback_unlock;
697
698 mapping = oldpage->mapping;
699 index = oldpage->index;
700
701 /*
702 * This is a new and locked page, it shouldn't be mapped or
703 * have any special flags on it
704 */
705 if (WARN_ON(page_mapped(oldpage)))
706 goto out_fallback_unlock;
707 if (WARN_ON(page_has_private(oldpage)))
708 goto out_fallback_unlock;
709 if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
710 goto out_fallback_unlock;
711 if (WARN_ON(PageMlocked(oldpage)))
712 goto out_fallback_unlock;
713
714 remove_from_page_cache(oldpage);
715 page_cache_release(oldpage);
716
717 err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL);
718 if (err) {
719 printk(KERN_WARNING "fuse_try_move_page: failed to add page");
720 goto out_fallback_unlock;
721 }
722 page_cache_get(newpage);
723
724 if (!(buf->flags & PIPE_BUF_FLAG_LRU))
725 lru_cache_add_file(newpage);
726
727 err = 0;
728 spin_lock(&cs->fc->lock);
729 if (cs->req->aborted)
730 err = -ENOENT;
731 else
732 *pagep = newpage;
733 spin_unlock(&cs->fc->lock);
734
735 if (err) {
736 unlock_page(newpage);
737 page_cache_release(newpage);
738 return err;
739 }
740
741 unlock_page(oldpage);
742 page_cache_release(oldpage);
743 cs->len = 0;
744
745 return 0;
746
747out_fallback_unlock:
748 unlock_page(newpage);
749out_fallback:
750 cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
751 cs->buf = cs->mapaddr + buf->offset;
752
753 err = lock_request(cs->fc, cs->req);
754 if (err)
755 return err;
756
757 return 1;
758}
759
760static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
761 unsigned offset, unsigned count)
762{
763 struct pipe_buffer *buf;
764
765 if (cs->nr_segs == cs->pipe->buffers)
766 return -EIO;
767
768 unlock_request(cs->fc, cs->req);
769 fuse_copy_finish(cs);
770
771 buf = cs->pipebufs;
772 page_cache_get(page);
773 buf->page = page;
774 buf->offset = offset;
775 buf->len = count;
776
777 cs->pipebufs++;
778 cs->nr_segs++;
779 cs->len = 0;
780
781 return 0;
782}
783
589/* 784/*
590 * Copy a page in the request to/from the userspace buffer. Must be 785 * Copy a page in the request to/from the userspace buffer. Must be
591 * done atomically 786 * done atomically
592 */ 787 */
593static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page, 788static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
594 unsigned offset, unsigned count, int zeroing) 789 unsigned offset, unsigned count, int zeroing)
595{ 790{
791 int err;
792 struct page *page = *pagep;
793
596 if (page && zeroing && count < PAGE_SIZE) { 794 if (page && zeroing && count < PAGE_SIZE) {
597 void *mapaddr = kmap_atomic(page, KM_USER1); 795 void *mapaddr = kmap_atomic(page, KM_USER1);
598 memset(mapaddr, 0, PAGE_SIZE); 796 memset(mapaddr, 0, PAGE_SIZE);
599 kunmap_atomic(mapaddr, KM_USER1); 797 kunmap_atomic(mapaddr, KM_USER1);
600 } 798 }
601 while (count) { 799 while (count) {
602 if (!cs->len) { 800 if (cs->write && cs->pipebufs && page) {
603 int err = fuse_copy_fill(cs); 801 return fuse_ref_page(cs, page, offset, count);
604 if (err) 802 } else if (!cs->len) {
605 return err; 803 if (cs->move_pages && page &&
804 offset == 0 && count == PAGE_SIZE) {
805 err = fuse_try_move_page(cs, pagep);
806 if (err <= 0)
807 return err;
808 } else {
809 err = fuse_copy_fill(cs);
810 if (err)
811 return err;
812 }
606 } 813 }
607 if (page) { 814 if (page) {
608 void *mapaddr = kmap_atomic(page, KM_USER1); 815 void *mapaddr = kmap_atomic(page, KM_USER1);
@@ -627,8 +834,10 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
627 unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset); 834 unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
628 835
629 for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { 836 for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
630 struct page *page = req->pages[i]; 837 int err;
631 int err = fuse_copy_page(cs, page, offset, count, zeroing); 838
839 err = fuse_copy_page(cs, &req->pages[i], offset, count,
840 zeroing);
632 if (err) 841 if (err)
633 return err; 842 return err;
634 843
@@ -705,11 +914,10 @@ __acquires(&fc->lock)
705 * 914 *
706 * Called with fc->lock held, releases it 915 * Called with fc->lock held, releases it
707 */ 916 */
708static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, 917static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
709 const struct iovec *iov, unsigned long nr_segs) 918 size_t nbytes, struct fuse_req *req)
710__releases(&fc->lock) 919__releases(&fc->lock)
711{ 920{
712 struct fuse_copy_state cs;
713 struct fuse_in_header ih; 921 struct fuse_in_header ih;
714 struct fuse_interrupt_in arg; 922 struct fuse_interrupt_in arg;
715 unsigned reqsize = sizeof(ih) + sizeof(arg); 923 unsigned reqsize = sizeof(ih) + sizeof(arg);
@@ -725,14 +933,13 @@ __releases(&fc->lock)
725 arg.unique = req->in.h.unique; 933 arg.unique = req->in.h.unique;
726 934
727 spin_unlock(&fc->lock); 935 spin_unlock(&fc->lock);
728 if (iov_length(iov, nr_segs) < reqsize) 936 if (nbytes < reqsize)
729 return -EINVAL; 937 return -EINVAL;
730 938
731 fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs); 939 err = fuse_copy_one(cs, &ih, sizeof(ih));
732 err = fuse_copy_one(&cs, &ih, sizeof(ih));
733 if (!err) 940 if (!err)
734 err = fuse_copy_one(&cs, &arg, sizeof(arg)); 941 err = fuse_copy_one(cs, &arg, sizeof(arg));
735 fuse_copy_finish(&cs); 942 fuse_copy_finish(cs);
736 943
737 return err ? err : reqsize; 944 return err ? err : reqsize;
738} 945}
@@ -746,18 +953,13 @@ __releases(&fc->lock)
746 * request_end(). Otherwise add it to the processing list, and set 953 * request_end(). Otherwise add it to the processing list, and set
747 * the 'sent' flag. 954 * the 'sent' flag.
748 */ 955 */
749static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, 956static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
750 unsigned long nr_segs, loff_t pos) 957 struct fuse_copy_state *cs, size_t nbytes)
751{ 958{
752 int err; 959 int err;
753 struct fuse_req *req; 960 struct fuse_req *req;
754 struct fuse_in *in; 961 struct fuse_in *in;
755 struct fuse_copy_state cs;
756 unsigned reqsize; 962 unsigned reqsize;
757 struct file *file = iocb->ki_filp;
758 struct fuse_conn *fc = fuse_get_conn(file);
759 if (!fc)
760 return -EPERM;
761 963
762 restart: 964 restart:
763 spin_lock(&fc->lock); 965 spin_lock(&fc->lock);
@@ -777,7 +979,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
777 if (!list_empty(&fc->interrupts)) { 979 if (!list_empty(&fc->interrupts)) {
778 req = list_entry(fc->interrupts.next, struct fuse_req, 980 req = list_entry(fc->interrupts.next, struct fuse_req,
779 intr_entry); 981 intr_entry);
780 return fuse_read_interrupt(fc, req, iov, nr_segs); 982 return fuse_read_interrupt(fc, cs, nbytes, req);
781 } 983 }
782 984
783 req = list_entry(fc->pending.next, struct fuse_req, list); 985 req = list_entry(fc->pending.next, struct fuse_req, list);
@@ -787,7 +989,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
787 in = &req->in; 989 in = &req->in;
788 reqsize = in->h.len; 990 reqsize = in->h.len;
789 /* If request is too large, reply with an error and restart the read */ 991 /* If request is too large, reply with an error and restart the read */
790 if (iov_length(iov, nr_segs) < reqsize) { 992 if (nbytes < reqsize) {
791 req->out.h.error = -EIO; 993 req->out.h.error = -EIO;
792 /* SETXATTR is special, since it may contain too large data */ 994 /* SETXATTR is special, since it may contain too large data */
793 if (in->h.opcode == FUSE_SETXATTR) 995 if (in->h.opcode == FUSE_SETXATTR)
@@ -796,12 +998,12 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
796 goto restart; 998 goto restart;
797 } 999 }
798 spin_unlock(&fc->lock); 1000 spin_unlock(&fc->lock);
799 fuse_copy_init(&cs, fc, 1, req, iov, nr_segs); 1001 cs->req = req;
800 err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); 1002 err = fuse_copy_one(cs, &in->h, sizeof(in->h));
801 if (!err) 1003 if (!err)
802 err = fuse_copy_args(&cs, in->numargs, in->argpages, 1004 err = fuse_copy_args(cs, in->numargs, in->argpages,
803 (struct fuse_arg *) in->args, 0); 1005 (struct fuse_arg *) in->args, 0);
804 fuse_copy_finish(&cs); 1006 fuse_copy_finish(cs);
805 spin_lock(&fc->lock); 1007 spin_lock(&fc->lock);
806 req->locked = 0; 1008 req->locked = 0;
807 if (req->aborted) { 1009 if (req->aborted) {
@@ -829,6 +1031,110 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
829 return err; 1031 return err;
830} 1032}
831 1033
1034static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
1035 unsigned long nr_segs, loff_t pos)
1036{
1037 struct fuse_copy_state cs;
1038 struct file *file = iocb->ki_filp;
1039 struct fuse_conn *fc = fuse_get_conn(file);
1040 if (!fc)
1041 return -EPERM;
1042
1043 fuse_copy_init(&cs, fc, 1, iov, nr_segs);
1044
1045 return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
1046}
1047
1048static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe,
1049 struct pipe_buffer *buf)
1050{
1051 return 1;
1052}
1053
1054static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = {
1055 .can_merge = 0,
1056 .map = generic_pipe_buf_map,
1057 .unmap = generic_pipe_buf_unmap,
1058 .confirm = generic_pipe_buf_confirm,
1059 .release = generic_pipe_buf_release,
1060 .steal = fuse_dev_pipe_buf_steal,
1061 .get = generic_pipe_buf_get,
1062};
1063
1064static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1065 struct pipe_inode_info *pipe,
1066 size_t len, unsigned int flags)
1067{
1068 int ret;
1069 int page_nr = 0;
1070 int do_wakeup = 0;
1071 struct pipe_buffer *bufs;
1072 struct fuse_copy_state cs;
1073 struct fuse_conn *fc = fuse_get_conn(in);
1074 if (!fc)
1075 return -EPERM;
1076
1077 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
1078 if (!bufs)
1079 return -ENOMEM;
1080
1081 fuse_copy_init(&cs, fc, 1, NULL, 0);
1082 cs.pipebufs = bufs;
1083 cs.pipe = pipe;
1084 ret = fuse_dev_do_read(fc, in, &cs, len);
1085 if (ret < 0)
1086 goto out;
1087
1088 ret = 0;
1089 pipe_lock(pipe);
1090
1091 if (!pipe->readers) {
1092 send_sig(SIGPIPE, current, 0);
1093 if (!ret)
1094 ret = -EPIPE;
1095 goto out_unlock;
1096 }
1097
1098 if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
1099 ret = -EIO;
1100 goto out_unlock;
1101 }
1102
1103 while (page_nr < cs.nr_segs) {
1104 int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
1105 struct pipe_buffer *buf = pipe->bufs + newbuf;
1106
1107 buf->page = bufs[page_nr].page;
1108 buf->offset = bufs[page_nr].offset;
1109 buf->len = bufs[page_nr].len;
1110 buf->ops = &fuse_dev_pipe_buf_ops;
1111
1112 pipe->nrbufs++;
1113 page_nr++;
1114 ret += buf->len;
1115
1116 if (pipe->inode)
1117 do_wakeup = 1;
1118 }
1119
1120out_unlock:
1121 pipe_unlock(pipe);
1122
1123 if (do_wakeup) {
1124 smp_mb();
1125 if (waitqueue_active(&pipe->wait))
1126 wake_up_interruptible(&pipe->wait);
1127 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
1128 }
1129
1130out:
1131 for (; page_nr < cs.nr_segs; page_nr++)
1132 page_cache_release(bufs[page_nr].page);
1133
1134 kfree(bufs);
1135 return ret;
1136}
1137
832static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, 1138static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
833 struct fuse_copy_state *cs) 1139 struct fuse_copy_state *cs)
834{ 1140{
@@ -988,23 +1294,17 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
988 * it from the list and copy the rest of the buffer to the request. 1294 * it from the list and copy the rest of the buffer to the request.
989 * The request is finished by calling request_end() 1295 * The request is finished by calling request_end()
990 */ 1296 */
991static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, 1297static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
992 unsigned long nr_segs, loff_t pos) 1298 struct fuse_copy_state *cs, size_t nbytes)
993{ 1299{
994 int err; 1300 int err;
995 size_t nbytes = iov_length(iov, nr_segs);
996 struct fuse_req *req; 1301 struct fuse_req *req;
997 struct fuse_out_header oh; 1302 struct fuse_out_header oh;
998 struct fuse_copy_state cs;
999 struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
1000 if (!fc)
1001 return -EPERM;
1002 1303
1003 fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs);
1004 if (nbytes < sizeof(struct fuse_out_header)) 1304 if (nbytes < sizeof(struct fuse_out_header))
1005 return -EINVAL; 1305 return -EINVAL;
1006 1306
1007 err = fuse_copy_one(&cs, &oh, sizeof(oh)); 1307 err = fuse_copy_one(cs, &oh, sizeof(oh));
1008 if (err) 1308 if (err)
1009 goto err_finish; 1309 goto err_finish;
1010 1310
@@ -1017,7 +1317,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1017 * and error contains notification code. 1317 * and error contains notification code.
1018 */ 1318 */
1019 if (!oh.unique) { 1319 if (!oh.unique) {
1020 err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs); 1320 err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
1021 return err ? err : nbytes; 1321 return err ? err : nbytes;
1022 } 1322 }
1023 1323
@@ -1036,7 +1336,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1036 1336
1037 if (req->aborted) { 1337 if (req->aborted) {
1038 spin_unlock(&fc->lock); 1338 spin_unlock(&fc->lock);
1039 fuse_copy_finish(&cs); 1339 fuse_copy_finish(cs);
1040 spin_lock(&fc->lock); 1340 spin_lock(&fc->lock);
1041 request_end(fc, req); 1341 request_end(fc, req);
1042 return -ENOENT; 1342 return -ENOENT;
@@ -1053,7 +1353,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1053 queue_interrupt(fc, req); 1353 queue_interrupt(fc, req);
1054 1354
1055 spin_unlock(&fc->lock); 1355 spin_unlock(&fc->lock);
1056 fuse_copy_finish(&cs); 1356 fuse_copy_finish(cs);
1057 return nbytes; 1357 return nbytes;
1058 } 1358 }
1059 1359
@@ -1061,11 +1361,13 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1061 list_move(&req->list, &fc->io); 1361 list_move(&req->list, &fc->io);
1062 req->out.h = oh; 1362 req->out.h = oh;
1063 req->locked = 1; 1363 req->locked = 1;
1064 cs.req = req; 1364 cs->req = req;
1365 if (!req->out.page_replace)
1366 cs->move_pages = 0;
1065 spin_unlock(&fc->lock); 1367 spin_unlock(&fc->lock);
1066 1368
1067 err = copy_out_args(&cs, &req->out, nbytes); 1369 err = copy_out_args(cs, &req->out, nbytes);
1068 fuse_copy_finish(&cs); 1370 fuse_copy_finish(cs);
1069 1371
1070 spin_lock(&fc->lock); 1372 spin_lock(&fc->lock);
1071 req->locked = 0; 1373 req->locked = 0;
@@ -1081,10 +1383,101 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1081 err_unlock: 1383 err_unlock:
1082 spin_unlock(&fc->lock); 1384 spin_unlock(&fc->lock);
1083 err_finish: 1385 err_finish:
1084 fuse_copy_finish(&cs); 1386 fuse_copy_finish(cs);
1085 return err; 1387 return err;
1086} 1388}
1087 1389
1390static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1391 unsigned long nr_segs, loff_t pos)
1392{
1393 struct fuse_copy_state cs;
1394 struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
1395 if (!fc)
1396 return -EPERM;
1397
1398 fuse_copy_init(&cs, fc, 0, iov, nr_segs);
1399
1400 return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs));
1401}
1402
1403static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
1404 struct file *out, loff_t *ppos,
1405 size_t len, unsigned int flags)
1406{
1407 unsigned nbuf;
1408 unsigned idx;
1409 struct pipe_buffer *bufs;
1410 struct fuse_copy_state cs;
1411 struct fuse_conn *fc;
1412 size_t rem;
1413 ssize_t ret;
1414
1415 fc = fuse_get_conn(out);
1416 if (!fc)
1417 return -EPERM;
1418
1419 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
1420 if (!bufs)
1421 return -ENOMEM;
1422
1423 pipe_lock(pipe);
1424 nbuf = 0;
1425 rem = 0;
1426 for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
1427 rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
1428
1429 ret = -EINVAL;
1430 if (rem < len) {
1431 pipe_unlock(pipe);
1432 goto out;
1433 }
1434
1435 rem = len;
1436 while (rem) {
1437 struct pipe_buffer *ibuf;
1438 struct pipe_buffer *obuf;
1439
1440 BUG_ON(nbuf >= pipe->buffers);
1441 BUG_ON(!pipe->nrbufs);
1442 ibuf = &pipe->bufs[pipe->curbuf];
1443 obuf = &bufs[nbuf];
1444
1445 if (rem >= ibuf->len) {
1446 *obuf = *ibuf;
1447 ibuf->ops = NULL;
1448 pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
1449 pipe->nrbufs--;
1450 } else {
1451 ibuf->ops->get(pipe, ibuf);
1452 *obuf = *ibuf;
1453 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1454 obuf->len = rem;
1455 ibuf->offset += obuf->len;
1456 ibuf->len -= obuf->len;
1457 }
1458 nbuf++;
1459 rem -= obuf->len;
1460 }
1461 pipe_unlock(pipe);
1462
1463 fuse_copy_init(&cs, fc, 0, NULL, nbuf);
1464 cs.pipebufs = bufs;
1465 cs.pipe = pipe;
1466
1467 if (flags & SPLICE_F_MOVE)
1468 cs.move_pages = 1;
1469
1470 ret = fuse_dev_do_write(fc, &cs, len);
1471
1472 for (idx = 0; idx < nbuf; idx++) {
1473 struct pipe_buffer *buf = &bufs[idx];
1474 buf->ops->release(pipe, buf);
1475 }
1476out:
1477 kfree(bufs);
1478 return ret;
1479}
1480
1088static unsigned fuse_dev_poll(struct file *file, poll_table *wait) 1481static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
1089{ 1482{
1090 unsigned mask = POLLOUT | POLLWRNORM; 1483 unsigned mask = POLLOUT | POLLWRNORM;
@@ -1226,8 +1619,10 @@ const struct file_operations fuse_dev_operations = {
1226 .llseek = no_llseek, 1619 .llseek = no_llseek,
1227 .read = do_sync_read, 1620 .read = do_sync_read,
1228 .aio_read = fuse_dev_read, 1621 .aio_read = fuse_dev_read,
1622 .splice_read = fuse_dev_splice_read,
1229 .write = do_sync_write, 1623 .write = do_sync_write,
1230 .aio_write = fuse_dev_write, 1624 .aio_write = fuse_dev_write,
1625 .splice_write = fuse_dev_splice_write,
1231 .poll = fuse_dev_poll, 1626 .poll = fuse_dev_poll,
1232 .release = fuse_dev_release, 1627 .release = fuse_dev_release,
1233 .fasync = fuse_dev_fasync, 1628 .fasync = fuse_dev_fasync,
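Note: the fuse device now supports splice() in both directions through the reworked fuse_copy_state, which walks either an iovec or a pipe_buffer array, and with SPLICE_F_MOVE whole pages can be stolen rather than copied. A hedged sketch of the userspace side, moving one raw request from /dev/fuse into a pipe (error handling abbreviated; fuse_fd is an open /dev/fuse descriptor, pfd comes from pipe(2)):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static ssize_t demo_read_request(int fuse_fd, int pfd[2],
				 char *buf, size_t bufsize)
{
	/* move the request into the pipe without a userspace copy */
	ssize_t n = splice(fuse_fd, NULL, pfd[1], NULL, bufsize,
			   SPLICE_F_MOVE);
	if (n < 0)
		return n;
	/* drain the pipe into a plain buffer for parsing */
	return read(pfd[0], buf, (size_t)n);
}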
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 4787ae6c5c1c..3cdc5f78a406 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1156,10 +1156,9 @@ static int fuse_dir_release(struct inode *inode, struct file *file)
1156 return 0; 1156 return 0;
1157} 1157}
1158 1158
1159static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync) 1159static int fuse_dir_fsync(struct file *file, int datasync)
1160{ 1160{
1161 /* nfsd can call this with no file */ 1161 return fuse_fsync_common(file, datasync, 1);
1162 return file ? fuse_fsync_common(file, de, datasync, 1) : 0;
1163} 1162}
1164 1163
1165static bool update_mtime(unsigned ivalid) 1164static bool update_mtime(unsigned ivalid)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a9f5e137f1d3..ada0adeb3bb5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -351,10 +351,9 @@ static void fuse_sync_writes(struct inode *inode)
351 fuse_release_nowrite(inode); 351 fuse_release_nowrite(inode);
352} 352}
353 353
354int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, 354int fuse_fsync_common(struct file *file, int datasync, int isdir)
355 int isdir)
356{ 355{
357 struct inode *inode = de->d_inode; 356 struct inode *inode = file->f_mapping->host;
358 struct fuse_conn *fc = get_fuse_conn(inode); 357 struct fuse_conn *fc = get_fuse_conn(inode);
359 struct fuse_file *ff = file->private_data; 358 struct fuse_file *ff = file->private_data;
360 struct fuse_req *req; 359 struct fuse_req *req;
@@ -403,9 +402,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
403 return err; 402 return err;
404} 403}
405 404
406static int fuse_fsync(struct file *file, struct dentry *de, int datasync) 405static int fuse_fsync(struct file *file, int datasync)
407{ 406{
408 return fuse_fsync_common(file, de, datasync, 0); 407 return fuse_fsync_common(file, datasync, 0);
409} 408}
410 409
411void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, 410void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
@@ -517,17 +516,26 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
517 int i; 516 int i;
518 size_t count = req->misc.read.in.size; 517 size_t count = req->misc.read.in.size;
519 size_t num_read = req->out.args[0].size; 518 size_t num_read = req->out.args[0].size;
520 struct inode *inode = req->pages[0]->mapping->host; 519 struct address_space *mapping = NULL;
521 520
522 /* 521 for (i = 0; mapping == NULL && i < req->num_pages; i++)
523 * Short read means EOF. If file size is larger, truncate it 522 mapping = req->pages[i]->mapping;
524 */
525 if (!req->out.h.error && num_read < count) {
526 loff_t pos = page_offset(req->pages[0]) + num_read;
527 fuse_read_update_size(inode, pos, req->misc.read.attr_ver);
528 }
529 523
530 fuse_invalidate_attr(inode); /* atime changed */ 524 if (mapping) {
525 struct inode *inode = mapping->host;
526
527 /*
528 * Short read means EOF. If file size is larger, truncate it
529 */
530 if (!req->out.h.error && num_read < count) {
531 loff_t pos;
532
533 pos = page_offset(req->pages[0]) + num_read;
534 fuse_read_update_size(inode, pos,
535 req->misc.read.attr_ver);
536 }
537 fuse_invalidate_attr(inode); /* atime changed */
538 }
531 539
532 for (i = 0; i < req->num_pages; i++) { 540 for (i = 0; i < req->num_pages; i++) {
533 struct page *page = req->pages[i]; 541 struct page *page = req->pages[i];
@@ -536,6 +544,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
536 else 544 else
537 SetPageError(page); 545 SetPageError(page);
538 unlock_page(page); 546 unlock_page(page);
547 page_cache_release(page);
539 } 548 }
540 if (req->ff) 549 if (req->ff)
541 fuse_file_put(req->ff); 550 fuse_file_put(req->ff);
@@ -550,6 +559,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file)
550 559
551 req->out.argpages = 1; 560 req->out.argpages = 1;
552 req->out.page_zeroing = 1; 561 req->out.page_zeroing = 1;
562 req->out.page_replace = 1;
553 fuse_read_fill(req, file, pos, count, FUSE_READ); 563 fuse_read_fill(req, file, pos, count, FUSE_READ);
554 req->misc.read.attr_ver = fuse_get_attr_version(fc); 564 req->misc.read.attr_ver = fuse_get_attr_version(fc);
555 if (fc->async_read) { 565 if (fc->async_read) {
@@ -589,6 +599,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
589 return PTR_ERR(req); 599 return PTR_ERR(req);
590 } 600 }
591 } 601 }
602 page_cache_get(page);
592 req->pages[req->num_pages] = page; 603 req->pages[req->num_pages] = page;
593 req->num_pages++; 604 req->num_pages++;
594 return 0; 605 return 0;
@@ -994,10 +1005,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
994 nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); 1005 nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
995 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; 1006 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
996 npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); 1007 npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
997 down_read(&current->mm->mmap_sem); 1008 npages = get_user_pages_fast(user_addr, npages, !write, req->pages);
998 npages = get_user_pages(current, current->mm, user_addr, npages, !write,
999 0, req->pages, NULL);
1000 up_read(&current->mm->mmap_sem);
1001 if (npages < 0) 1009 if (npages < 0)
1002 return npages; 1010 return npages;
1003 1011
@@ -1580,9 +1588,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1580 while (iov_iter_count(&ii)) { 1588 while (iov_iter_count(&ii)) {
1581 struct page *page = pages[page_idx++]; 1589 struct page *page = pages[page_idx++];
1582 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); 1590 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
1583 void *kaddr, *map; 1591 void *kaddr;
1584 1592
1585 kaddr = map = kmap(page); 1593 kaddr = kmap(page);
1586 1594
1587 while (todo) { 1595 while (todo) {
1588 char __user *uaddr = ii.iov->iov_base + ii.iov_offset; 1596 char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
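Note: fuse's direct-IO path also swaps the open-coded mmap_sem locking around get_user_pages() for get_user_pages_fast(), which handles the locking internally. The transformation in isolation:

	/* before: caller manages mmap_sem */
	down_read(&current->mm->mmap_sem);
	npages = get_user_pages(current, current->mm, user_addr, npages,
				!write, 0, req->pages, NULL);
	up_read(&current->mm->mmap_sem);

	/* after: one call, locking (and a lockless fast path) inside */
	npages = get_user_pages_fast(user_addr, npages, !write, req->pages);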
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 01cc462ff45d..8f309f04064e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -177,6 +177,9 @@ struct fuse_out {
177 /** Zero partially or not copied pages */ 177 /** Zero partially or not copied pages */
178 unsigned page_zeroing:1; 178 unsigned page_zeroing:1;
179 179
180 /** Pages may be replaced with new ones */
181 unsigned page_replace:1;
182
180 /** Number or arguments */ 183 /** Number or arguments */
181 unsigned numargs; 184 unsigned numargs;
182 185
@@ -568,8 +571,7 @@ void fuse_release_common(struct file *file, int opcode);
568/** 571/**
569 * Send FSYNC or FSYNCDIR request 572 * Send FSYNC or FSYNCDIR request
570 */ 573 */
571int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, 574int fuse_fsync_common(struct file *file, int datasync, int isdir);
572 int isdir);
573 575
574/** 576/**
575 * Notify poll wakeup 577 * Notify poll wakeup
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index a739a0a48067..9f8b52500d63 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -700,8 +700,14 @@ out:
700 return 0; 700 return 0;
701 701
702 page_cache_release(page); 702 page_cache_release(page);
703
704 /*
705 * XXX(hch): the call below should probably be replaced with
706 * a call to the gfs2-specific truncate blocks helper to actually
707 * release disk blocks..
708 */
703 if (pos + len > ip->i_inode.i_size) 709 if (pos + len > ip->i_inode.i_size)
704 vmtruncate(&ip->i_inode, ip->i_inode.i_size); 710 simple_setsize(&ip->i_inode, ip->i_inode.i_size);
705out_endtrans: 711out_endtrans:
706 gfs2_trans_end(sdp); 712 gfs2_trans_end(sdp);
707out_trans_fail: 713out_trans_fail:
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index b20bfcc9fa2d..ed9a94f0ef15 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -554,9 +554,9 @@ static int gfs2_close(struct inode *inode, struct file *file)
554 * Returns: errno 554 * Returns: errno
555 */ 555 */
556 556
557static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) 557static int gfs2_fsync(struct file *file, int datasync)
558{ 558{
559 struct inode *inode = dentry->d_inode; 559 struct inode *inode = file->f_mapping->host;
560 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); 560 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
561 int ret = 0; 561 int ret = 0;
562 562
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 4e64352d49de..98cdd05f3316 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1071,6 +1071,9 @@ int gfs2_permission(struct inode *inode, int mask)
1071 return error; 1071 return error;
1072} 1072}
1073 1073
1074/*
1075 * XXX: should be changed to have proper ordering by opencoding simple_setsize
1076 */
1074static int setattr_size(struct inode *inode, struct iattr *attr) 1077static int setattr_size(struct inode *inode, struct iattr *attr)
1075{ 1078{
1076 struct gfs2_inode *ip = GFS2_I(inode); 1079 struct gfs2_inode *ip = GFS2_I(inode);
@@ -1081,7 +1084,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
1081 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); 1084 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
1082 if (error) 1085 if (error)
1083 return error; 1086 return error;
1084 error = vmtruncate(inode, attr->ia_size); 1087 error = simple_setsize(inode, attr->ia_size);
1085 gfs2_trans_end(sdp); 1088 gfs2_trans_end(sdp);
1086 if (error) 1089 if (error)
1087 return error; 1090 return error;
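Note: gfs2 replaces vmtruncate() with simple_setsize(), which updates i_size and truncates the page cache but, unlike vmtruncate(), does not call back into ->truncate(); as the XXX comments say, block freeing now has to be ordered explicitly by the filesystem. A sketch of the intended shape (assumption: the block-release step is filesystem-specific):

	error = simple_setsize(inode, attr->ia_size);
	if (error)
		return error;
	/* on-disk block release would follow here, under the
	 * filesystem's own transaction and locking rules */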
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 3a029d8f4cf1..87ac1891a185 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -411,9 +411,9 @@ int hostfs_file_open(struct inode *ino, struct file *file)
411 return 0; 411 return 0;
412} 412}
413 413
414int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync) 414int hostfs_fsync(struct file *file, int datasync)
415{ 415{
416 return fsync_file(HOSTFS_I(dentry->d_inode)->fd, datasync); 416 return fsync_file(HOSTFS_I(file->f_mapping->host)->fd, datasync);
417} 417}
418 418
419static const struct file_operations hostfs_file_fops = { 419static const struct file_operations hostfs_file_fops = {
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 3efabff00367..a9ae9bfa752f 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -19,9 +19,9 @@ static int hpfs_file_release(struct inode *inode, struct file *file)
19 return 0; 19 return 0;
20} 20}
21 21
22int hpfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) 22int hpfs_file_fsync(struct file *file, int datasync)
23{ 23{
24 /*return file_fsync(file, dentry);*/ 24 /*return file_fsync(file, datasync);*/
25 return 0; /* Don't fsync :-) */ 25 return 0; /* Don't fsync :-) */
26} 26}
27 27
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 97bf738cd5d6..75f9d4324851 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -268,7 +268,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *,
268 268
269/* file.c */ 269/* file.c */
270 270
271int hpfs_file_fsync(struct file *, struct dentry *, int); 271int hpfs_file_fsync(struct file *, int);
272extern const struct file_operations hpfs_file_ops; 272extern const struct file_operations hpfs_file_ops;
273extern const struct inode_operations hpfs_file_iops; 273extern const struct inode_operations hpfs_file_iops;
274extern const struct address_space_operations hpfs_aops; 274extern const struct address_space_operations hpfs_aops;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 2e4dfa8593da..826c3f9d29ac 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -587,7 +587,7 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
587 return err; 587 return err;
588} 588}
589 589
590static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync) 590static int hppfs_fsync(struct file *file, int datasync)
591{ 591{
592 return 0; 592 return 0;
593} 593}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a0bbd3d1b41a..a4e9a7ec3691 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -688,7 +688,7 @@ static void init_once(void *foo)
688const struct file_operations hugetlbfs_file_operations = { 688const struct file_operations hugetlbfs_file_operations = {
689 .read = hugetlbfs_read, 689 .read = hugetlbfs_read,
690 .mmap = hugetlbfs_file_mmap, 690 .mmap = hugetlbfs_file_mmap,
691 .fsync = simple_sync_file, 691 .fsync = noop_fsync,
692 .get_unmapped_area = hugetlb_get_unmapped_area, 692 .get_unmapped_area = hugetlb_get_unmapped_area,
693}; 693};
694 694
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index b9ab69b3a482..e0aca9a0ac68 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -272,6 +272,7 @@ static int isofs_readdir(struct file *filp,
272 272
273const struct file_operations isofs_dir_operations = 273const struct file_operations isofs_dir_operations =
274{ 274{
275 .llseek = generic_file_llseek,
275 .read = generic_read_dir, 276 .read = generic_read_dir,
276 .readdir = isofs_readdir, 277 .readdir = isofs_readdir,
277}; 278};
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index bfc70f57900f..e214d68620ac 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1311,7 +1311,6 @@ int jbd2_journal_stop(handle_t *handle)
1311 if (handle->h_sync) 1311 if (handle->h_sync)
1312 transaction->t_synchronous_commit = 1; 1312 transaction->t_synchronous_commit = 1;
1313 current->journal_info = NULL; 1313 current->journal_info = NULL;
1314 spin_lock(&journal->j_state_lock);
1315 spin_lock(&transaction->t_handle_lock); 1314 spin_lock(&transaction->t_handle_lock);
1316 transaction->t_outstanding_credits -= handle->h_buffer_credits; 1315 transaction->t_outstanding_credits -= handle->h_buffer_credits;
1317 transaction->t_updates--; 1316 transaction->t_updates--;
@@ -1340,8 +1339,7 @@ int jbd2_journal_stop(handle_t *handle)
1340 jbd_debug(2, "transaction too old, requesting commit for " 1339 jbd_debug(2, "transaction too old, requesting commit for "
1341 "handle %p\n", handle); 1340 "handle %p\n", handle);
1342 /* This is non-blocking */ 1341 /* This is non-blocking */
1343 __jbd2_log_start_commit(journal, transaction->t_tid); 1342 jbd2_log_start_commit(journal, transaction->t_tid);
1344 spin_unlock(&journal->j_state_lock);
1345 1343
1346 /* 1344 /*
1347 * Special case: JBD2_SYNC synchronous updates require us 1345 * Special case: JBD2_SYNC synchronous updates require us
@@ -1351,7 +1349,6 @@ int jbd2_journal_stop(handle_t *handle)
1351 err = jbd2_log_wait_commit(journal, tid); 1349 err = jbd2_log_wait_commit(journal, tid);
1352 } else { 1350 } else {
1353 spin_unlock(&transaction->t_handle_lock); 1351 spin_unlock(&transaction->t_handle_lock);
1354 spin_unlock(&journal->j_state_lock);
1355 } 1352 }
1356 1353
1357 lock_map_release(&handle->h_lockdep_map); 1354 lock_map_release(&handle->h_lockdep_map);
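The jbd2 hunk drops the explicit j_state_lock round-trip from jbd2_journal_stop() by calling the locked wrapper jbd2_log_start_commit() rather than the __-prefixed variant. A sketch of that common kernel convention, with example_ names standing in for the jbd2 pair (the real functions also check whether a commit for that tid is already requested):

#include <linux/jbd2.h>

/* double-underscore variant: caller must hold journal->j_state_lock */
static int __example_log_start_commit(journal_t *journal, tid_t tid)
{
	journal->j_commit_request = tid;	/* ask for a commit of tid */
	wake_up(&journal->j_wait_commit);	/* kick the commit thread */
	return 1;
}

/* plain variant: self-locking, safe for lock-free callers such as the
 * reworked jbd2_journal_stop() above */
static int example_log_start_commit(journal_t *journal, tid_t tid)
{
	int ret;

	spin_lock(&journal->j_state_lock);
	ret = __example_log_start_commit(journal, tid);
	spin_unlock(&journal->j_state_lock);
	return ret;
}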
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index a33aab6b5e68..54a92fd02bbd 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -234,8 +234,9 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
234 if (inode->i_mode != mode) { 234 if (inode->i_mode != mode) {
235 struct iattr attr; 235 struct iattr attr;
236 236
237 attr.ia_valid = ATTR_MODE; 237 attr.ia_valid = ATTR_MODE | ATTR_CTIME;
238 attr.ia_mode = mode; 238 attr.ia_mode = mode;
239 attr.ia_ctime = CURRENT_TIME_SEC;
239 rc = jffs2_do_setattr(inode, &attr); 240 rc = jffs2_do_setattr(inode, &attr);
240 if (rc < 0) 241 if (rc < 0)
241 return rc; 242 return rc;
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 7aa4417e085f..166062a68230 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -222,15 +222,18 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
222 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime)); 222 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime));
223 223
224 jffs2_free_raw_inode(ri); 224 jffs2_free_raw_inode(ri);
225 d_instantiate(dentry, inode);
226 225
227 D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n", 226 D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n",
228 inode->i_ino, inode->i_mode, inode->i_nlink, 227 inode->i_ino, inode->i_mode, inode->i_nlink,
229 f->inocache->pino_nlink, inode->i_mapping->nrpages)); 228 f->inocache->pino_nlink, inode->i_mapping->nrpages));
229
230 d_instantiate(dentry, inode);
231 unlock_new_inode(inode);
230 return 0; 232 return 0;
231 233
232 fail: 234 fail:
233 make_bad_inode(inode); 235 make_bad_inode(inode);
236 unlock_new_inode(inode);
234 iput(inode); 237 iput(inode);
235 jffs2_free_raw_inode(ri); 238 jffs2_free_raw_inode(ri);
236 return ret; 239 return ret;
@@ -360,8 +363,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
360 /* Eeek. Wave bye bye */ 363 /* Eeek. Wave bye bye */
361 mutex_unlock(&f->sem); 364 mutex_unlock(&f->sem);
362 jffs2_complete_reservation(c); 365 jffs2_complete_reservation(c);
363 jffs2_clear_inode(inode); 366 ret = PTR_ERR(fn);
364 return PTR_ERR(fn); 367 goto fail;
365 } 368 }
366 369
367 /* We use f->target field to store the target path. */ 370 /* We use f->target field to store the target path. */
@@ -370,8 +373,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
370 printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1); 373 printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1);
371 mutex_unlock(&f->sem); 374 mutex_unlock(&f->sem);
372 jffs2_complete_reservation(c); 375 jffs2_complete_reservation(c);
373 jffs2_clear_inode(inode); 376 ret = -ENOMEM;
374 return -ENOMEM; 377 goto fail;
375 } 378 }
376 379
377 memcpy(f->target, target, targetlen + 1); 380 memcpy(f->target, target, targetlen + 1);
@@ -386,30 +389,24 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
386 jffs2_complete_reservation(c); 389 jffs2_complete_reservation(c);
387 390
388 ret = jffs2_init_security(inode, dir_i); 391 ret = jffs2_init_security(inode, dir_i);
389 if (ret) { 392 if (ret)
390 jffs2_clear_inode(inode); 393 goto fail;
391 return ret; 394
392 }
393 ret = jffs2_init_acl_post(inode); 395 ret = jffs2_init_acl_post(inode);
394 if (ret) { 396 if (ret)
395 jffs2_clear_inode(inode); 397 goto fail;
396 return ret;
397 }
398 398
399 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 399 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
400 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 400 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
401 if (ret) { 401 if (ret)
402 /* Eep. */ 402 goto fail;
403 jffs2_clear_inode(inode);
404 return ret;
405 }
406 403
407 rd = jffs2_alloc_raw_dirent(); 404 rd = jffs2_alloc_raw_dirent();
408 if (!rd) { 405 if (!rd) {
409 /* Argh. Now we treat it like a normal delete */ 406 /* Argh. Now we treat it like a normal delete */
410 jffs2_complete_reservation(c); 407 jffs2_complete_reservation(c);
411 jffs2_clear_inode(inode); 408 ret = -ENOMEM;
412 return -ENOMEM; 409 goto fail;
413 } 410 }
414 411
415 dir_f = JFFS2_INODE_INFO(dir_i); 412 dir_f = JFFS2_INODE_INFO(dir_i);
@@ -437,8 +434,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
437 jffs2_complete_reservation(c); 434 jffs2_complete_reservation(c);
438 jffs2_free_raw_dirent(rd); 435 jffs2_free_raw_dirent(rd);
439 mutex_unlock(&dir_f->sem); 436 mutex_unlock(&dir_f->sem);
440 jffs2_clear_inode(inode); 437 ret = PTR_ERR(fd);
441 return PTR_ERR(fd); 438 goto fail;
442 } 439 }
443 440
444 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); 441 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -453,7 +450,14 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
453 jffs2_complete_reservation(c); 450 jffs2_complete_reservation(c);
454 451
455 d_instantiate(dentry, inode); 452 d_instantiate(dentry, inode);
453 unlock_new_inode(inode);
456 return 0; 454 return 0;
455
456 fail:
457 make_bad_inode(inode);
458 unlock_new_inode(inode);
459 iput(inode);
460 return ret;
457} 461}
458 462
459 463
@@ -519,8 +523,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
519 /* Eeek. Wave bye bye */ 523 /* Eeek. Wave bye bye */
520 mutex_unlock(&f->sem); 524 mutex_unlock(&f->sem);
521 jffs2_complete_reservation(c); 525 jffs2_complete_reservation(c);
522 jffs2_clear_inode(inode); 526 ret = PTR_ERR(fn);
523 return PTR_ERR(fn); 527 goto fail;
524 } 528 }
525 /* No data here. Only a metadata node, which will be 529 /* No data here. Only a metadata node, which will be
526 obsoleted by the first data write 530 obsoleted by the first data write
@@ -531,30 +535,24 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
531 jffs2_complete_reservation(c); 535 jffs2_complete_reservation(c);
532 536
533 ret = jffs2_init_security(inode, dir_i); 537 ret = jffs2_init_security(inode, dir_i);
534 if (ret) { 538 if (ret)
535 jffs2_clear_inode(inode); 539 goto fail;
536 return ret; 540
537 }
538 ret = jffs2_init_acl_post(inode); 541 ret = jffs2_init_acl_post(inode);
539 if (ret) { 542 if (ret)
540 jffs2_clear_inode(inode); 543 goto fail;
541 return ret;
542 }
543 544
544 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 545 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
545 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 546 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
546 if (ret) { 547 if (ret)
547 /* Eep. */ 548 goto fail;
548 jffs2_clear_inode(inode);
549 return ret;
550 }
551 549
552 rd = jffs2_alloc_raw_dirent(); 550 rd = jffs2_alloc_raw_dirent();
553 if (!rd) { 551 if (!rd) {
554 /* Argh. Now we treat it like a normal delete */ 552 /* Argh. Now we treat it like a normal delete */
555 jffs2_complete_reservation(c); 553 jffs2_complete_reservation(c);
556 jffs2_clear_inode(inode); 554 ret = -ENOMEM;
557 return -ENOMEM; 555 goto fail;
558 } 556 }
559 557
560 dir_f = JFFS2_INODE_INFO(dir_i); 558 dir_f = JFFS2_INODE_INFO(dir_i);
@@ -582,8 +580,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
582 jffs2_complete_reservation(c); 580 jffs2_complete_reservation(c);
583 jffs2_free_raw_dirent(rd); 581 jffs2_free_raw_dirent(rd);
584 mutex_unlock(&dir_f->sem); 582 mutex_unlock(&dir_f->sem);
585 jffs2_clear_inode(inode); 583 ret = PTR_ERR(fd);
586 return PTR_ERR(fd); 584 goto fail;
587 } 585 }
588 586
589 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); 587 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -599,7 +597,14 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
599 jffs2_complete_reservation(c); 597 jffs2_complete_reservation(c);
600 598
601 d_instantiate(dentry, inode); 599 d_instantiate(dentry, inode);
600 unlock_new_inode(inode);
602 return 0; 601 return 0;
602
603 fail:
604 make_bad_inode(inode);
605 unlock_new_inode(inode);
606 iput(inode);
607 return ret;
603} 608}
604 609
605static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) 610static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
@@ -693,8 +698,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
693 /* Eeek. Wave bye bye */ 698 /* Eeek. Wave bye bye */
694 mutex_unlock(&f->sem); 699 mutex_unlock(&f->sem);
695 jffs2_complete_reservation(c); 700 jffs2_complete_reservation(c);
696 jffs2_clear_inode(inode); 701 ret = PTR_ERR(fn);
697 return PTR_ERR(fn); 702 goto fail;
698 } 703 }
699 /* No data here. Only a metadata node, which will be 704 /* No data here. Only a metadata node, which will be
700 obsoleted by the first data write 705 obsoleted by the first data write
@@ -705,30 +710,24 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
705 jffs2_complete_reservation(c); 710 jffs2_complete_reservation(c);
706 711
707 ret = jffs2_init_security(inode, dir_i); 712 ret = jffs2_init_security(inode, dir_i);
708 if (ret) { 713 if (ret)
709 jffs2_clear_inode(inode); 714 goto fail;
710 return ret; 715
711 }
712 ret = jffs2_init_acl_post(inode); 716 ret = jffs2_init_acl_post(inode);
713 if (ret) { 717 if (ret)
714 jffs2_clear_inode(inode); 718 goto fail;
715 return ret;
716 }
717 719
718 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 720 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
719 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 721 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
720 if (ret) { 722 if (ret)
721 /* Eep. */ 723 goto fail;
722 jffs2_clear_inode(inode);
723 return ret;
724 }
725 724
726 rd = jffs2_alloc_raw_dirent(); 725 rd = jffs2_alloc_raw_dirent();
727 if (!rd) { 726 if (!rd) {
728 /* Argh. Now we treat it like a normal delete */ 727 /* Argh. Now we treat it like a normal delete */
729 jffs2_complete_reservation(c); 728 jffs2_complete_reservation(c);
730 jffs2_clear_inode(inode); 729 ret = -ENOMEM;
731 return -ENOMEM; 730 goto fail;
732 } 731 }
733 732
734 dir_f = JFFS2_INODE_INFO(dir_i); 733 dir_f = JFFS2_INODE_INFO(dir_i);
@@ -759,8 +758,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
759 jffs2_complete_reservation(c); 758 jffs2_complete_reservation(c);
760 jffs2_free_raw_dirent(rd); 759 jffs2_free_raw_dirent(rd);
761 mutex_unlock(&dir_f->sem); 760 mutex_unlock(&dir_f->sem);
762 jffs2_clear_inode(inode); 761 ret = PTR_ERR(fd);
763 return PTR_ERR(fd); 762 goto fail;
764 } 763 }
765 764
766 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); 765 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -775,8 +774,14 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
775 jffs2_complete_reservation(c); 774 jffs2_complete_reservation(c);
776 775
777 d_instantiate(dentry, inode); 776 d_instantiate(dentry, inode);
778 777 unlock_new_inode(inode);
779 return 0; 778 return 0;
779
780 fail:
781 make_bad_inode(inode);
782 unlock_new_inode(inode);
783 iput(inode);
784 return ret;
780} 785}
781 786
782static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, 787static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
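All four jffs2 creation paths (create, symlink, mkdir, mknod) converge on the same shape in these hunks: the scattered jffs2_clear_inode() calls become a single fail: label, and d_instantiate() moves to the end so the dentry is only published once the inode is complete, paired with the unlock_new_inode() that the insert_inode_locked() change in fs.c below requires. A condensed sketch of the pattern with invented names:

#include <linux/fs.h>

static int examplefs_setup(struct inode *inode);	/* hypothetical step */

static int examplefs_create_tail(struct inode *dir, struct dentry *dentry,
				 struct inode *inode)
{
	int ret;

	ret = examplefs_setup(inode);		/* any post-allocation work */
	if (ret)
		goto fail;

	d_instantiate(dentry, inode);		/* publish only when complete */
	unlock_new_inode(inode);		/* pairs with insert_inode_locked() */
	return 0;

 fail:
	make_bad_inode(inode);			/* concurrent iget sees a dud */
	unlock_new_inode(inode);
	iput(inode);				/* final reference drop */
	return ret;
}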
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index e7291c161a19..813497024437 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -26,9 +26,9 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
26 struct page **pagep, void **fsdata); 26 struct page **pagep, void **fsdata);
27static int jffs2_readpage (struct file *filp, struct page *pg); 27static int jffs2_readpage (struct file *filp, struct page *pg);
28 28
29int jffs2_fsync(struct file *filp, struct dentry *dentry, int datasync) 29int jffs2_fsync(struct file *filp, int datasync)
30{ 30{
31 struct inode *inode = dentry->d_inode; 31 struct inode *inode = filp->f_mapping->host;
32 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); 32 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
33 33
34 /* Trigger GC to flush any pending writes for this inode */ 34 /* Trigger GC to flush any pending writes for this inode */
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 86e0821fc989..459d39d1ea0b 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -169,13 +169,13 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
169 mutex_unlock(&f->sem); 169 mutex_unlock(&f->sem);
170 jffs2_complete_reservation(c); 170 jffs2_complete_reservation(c);
171 171
172 /* We have to do the vmtruncate() without f->sem held, since 172 /* We have to do the simple_setsize() without f->sem held, since
173 some pages may be locked and waiting for it in readpage(). 173 some pages may be locked and waiting for it in readpage().
174 We are protected from a simultaneous write() extending i_size 174 We are protected from a simultaneous write() extending i_size
175 back past iattr->ia_size, because do_truncate() holds the 175 back past iattr->ia_size, because do_truncate() holds the
176 generic inode semaphore. */ 176 generic inode semaphore. */
177 if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) { 177 if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) {
178 vmtruncate(inode, iattr->ia_size); 178 simple_setsize(inode, iattr->ia_size);
179 inode->i_blocks = (inode->i_size + 511) >> 9; 179 inode->i_blocks = (inode->i_size + 511) >> 9;
180 } 180 }
181 181
@@ -465,7 +465,12 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
465 inode->i_blocks = 0; 465 inode->i_blocks = 0;
466 inode->i_size = 0; 466 inode->i_size = 0;
467 467
468 insert_inode_hash(inode); 468 if (insert_inode_locked(inode) < 0) {
469 make_bad_inode(inode);
470 unlock_new_inode(inode);
471 iput(inode);
472 return ERR_PTR(-EINVAL);
473 }
469 474
470 return inode; 475 return inode;
471} 476}
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 035a767f958b..4791aacf3084 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -158,7 +158,7 @@ extern const struct inode_operations jffs2_dir_inode_operations;
158extern const struct file_operations jffs2_file_operations; 158extern const struct file_operations jffs2_file_operations;
159extern const struct inode_operations jffs2_file_inode_operations; 159extern const struct inode_operations jffs2_file_inode_operations;
160extern const struct address_space_operations jffs2_file_address_operations; 160extern const struct address_space_operations jffs2_file_address_operations;
161int jffs2_fsync(struct file *, struct dentry *, int); 161int jffs2_fsync(struct file *, int);
162int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); 162int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg);
163 163
164/* ioctl.c */ 164/* ioctl.c */
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 85d9ec659225..127263cc8657 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -27,9 +27,9 @@
27#include "jfs_acl.h" 27#include "jfs_acl.h"
28#include "jfs_debug.h" 28#include "jfs_debug.h"
29 29
30int jfs_fsync(struct file *file, struct dentry *dentry, int datasync) 30int jfs_fsync(struct file *file, int datasync)
31{ 31{
32 struct inode *inode = dentry->d_inode; 32 struct inode *inode = file->f_mapping->host;
33 int rc = 0; 33 int rc = 0;
34 34
35 if (!(inode->i_state & I_DIRTY) || 35 if (!(inode->i_state & I_DIRTY) ||
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 9e6bda30a6e8..11042b1f44b5 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -21,7 +21,7 @@
21struct fid; 21struct fid;
22 22
23extern struct inode *ialloc(struct inode *, umode_t); 23extern struct inode *ialloc(struct inode *, umode_t);
24extern int jfs_fsync(struct file *, struct dentry *, int); 24extern int jfs_fsync(struct file *, int);
25extern long jfs_ioctl(struct file *, unsigned int, unsigned long); 25extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); 26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
27extern struct inode *jfs_iget(struct super_block *, unsigned long); 27extern struct inode *jfs_iget(struct super_block *, unsigned long);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index b66832ac33ac..b38f96bef829 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -179,6 +179,8 @@ static void jfs_put_super(struct super_block *sb)
179 179
180 jfs_info("In jfs_put_super"); 180 jfs_info("In jfs_put_super");
181 181
182 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
183
182 lock_kernel(); 184 lock_kernel();
183 185
184 rc = jfs_umount(sb); 186 rc = jfs_umount(sb);
@@ -396,10 +398,20 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
396 398
397 JFS_SBI(sb)->flag = flag; 399 JFS_SBI(sb)->flag = flag;
398 ret = jfs_mount_rw(sb, 1); 400 ret = jfs_mount_rw(sb, 1);
401
402 /* mark the fs r/w for quota activity */
403 sb->s_flags &= ~MS_RDONLY;
404
399 unlock_kernel(); 405 unlock_kernel();
406 dquot_resume(sb, -1);
400 return ret; 407 return ret;
401 } 408 }
402 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { 409 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
410 rc = dquot_suspend(sb, -1);
411 if (rc < 0) {
412 unlock_kernel();
413 return rc;
414 }
403 rc = jfs_umount_rw(sb); 415 rc = jfs_umount_rw(sb);
404 JFS_SBI(sb)->flag = flag; 416 JFS_SBI(sb)->flag = flag;
405 unlock_kernel(); 417 unlock_kernel();
@@ -469,6 +481,10 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
469 */ 481 */
470 sb->s_op = &jfs_super_operations; 482 sb->s_op = &jfs_super_operations;
471 sb->s_export_op = &jfs_export_operations; 483 sb->s_export_op = &jfs_export_operations;
484#ifdef CONFIG_QUOTA
485 sb->dq_op = &dquot_operations;
486 sb->s_qcop = &dquot_quotactl_ops;
487#endif
472 488
473 /* 489 /*
474 * Initialize direct-mapping inode/address-space 490 * Initialize direct-mapping inode/address-space
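jfs is converted here to the renamed generic quota API: it wires sb->dq_op and sb->s_qcop itself, disables quotas in put_super, and brackets remount with dquot_suspend()/dquot_resume(). A simplified sketch of that remount choreography; the lock_kernel() juggling visible above is omitted and the example_ helpers are invented:

#include <linux/fs.h>
#include <linux/quotaops.h>

static int example_mount_rw(struct super_block *sb);	/* hypothetical */
static int example_umount_rw(struct super_block *sb);	/* hypothetical */

static int example_remount(struct super_block *sb, int *flags)
{
	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
		int ret = example_mount_rw(sb);

		sb->s_flags &= ~MS_RDONLY;	/* quota writes need r/w */
		dquot_resume(sb, -1);		/* -1: all quota types */
		return ret;
	}
	if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
		int ret = dquot_suspend(sb, -1);/* before write access goes */

		if (ret < 0)
			return ret;
		return example_umount_rw(sb);
	}
	return 0;
}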
diff --git a/fs/libfs.c b/fs/libfs.c
index 232bea425b09..dcaf972cbf1b 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -8,6 +8,7 @@
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/vfs.h> 10#include <linux/vfs.h>
11#include <linux/quotaops.h>
11#include <linux/mutex.h> 12#include <linux/mutex.h>
12#include <linux/exportfs.h> 13#include <linux/exportfs.h>
13#include <linux/writeback.h> 14#include <linux/writeback.h>
@@ -58,11 +59,6 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na
58 return NULL; 59 return NULL;
59} 60}
60 61
61int simple_sync_file(struct file * file, struct dentry *dentry, int datasync)
62{
63 return 0;
64}
65
66int dcache_dir_open(struct inode *inode, struct file *file) 62int dcache_dir_open(struct inode *inode, struct file *file)
67{ 63{
68 static struct qstr cursor_name = {.len = 1, .name = "."}; 64 static struct qstr cursor_name = {.len = 1, .name = "."};
@@ -190,7 +186,7 @@ const struct file_operations simple_dir_operations = {
190 .llseek = dcache_dir_lseek, 186 .llseek = dcache_dir_lseek,
191 .read = generic_read_dir, 187 .read = generic_read_dir,
192 .readdir = dcache_readdir, 188 .readdir = dcache_readdir,
193 .fsync = simple_sync_file, 189 .fsync = noop_fsync,
194}; 190};
195 191
196const struct inode_operations simple_dir_inode_operations = { 192const struct inode_operations simple_dir_inode_operations = {
@@ -330,6 +326,81 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
330 return 0; 326 return 0;
331} 327}
332 328
329/**
330 * simple_setsize - handle core mm and vfs requirements for file size change
331 * @inode: inode
332 * @newsize: new file size
333 *
334 * Returns 0 on success, -error on failure.
335 *
336 * simple_setsize must be called with inode_mutex held.
337 *
338 * simple_setsize will check that the requested new size is OK (see
339 * inode_newsize_ok), and then will perform the necessary i_size update
340 * and pagecache truncation (if necessary). It will typically be called
341 * from the filesystem's setattr function when ATTR_SIZE is passed in.
342 *
343 * The inode itself must have correct permissions and attributes to allow
344 * i_size to be changed; this function then just checks that the new size
345 * requested is valid.
346 *
347 * In the case of simple in-memory filesystems with inodes stored solely
348 * in the inode cache, and file data in the pagecache, nothing more needs
349 * to be done to satisfy a truncate request. Filesystems with on-disk
350 * blocks, for example, will need to free them in the case of truncate; in
351 * that case it may be easier not to use simple_setsize (but each of its
352 * components will likely be required at some point to update pagecache
353 * and inode etc).
354 */
355int simple_setsize(struct inode *inode, loff_t newsize)
356{
357 loff_t oldsize;
358 int error;
359
360 error = inode_newsize_ok(inode, newsize);
361 if (error)
362 return error;
363
364 oldsize = inode->i_size;
365 i_size_write(inode, newsize);
366 truncate_pagecache(inode, oldsize, newsize);
367
368 return error;
369}
370EXPORT_SYMBOL(simple_setsize);
371
372/**
373 * simple_setattr - setattr for simple in-memory filesystem
374 * @dentry: dentry
375 * @iattr: iattr structure
376 *
377 * Returns 0 on success, -error on failure.
378 *
379 * simple_setattr implements setattr for an in-memory filesystem which
380 * does not store its own file data or metadata (e.g. uses the page cache
381 * and inode cache as its data store).
382 */
383int simple_setattr(struct dentry *dentry, struct iattr *iattr)
384{
385 struct inode *inode = dentry->d_inode;
386 int error;
387
388 error = inode_change_ok(inode, iattr);
389 if (error)
390 return error;
391
392 if (iattr->ia_valid & ATTR_SIZE) {
393 error = simple_setsize(inode, iattr->ia_size);
394 if (error)
395 return error;
396 }
397
398 generic_setattr(inode, iattr);
399
400 return error;
401}
402EXPORT_SYMBOL(simple_setattr);
403
333int simple_readpage(struct file *file, struct page *page) 404int simple_readpage(struct file *file, struct page *page)
334{ 405{
335 clear_highpage(page); 406 clear_highpage(page);
@@ -418,7 +489,8 @@ int simple_write_end(struct file *file, struct address_space *mapping,
418 * unique inode values later for this filesystem, then you must take care 489 * unique inode values later for this filesystem, then you must take care
419 * to pass it an appropriate max_reserved value to avoid collisions. 490 * to pass it an appropriate max_reserved value to avoid collisions.
420 */ 491 */
421int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files) 492int simple_fill_super(struct super_block *s, unsigned long magic,
493 struct tree_descr *files)
422{ 494{
423 struct inode *inode; 495 struct inode *inode;
424 struct dentry *root; 496 struct dentry *root;
@@ -851,13 +923,22 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
851} 923}
852EXPORT_SYMBOL_GPL(generic_fh_to_parent); 924EXPORT_SYMBOL_GPL(generic_fh_to_parent);
853 925
854int simple_fsync(struct file *file, struct dentry *dentry, int datasync) 926/**
927 * generic_file_fsync - generic fsync implementation for simple filesystems
928 * @file: file to synchronize
929 * @datasync: only synchronize essential metadata if true
930 *
931 * This is a generic implementation of the fsync method for simple
932 * filesystems which track all non-inode metadata in the buffers list
933 * hanging off the address_space structure.
934 */
935int generic_file_fsync(struct file *file, int datasync)
855{ 936{
856 struct writeback_control wbc = { 937 struct writeback_control wbc = {
857 .sync_mode = WB_SYNC_ALL, 938 .sync_mode = WB_SYNC_ALL,
858 .nr_to_write = 0, /* metadata-only; caller takes care of data */ 939 .nr_to_write = 0, /* metadata-only; caller takes care of data */
859 }; 940 };
860 struct inode *inode = dentry->d_inode; 941 struct inode *inode = file->f_mapping->host;
861 int err; 942 int err;
862 int ret; 943 int ret;
863 944
@@ -872,7 +953,15 @@ int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
872 ret = err; 953 ret = err;
873 return ret; 954 return ret;
874} 955}
875EXPORT_SYMBOL(simple_fsync); 956EXPORT_SYMBOL(generic_file_fsync);
957
958/*
959 * No-op implementation of ->fsync for in-memory filesystems.
960 */
961int noop_fsync(struct file *file, int datasync)
962{
963 return 0;
964}
876 965
877EXPORT_SYMBOL(dcache_dir_close); 966EXPORT_SYMBOL(dcache_dir_close);
878EXPORT_SYMBOL(dcache_dir_lseek); 967EXPORT_SYMBOL(dcache_dir_lseek);
@@ -895,7 +984,7 @@ EXPORT_SYMBOL(simple_release_fs);
895EXPORT_SYMBOL(simple_rename); 984EXPORT_SYMBOL(simple_rename);
896EXPORT_SYMBOL(simple_rmdir); 985EXPORT_SYMBOL(simple_rmdir);
897EXPORT_SYMBOL(simple_statfs); 986EXPORT_SYMBOL(simple_statfs);
898EXPORT_SYMBOL(simple_sync_file); 987EXPORT_SYMBOL(noop_fsync);
899EXPORT_SYMBOL(simple_unlink); 988EXPORT_SYMBOL(simple_unlink);
900EXPORT_SYMBOL(simple_read_from_buffer); 989EXPORT_SYMBOL(simple_read_from_buffer);
901EXPORT_SYMBOL(simple_write_to_buffer); 990EXPORT_SYMBOL(simple_write_to_buffer);
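Summarizing the libfs renames above for consumers: simple_fsync becomes generic_file_fsync (it writes inode metadata via the buffer list, so it suits block-backed filesystems), simple_sync_file becomes noop_fsync (for purely in-memory filesystems), and the new simple_setsize/simple_setattr pair covers size changes that free no on-disk blocks. A sketch of how a filesystem would wire them up; the ops tables are illustrative, not from this diff:

#include <linux/fs.h>

/* block-device backed: fsync must push inode metadata buffers */
static const struct file_operations example_disk_file_ops = {
	.read		= do_sync_read,
	.aio_read	= generic_file_aio_read,
	.mmap		= generic_file_mmap,
	.fsync		= generic_file_fsync,	/* was simple_fsync */
};

/* purely in-memory: nothing to write back on fsync */
static const struct file_operations example_ram_file_ops = {
	.read		= do_sync_read,
	.aio_read	= generic_file_aio_read,
	.mmap		= generic_file_mmap,
	.fsync		= noop_fsync,		/* was simple_sync_file */
};

/* truncate == i_size update + pagecache trim, no disk blocks to free */
static const struct inode_operations example_ram_inode_ops = {
	.setattr	= simple_setattr,
};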
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 0de524071870..abe1cafbd4c2 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -219,9 +219,9 @@ int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
219 } 219 }
220} 220}
221 221
222int logfs_fsync(struct file *file, struct dentry *dentry, int datasync) 222int logfs_fsync(struct file *file, int datasync)
223{ 223{
224 struct super_block *sb = dentry->d_inode->i_sb; 224 struct super_block *sb = file->f_mapping->host->i_sb;
225 225
226 logfs_write_anchor(sb); 226 logfs_write_anchor(sb);
227 return 0; 227 return 0;
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 1a9db84f8d8f..c838c4d72111 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -506,7 +506,7 @@ extern const struct address_space_operations logfs_reg_aops;
506int logfs_readpage(struct file *file, struct page *page); 506int logfs_readpage(struct file *file, struct page *page);
507int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, 507int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
508 unsigned long arg); 508 unsigned long arg);
509int logfs_fsync(struct file *file, struct dentry *dentry, int datasync); 509int logfs_fsync(struct file *file, int datasync);
510 510
511/* gc.c */ 511/* gc.c */
512u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec); 512u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 6198731d7fcd..1dbf921ca44b 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -22,7 +22,7 @@ const struct file_operations minix_dir_operations = {
22 .llseek = generic_file_llseek, 22 .llseek = generic_file_llseek,
23 .read = generic_read_dir, 23 .read = generic_read_dir,
24 .readdir = minix_readdir, 24 .readdir = minix_readdir,
25 .fsync = simple_fsync, 25 .fsync = generic_file_fsync,
26}; 26};
27 27
28static inline void dir_put_page(struct page *page) 28static inline void dir_put_page(struct page *page)
@@ -72,16 +72,9 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
72{ 72{
73 struct address_space *mapping = dir->i_mapping; 73 struct address_space *mapping = dir->i_mapping;
74 struct page *page = read_mapping_page(mapping, n, NULL); 74 struct page *page = read_mapping_page(mapping, n, NULL);
75 if (!IS_ERR(page)) { 75 if (!IS_ERR(page))
76 kmap(page); 76 kmap(page);
77 if (!PageUptodate(page))
78 goto fail;
79 }
80 return page; 77 return page;
81
82fail:
83 dir_put_page(page);
84 return ERR_PTR(-EIO);
85} 78}
86 79
87static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) 80static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
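The dir_get_page() simplification works because read_mapping_page() resolves to read_cache_page(), which already waits for the I/O and returns ERR_PTR(-EIO) itself if the page never became uptodate; the open-coded PageUptodate() check was redundant. The resulting idiom, sketched with an invented name:

#include <linux/pagemap.h>
#include <linux/highmem.h>

static struct page *example_get_page(struct address_space *mapping,
				     unsigned long n)
{
	/* returns ERR_PTR on read failure; no PageUptodate() check needed */
	struct page *page = read_mapping_page(mapping, n, NULL);

	if (!IS_ERR(page))
		kmap(page);	/* caller kunmap()s and releases the page */
	return page;
}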
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 3eec3e607a87..d5320ff23faf 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -19,7 +19,7 @@ const struct file_operations minix_file_operations = {
19 .write = do_sync_write, 19 .write = do_sync_write,
20 .aio_write = generic_file_aio_write, 20 .aio_write = generic_file_aio_write,
21 .mmap = generic_file_mmap, 21 .mmap = generic_file_mmap,
22 .fsync = simple_fsync, 22 .fsync = generic_file_fsync,
23 .splice_read = generic_file_splice_read, 23 .splice_read = generic_file_splice_read,
24}; 24};
25 25
diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c
index f23010969369..13487ad16894 100644
--- a/fs/minix/itree_v2.c
+++ b/fs/minix/itree_v2.c
@@ -20,6 +20,9 @@ static inline block_t *i_data(struct inode *inode)
20 return (block_t *)minix_i(inode)->u.i2_data; 20 return (block_t *)minix_i(inode)->u.i2_data;
21} 21}
22 22
23#define DIRCOUNT 7
24#define INDIRCOUNT(sb) (1 << ((sb)->s_blocksize_bits - 2))
25
23static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) 26static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
24{ 27{
25 int n = 0; 28 int n = 0;
@@ -34,21 +37,21 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
34 printk("MINIX-fs: block_to_path: " 37 printk("MINIX-fs: block_to_path: "
35 "block %ld too big on dev %s\n", 38 "block %ld too big on dev %s\n",
36 block, bdevname(sb->s_bdev, b)); 39 block, bdevname(sb->s_bdev, b));
37 } else if (block < 7) { 40 } else if (block < DIRCOUNT) {
38 offsets[n++] = block; 41 offsets[n++] = block;
39 } else if ((block -= 7) < 256) { 42 } else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) {
40 offsets[n++] = 7; 43 offsets[n++] = DIRCOUNT;
41 offsets[n++] = block; 44 offsets[n++] = block;
42 } else if ((block -= 256) < 256*256) { 45 } else if ((block -= INDIRCOUNT(sb)) < INDIRCOUNT(sb) * INDIRCOUNT(sb)) {
43 offsets[n++] = 8; 46 offsets[n++] = DIRCOUNT + 1;
44 offsets[n++] = block>>8; 47 offsets[n++] = block / INDIRCOUNT(sb);
45 offsets[n++] = block & 255; 48 offsets[n++] = block % INDIRCOUNT(sb);
46 } else { 49 } else {
47 block -= 256*256; 50 block -= INDIRCOUNT(sb) * INDIRCOUNT(sb);
48 offsets[n++] = 9; 51 offsets[n++] = DIRCOUNT + 2;
49 offsets[n++] = block>>16; 52 offsets[n++] = (block / INDIRCOUNT(sb)) / INDIRCOUNT(sb);
50 offsets[n++] = (block>>8) & 255; 53 offsets[n++] = (block / INDIRCOUNT(sb)) % INDIRCOUNT(sb);
51 offsets[n++] = block & 255; 54 offsets[n++] = block % INDIRCOUNT(sb);
52 } 55 }
53 return n; 56 return n;
54} 57}
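The minix change replaces the hard-coded 7/256 layout with constants derived from the block size: V2 block pointers are 4 bytes, so a block holds 1 << (s_blocksize_bits - 2) of them, which is 256 for the historical 1 KiB blocks and 512 for 2 KiB. A standalone, runnable rendition of the same arithmetic (plain C, not kernel code):

#include <stdio.h>

#define DIRCOUNT 7
#define INDIRCOUNT(bits) (1ul << ((bits) - 2))	/* 4-byte pointers per block */

static int block_to_path(long block, int offsets[4], int blocksize_bits)
{
	unsigned long per = INDIRCOUNT(blocksize_bits);
	int n = 0;

	if (block < DIRCOUNT) {				/* direct */
		offsets[n++] = block;
	} else if ((block -= DIRCOUNT) < per) {		/* single indirect */
		offsets[n++] = DIRCOUNT;
		offsets[n++] = block;
	} else if ((block -= per) < per * per) {	/* double indirect */
		offsets[n++] = DIRCOUNT + 1;
		offsets[n++] = block / per;
		offsets[n++] = block % per;
	} else {					/* triple indirect */
		block -= per * per;
		offsets[n++] = DIRCOUNT + 2;
		offsets[n++] = (block / per) / per;
		offsets[n++] = (block / per) % per;
		offsets[n++] = block % per;
	}
	return n;
}

int main(void)
{
	int off[4], n, i;

	/* block 268 with 1 KiB blocks: past 7 direct and 256 single-indirect
	 * slots, landing in the 6th double-indirect slot -> path "8 0 5" */
	n = block_to_path(268, off, 10);
	for (i = 0; i < n; i++)
		printf("%d ", off[i]);
	printf("\n");
	return 0;
}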
diff --git a/fs/namei.c b/fs/namei.c
index 48e1f60520ea..868d0cb9d473 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1621,6 +1621,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1621 case LAST_DOTDOT: 1621 case LAST_DOTDOT:
1622 follow_dotdot(nd); 1622 follow_dotdot(nd);
1623 dir = nd->path.dentry; 1623 dir = nd->path.dentry;
1624 case LAST_DOT:
1624 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) { 1625 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
1625 if (!dir->d_op->d_revalidate(dir, nd)) { 1626 if (!dir->d_op->d_revalidate(dir, nd)) {
1626 error = -ESTALE; 1627 error = -ESTALE;
@@ -1628,7 +1629,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1628 } 1629 }
1629 } 1630 }
1630 /* fallthrough */ 1631 /* fallthrough */
1631 case LAST_DOT:
1632 case LAST_ROOT: 1632 case LAST_ROOT:
1633 if (open_flag & O_CREAT) 1633 if (open_flag & O_CREAT)
1634 goto exit; 1634 goto exit;
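The namei.c hunk moves the LAST_DOT label so that a path ending in "." takes the FS_REVAL_DOT revalidation that previously only ".." received; both then fall through to the O_CREAT rejection shared with LAST_ROOT. The resulting control flow, condensed (variables as in do_last() above):

	switch (nd->last_type) {
	case LAST_DOTDOT:
		follow_dotdot(nd);
		dir = nd->path.dentry;
		/* fall through: "." is now revalidated like ".." */
	case LAST_DOT:
		if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
			if (!dir->d_op->d_revalidate(dir, nd)) {
				error = -ESTALE;
				goto exit;
			}
		}
		/* fall through */
	case LAST_ROOT:
		if (open_flag & O_CREAT)
			goto exit;	/* cannot O_CREAT ".", ".." or "/" */
		break;
	}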
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 92dde6f8d893..9578cbe0cd58 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -49,6 +49,7 @@ extern int ncp_symlink(struct inode *, struct dentry *, const char *);
49 49
50const struct file_operations ncp_dir_operations = 50const struct file_operations ncp_dir_operations =
51{ 51{
52 .llseek = generic_file_llseek,
52 .read = generic_read_dir, 53 .read = generic_read_dir,
53 .readdir = ncp_readdir, 54 .readdir = ncp_readdir,
54 .unlocked_ioctl = ncp_ioctl, 55 .unlocked_ioctl = ncp_ioctl,
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index b93870892892..3639cc5cbdae 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -22,7 +22,7 @@
22#include <linux/ncp_fs.h> 22#include <linux/ncp_fs.h>
23#include "ncplib_kernel.h" 23#include "ncplib_kernel.h"
24 24
25static int ncp_fsync(struct file *file, struct dentry *dentry, int datasync) 25static int ncp_fsync(struct file *file, int datasync)
26{ 26{
27 return 0; 27 return 0;
28} 28}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ee9a179ebdf3..782b431ef91c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -53,7 +53,7 @@ static int nfs_link(struct dentry *, struct inode *, struct dentry *);
53static int nfs_mknod(struct inode *, struct dentry *, int, dev_t); 53static int nfs_mknod(struct inode *, struct dentry *, int, dev_t);
54static int nfs_rename(struct inode *, struct dentry *, 54static int nfs_rename(struct inode *, struct dentry *,
55 struct inode *, struct dentry *); 55 struct inode *, struct dentry *);
56static int nfs_fsync_dir(struct file *, struct dentry *, int); 56static int nfs_fsync_dir(struct file *, int);
57static loff_t nfs_llseek_dir(struct file *, loff_t, int); 57static loff_t nfs_llseek_dir(struct file *, loff_t, int);
58 58
59const struct file_operations nfs_dir_operations = { 59const struct file_operations nfs_dir_operations = {
@@ -641,8 +641,10 @@ out:
641 * All directory operations under NFS are synchronous, so fsync() 641 * All directory operations under NFS are synchronous, so fsync()
642 * is a dummy operation. 642 * is a dummy operation.
643 */ 643 */
644static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) 644static int nfs_fsync_dir(struct file *filp, int datasync)
645{ 645{
646 struct dentry *dentry = filp->f_path.dentry;
647
646 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", 648 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
647 dentry->d_parent->d_name.name, dentry->d_name.name, 649 dentry->d_parent->d_name.name, dentry->d_name.name,
648 datasync); 650 datasync);
@@ -1741,6 +1743,7 @@ remove_lru_entry:
1741 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); 1743 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
1742 smp_mb__after_clear_bit(); 1744 smp_mb__after_clear_bit();
1743 } 1745 }
1746 spin_unlock(&inode->i_lock);
1744 } 1747 }
1745 spin_unlock(&nfs_access_lru_lock); 1748 spin_unlock(&nfs_access_lru_lock);
1746 nfs_access_free_list(&head); 1749 nfs_access_free_list(&head);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index cac96bcc91e4..36a5e74f51b4 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -53,7 +53,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
53static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, 53static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
54 unsigned long nr_segs, loff_t pos); 54 unsigned long nr_segs, loff_t pos);
55static int nfs_file_flush(struct file *, fl_owner_t id); 55static int nfs_file_flush(struct file *, fl_owner_t id);
56static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync); 56static int nfs_file_fsync(struct file *, int datasync);
57static int nfs_check_flags(int flags); 57static int nfs_check_flags(int flags);
58static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); 58static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
59static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); 59static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
@@ -322,8 +322,9 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
322 * whether any write errors occurred for this process. 322 * whether any write errors occurred for this process.
323 */ 323 */
324static int 324static int
325nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) 325nfs_file_fsync(struct file *file, int datasync)
326{ 326{
327 struct dentry *dentry = file->f_path.dentry;
327 struct nfs_open_context *ctx = nfs_file_open_context(file); 328 struct nfs_open_context *ctx = nfs_file_open_context(file);
328 struct inode *inode = dentry->d_inode; 329 struct inode *inode = dentry->d_inode;
329 330
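NFS shows the companion idiom to the file->f_mapping->host conversion: where the implementation still wants the dentry (here only for debug output), it is recovered from file->f_path.dentry rather than being passed in. A minimal sketch with invented names:

#include <linux/kernel.h>
#include <linux/fs.h>

static int example_sync_inode(struct inode *inode, int datasync);	/* hypothetical */

static int example_fsync(struct file *file, int datasync)
{
	struct dentry *dentry = file->f_path.dentry;	/* for naming only */

	pr_debug("fsync %s/%s datasync %d\n",
		 dentry->d_parent->d_name.name, dentry->d_name.name,
		 datasync);
	return example_sync_inode(dentry->d_inode, datasync);
}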
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3aea3ca98ab7..91679e2631ee 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1386,7 +1386,7 @@ static int nfs_commit_inode(struct inode *inode, int how)
1386 int res = 0; 1386 int res = 0;
1387 1387
1388 if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) 1388 if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
1389 goto out; 1389 goto out_mark_dirty;
1390 spin_lock(&inode->i_lock); 1390 spin_lock(&inode->i_lock);
1391 res = nfs_scan_commit(inode, &head, 0, 0); 1391 res = nfs_scan_commit(inode, &head, 0, 0);
1392 spin_unlock(&inode->i_lock); 1392 spin_unlock(&inode->i_lock);
@@ -1398,9 +1398,18 @@ static int nfs_commit_inode(struct inode *inode, int how)
1398 wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, 1398 wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
1399 nfs_wait_bit_killable, 1399 nfs_wait_bit_killable,
1400 TASK_KILLABLE); 1400 TASK_KILLABLE);
1401 else
1402 goto out_mark_dirty;
1401 } else 1403 } else
1402 nfs_commit_clear_lock(NFS_I(inode)); 1404 nfs_commit_clear_lock(NFS_I(inode));
1403out: 1405 return res;
1406 /* Note: If we exit without ensuring that the commit is complete,
1407 * we must mark the inode as dirty. Otherwise, future calls to
1408 * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
1409 * that the data is on the disk.
1410 */
1411out_mark_dirty:
1412 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1404 return res; 1413 return res;
1405} 1414}
1406 1415
@@ -1509,14 +1518,17 @@ int nfs_wb_page(struct inode *inode, struct page *page)
1509 }; 1518 };
1510 int ret; 1519 int ret;
1511 1520
1512 while(PagePrivate(page)) { 1521 for (;;) {
1513 wait_on_page_writeback(page); 1522 wait_on_page_writeback(page);
1514 if (clear_page_dirty_for_io(page)) { 1523 if (clear_page_dirty_for_io(page)) {
1515 ret = nfs_writepage_locked(page, &wbc); 1524 ret = nfs_writepage_locked(page, &wbc);
1516 if (ret < 0) 1525 if (ret < 0)
1517 goto out_error; 1526 goto out_error;
1527 continue;
1518 } 1528 }
1519 ret = sync_inode(inode, &wbc); 1529 if (!PagePrivate(page))
1530 break;
1531 ret = nfs_commit_inode(inode, FLUSH_SYNC);
1520 if (ret < 0) 1532 if (ret < 0)
1521 goto out_error; 1533 goto out_error;
1522 } 1534 }
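The nfs_commit_inode() rework enforces one invariant: any path that returns without having confirmed the commit must re-dirty the inode, otherwise a later sync_inode(WB_SYNC_ALL) would see a clean inode and wrongly report the data durable (the block comment above says as much). An illustrative skeleton of that shape, with example_ helpers standing in for the RPC machinery:

#include <linux/fs.h>

static int example_trylock_commit(struct inode *inode);	/* hypothetical */
static int example_send_commit(struct inode *inode);	/* hypothetical */
static void example_wait_commit(struct inode *inode);	/* hypothetical */

static int example_commit(struct inode *inode, int may_wait)
{
	int res = 0;

	if (!example_trylock_commit(inode))
		goto out_mark_dirty;		/* another committer active */

	res = example_send_commit(inode);
	if (res < 0)
		goto out_mark_dirty;		/* nothing sent, still dirty */
	if (may_wait) {
		example_wait_commit(inode);
		return res;			/* commit confirmed on disk */
	}
	/* async: commit in flight but unconfirmed */
out_mark_dirty:
	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
	return res;
}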
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 12f7109720c2..4a2734758778 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4122,8 +4122,8 @@ nfs4_state_shutdown(void)
4122 nfs4_lock_state(); 4122 nfs4_lock_state();
4123 nfs4_release_reclaim(); 4123 nfs4_release_reclaim();
4124 __nfs4_state_shutdown(); 4124 __nfs4_state_shutdown();
4125 nfsd4_destroy_callback_queue();
4126 nfs4_unlock_state(); 4125 nfs4_unlock_state();
4126 nfsd4_destroy_callback_queue();
4127} 4127}
4128 4128
4129/* 4129/*
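The nfs4state.c reordering follows a classic lock-ordering rule: a workqueue must not be flushed or destroyed while holding a lock that its work items may take, because destroying it waits for queued work to finish. Sketched generically with hypothetical names:

#include <linux/mutex.h>
#include <linux/workqueue.h>

static DEFINE_MUTEX(example_state_mutex);
static struct workqueue_struct *example_callback_wq;

static void example_shutdown(void)
{
	mutex_lock(&example_state_mutex);
	/* ... tear down state ... */
	mutex_unlock(&example_state_mutex);

	/*
	 * Destroy only after dropping the lock: destroy_workqueue()
	 * flushes pending callbacks, and those callbacks may themselves
	 * take example_state_mutex.
	 */
	destroy_workqueue(example_callback_wq);
}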
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ebbf3b6b2457..3c111120b619 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -443,8 +443,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
443 if (size_change) 443 if (size_change)
444 put_write_access(inode); 444 put_write_access(inode);
445 if (!err) 445 if (!err)
446 if (EX_ISSYNC(fhp->fh_export)) 446 commit_metadata(fhp);
447 write_inode_now(inode, 1);
448out: 447out:
449 return err; 448 return err;
450 449
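nfsd_setattr now defers to the commit_metadata() helper already used by the other nfsd write paths. From memory, so treat the body as approximate: the helper folds in the EX_ISSYNC() test and lets an export supply its own metadata-commit hook instead of a blanket write_inode_now():

#include <linux/fs.h>

static int example_commit_metadata(struct svc_fh *fhp)
{
	struct inode *inode = fhp->fh_dentry->d_inode;

	if (!EX_ISSYNC(fhp->fh_export))
		return 0;		/* async export: nothing to force */

	if (inode->i_sb->s_export_op->commit_metadata)
		return inode->i_sb->s_export_op->commit_metadata(inode);
	return write_inode_now(inode, 1);	/* generic fallback */
}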
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index af638d59e3bf..43c8c5b541fd 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -75,8 +75,6 @@ struct nilfs_btree_path {
75 75
76extern struct kmem_cache *nilfs_btree_path_cache; 76extern struct kmem_cache *nilfs_btree_path_cache;
77 77
78int nilfs_btree_path_cache_init(void);
79void nilfs_btree_path_cache_destroy(void);
80int nilfs_btree_init(struct nilfs_bmap *); 78int nilfs_btree_init(struct nilfs_bmap *);
81int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64, 79int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
82 const __u64 *, const __u64 *, int); 80 const __u64 *, const __u64 *, int);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 30292df443ce..c9a30d7ff6fc 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -27,7 +27,7 @@
27#include "nilfs.h" 27#include "nilfs.h"
28#include "segment.h" 28#include "segment.h"
29 29
30int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync) 30int nilfs_sync_file(struct file *file, int datasync)
31{ 31{
32 /* 32 /*
33 * Called from fsync() system call 33 * Called from fsync() system call
@@ -37,7 +37,7 @@ int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
37 * This function should be implemented when the writeback function 37 * This function should be implemented when the writeback function
38 * will be implemented. 38 * will be implemented.
39 */ 39 */
40 struct inode *inode = dentry->d_inode; 40 struct inode *inode = file->f_mapping->host;
41 int err; 41 int err;
42 42
43 if (!nilfs_inode_dirty(inode)) 43 if (!nilfs_inode_dirty(inode))
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 8723e5bfd071..47d6d7928122 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -228,7 +228,7 @@ extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
228 struct page *, struct inode *); 228 struct page *, struct inode *);
229 229
230/* file.c */ 230/* file.c */
231extern int nilfs_sync_file(struct file *, struct dentry *, int); 231extern int nilfs_sync_file(struct file *, int);
232 232
233/* ioctl.c */ 233/* ioctl.c */
234long nilfs_ioctl(struct file *, unsigned int, unsigned long); 234long nilfs_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index fdf1c3b6d673..85fbb66455e2 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -127,8 +127,6 @@ struct nilfs_segment_buffer {
127 127
128extern struct kmem_cache *nilfs_segbuf_cachep; 128extern struct kmem_cache *nilfs_segbuf_cachep;
129 129
130int __init nilfs_init_segbuf_cache(void);
131void nilfs_destroy_segbuf_cache(void);
132struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *); 130struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
133void nilfs_segbuf_free(struct nilfs_segment_buffer *); 131void nilfs_segbuf_free(struct nilfs_segment_buffer *);
134void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long, 132void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index dca142361ccf..01e20dbb217d 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -221,8 +221,6 @@ enum {
221extern struct kmem_cache *nilfs_transaction_cachep; 221extern struct kmem_cache *nilfs_transaction_cachep;
222 222
223/* segment.c */ 223/* segment.c */
224extern int nilfs_init_transaction_cache(void);
225extern void nilfs_destroy_transaction_cache(void);
226extern void nilfs_relax_pressure_in_lock(struct super_block *); 224extern void nilfs_relax_pressure_in_lock(struct super_block *);
227 225
228extern int nilfs_construct_segment(struct super_block *); 226extern int nilfs_construct_segment(struct super_block *);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 03b34b738993..414ef68931cf 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1130,13 +1130,13 @@ static void nilfs_segbuf_init_once(void *obj)
1130 1130
1131static void nilfs_destroy_cachep(void) 1131static void nilfs_destroy_cachep(void)
1132{ 1132{
1133 if (nilfs_inode_cachep) 1133 if (nilfs_inode_cachep)
1134 kmem_cache_destroy(nilfs_inode_cachep); 1134 kmem_cache_destroy(nilfs_inode_cachep);
1135 if (nilfs_transaction_cachep) 1135 if (nilfs_transaction_cachep)
1136 kmem_cache_destroy(nilfs_transaction_cachep); 1136 kmem_cache_destroy(nilfs_transaction_cachep);
1137 if (nilfs_segbuf_cachep) 1137 if (nilfs_segbuf_cachep)
1138 kmem_cache_destroy(nilfs_segbuf_cachep); 1138 kmem_cache_destroy(nilfs_segbuf_cachep);
1139 if (nilfs_btree_path_cache) 1139 if (nilfs_btree_path_cache)
1140 kmem_cache_destroy(nilfs_btree_path_cache); 1140 kmem_cache_destroy(nilfs_btree_path_cache);
1141} 1141}
1142 1142
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index fe44d3feee4a..0f48e7c5d9e1 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1527,10 +1527,9 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp)
1527 * this problem for now. We do write the $BITMAP attribute if it is present 1527 * this problem for now. We do write the $BITMAP attribute if it is present
1528 * which is the important one for a directory so things are not too bad. 1528 * which is the important one for a directory so things are not too bad.
1529 */ 1529 */
1530static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry, 1530static int ntfs_dir_fsync(struct file *filp, int datasync)
1531 int datasync)
1532{ 1531{
1533 struct inode *bmp_vi, *vi = dentry->d_inode; 1532 struct inode *bmp_vi, *vi = filp->f_mapping->host;
1534 int err, ret; 1533 int err, ret;
1535 ntfs_attr na; 1534 ntfs_attr na;
1536 1535
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index a1924a0d2ab0..113ebd9f25a4 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2133,7 +2133,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2133/** 2133/**
2134 * ntfs_file_fsync - sync a file to disk 2134 * ntfs_file_fsync - sync a file to disk
2135 * @filp: file to be synced 2135 * @filp: file to be synced
2136 * @dentry: dentry describing the file to sync
2137 * @datasync: if non-zero only flush user data and not metadata 2136 * @datasync: if non-zero only flush user data and not metadata
2138 * 2137 *
2139 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync 2138 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync
@@ -2149,19 +2148,15 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2149 * Also, if @datasync is true, we do not wait on the inode to be written out 2148 * Also, if @datasync is true, we do not wait on the inode to be written out
2150 * but we always wait on the page cache pages to be written out. 2149 * but we always wait on the page cache pages to be written out.
2151 * 2150 *
2152 * Note: In the past @filp could be NULL so we ignore it as we don't need it
2153 * anyway.
2154 *
2155 * Locking: Caller must hold i_mutex on the inode. 2151 * Locking: Caller must hold i_mutex on the inode.
2156 * 2152 *
2157 * TODO: We should probably also write all attribute/index inodes associated 2153 * TODO: We should probably also write all attribute/index inodes associated
2158 * with this inode but since we have no simple way of getting to them we ignore 2154 * with this inode but since we have no simple way of getting to them we ignore
2159 * this problem for now. 2155 * this problem for now.
2160 */ 2156 */
2161static int ntfs_file_fsync(struct file *filp, struct dentry *dentry, 2157static int ntfs_file_fsync(struct file *filp, int datasync)
2162 int datasync)
2163{ 2158{
2164 struct inode *vi = dentry->d_inode; 2159 struct inode *vi = filp->f_mapping->host;
2165 int err, ret = 0; 2160 int err, ret = 0;
2166 2161
2167 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); 2162 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 97e54b9e654b..6a13ea64c447 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -175,13 +175,12 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file)
175 return 0; 175 return 0;
176} 176}
177 177
178static int ocfs2_sync_file(struct file *file, 178static int ocfs2_sync_file(struct file *file, int datasync)
179 struct dentry *dentry,
180 int datasync)
181{ 179{
182 int err = 0; 180 int err = 0;
183 journal_t *journal; 181 journal_t *journal;
184 struct inode *inode = dentry->d_inode; 182 struct dentry *dentry = file->f_path.dentry;
183 struct inode *inode = file->f_mapping->host;
185 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 184 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
186 185
187 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 186 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
@@ -1053,7 +1052,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1053 } 1052 }
1054 1053
1055 /* 1054 /*
1056 * This will intentionally not wind up calling vmtruncate(), 1055 * This will intentionally not wind up calling simple_setsize(),
1057 * since all the work for a size change has been done above. 1056 * since all the work for a size change has been done above.
1058 * Otherwise, we could get into problems with truncate as 1057 * Otherwise, we could get into problems with truncate as
1059 * ip_alloc_sem is used there to protect against i_size 1058 * ip_alloc_sem is used there to protect against i_size
@@ -2119,9 +2118,13 @@ relock:
2119 * direct write may have instantiated a few 2118 * direct write may have instantiated a few
2120 * blocks outside i_size. Trim these off again. 2119 * blocks outside i_size. Trim these off again.
2121 * Don't need i_size_read because we hold i_mutex. 2120 * Don't need i_size_read because we hold i_mutex.
2121 *
2122 * XXX(hch): this looks buggy because ocfs2 did not
2123 * actually implement ->truncate. Take a look at
2124 * the new truncate sequence and update this accordingly
2122 */ 2125 */
2123 if (*ppos + count > inode->i_size) 2126 if (*ppos + count > inode->i_size)
2124 vmtruncate(inode, inode->i_size); 2127 simple_setsize(inode, inode->i_size);
2125 ret = written; 2128 ret = written;
2126 goto out_dio; 2129 goto out_dio;
2127 } 2130 }
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2c26ce251cb3..0eaa929a4dbf 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -879,13 +879,15 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
 		if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
 			continue;
 		if (unsuspend)
-			status = vfs_quota_enable(
-					sb_dqopt(sb)->files[type],
-					type, QFMT_OCFS2,
-					DQUOT_SUSPENDED);
-		else
-			status = vfs_quota_disable(sb, type,
-					DQUOT_SUSPENDED);
+			status = dquot_resume(sb, type);
+		else {
+			struct ocfs2_mem_dqinfo *oinfo;
+
+			/* Cancel periodic syncing before suspending */
+			oinfo = sb_dqinfo(sb, type)->dqi_priv;
+			cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+			status = dquot_suspend(sb, type);
+		}
 		if (status < 0)
 			break;
 	}
@@ -916,8 +918,8 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb)
 			status = -ENOENT;
 			goto out_quota_off;
 		}
-		status = vfs_quota_enable(inode[type], type, QFMT_OCFS2,
-					DQUOT_USAGE_ENABLED);
+		status = dquot_enable(inode[type], type, QFMT_OCFS2,
+					DQUOT_USAGE_ENABLED);
 		if (status < 0)
 			goto out_quota_off;
 	}
@@ -952,8 +954,8 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
 		/* Turn off quotas. This will remove all dquot structures from
 		 * memory and so they will be automatically synced to global
 		 * quota files */
-		vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED |
-					DQUOT_LIMITS_ENABLED);
+		dquot_disable(sb, type, DQUOT_USAGE_ENABLED |
+					DQUOT_LIMITS_ENABLED);
 		if (!inode)
 			continue;
 		iput(inode);
@@ -962,7 +964,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
 
 /* Handle quota on quotactl */
 static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
-			  char *path, int remount)
+			  char *path)
 {
 	unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
 					     OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -970,30 +972,24 @@ static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
 	if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
 		return -EINVAL;
 
-	if (remount)
-		return 0;	/* Just ignore it has been handled in
-				 * ocfs2_remount() */
-	return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
-				format_id, DQUOT_LIMITS_ENABLED);
+	return dquot_enable(sb_dqopt(sb)->files[type], type,
+			    format_id, DQUOT_LIMITS_ENABLED);
 }
 
 /* Handle quota off quotactl */
-static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
+static int ocfs2_quota_off(struct super_block *sb, int type)
 {
-	if (remount)
-		return 0;	/* Ignore now and handle later in
-				 * ocfs2_remount() */
-	return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
+	return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
 }
 
 static const struct quotactl_ops ocfs2_quotactl_ops = {
 	.quota_on	= ocfs2_quota_on,
 	.quota_off	= ocfs2_quota_off,
-	.quota_sync	= vfs_quota_sync,
-	.get_info	= vfs_get_dqinfo,
-	.set_info	= vfs_set_dqinfo,
-	.get_dqblk	= vfs_get_dqblk,
-	.set_dqblk	= vfs_set_dqblk,
+	.quota_sync	= dquot_quota_sync,
+	.get_info	= dquot_get_dqinfo,
+	.set_info	= dquot_set_dqinfo,
+	.get_dqblk	= dquot_get_dqblk,
+	.set_dqblk	= dquot_set_dqblk,
 };
 
 static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
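
Worth noting in ocfs2_susp_quotas() above: the periodic quota sync work is cancelled before dquot_suspend(), so no delayed sync can run against a suspended quota file. The suspend leg of the new code, restated with comments (same logic as the hunk):

if (unsuspend)
	status = dquot_resume(sb, type);
else {
	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;

	/* stop the delayed work before the quota file is parked */
	cancel_delayed_work_sync(&oinfo->dqi_sync_work);
	status = dquot_suspend(sb, type);
}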
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 399487c09364..6e7a3291bbe8 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -329,7 +329,7 @@ const struct file_operations omfs_file_operations = {
 	.aio_read = generic_file_aio_read,
 	.aio_write = generic_file_aio_write,
 	.mmap = generic_file_mmap,
-	.fsync = simple_fsync,
+	.fsync = generic_file_fsync,
 	.splice_read = generic_file_splice_read,
 };
 
diff --git a/fs/pipe.c b/fs/pipe.c
index d79872eba09a..279eef96c51c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -26,9 +26,14 @@
 
 /*
  * The max size that a non-root user is allowed to grow the pipe. Can
- * be set by root in /proc/sys/fs/pipe-max-pages
+ * be set by root in /proc/sys/fs/pipe-max-size
  */
-unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16;
+unsigned int pipe_max_size = 1048576;
+
+/*
+ * Minimum pipe size, as required by POSIX
+ */
+unsigned int pipe_min_size = PAGE_SIZE;
 
 /*
  * We use a start+len construction, which provides full use of the
@@ -230,6 +235,7 @@ void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
 
 	return kmap(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_map);
 
 /**
  * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
@@ -249,6 +255,7 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
 	} else
 		kunmap(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_unmap);
 
 /**
  * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
@@ -279,6 +286,7 @@ int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
 
 	return 1;
 }
+EXPORT_SYMBOL(generic_pipe_buf_steal);
 
 /**
  * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
@@ -294,6 +302,7 @@ void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 {
 	page_cache_get(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_get);
 
 /**
  * generic_pipe_buf_confirm - verify contents of the pipe buffer
@@ -309,6 +318,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info,
 {
 	return 0;
 }
+EXPORT_SYMBOL(generic_pipe_buf_confirm);
 
 /**
  * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
@@ -323,6 +333,7 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe,
 {
 	page_cache_release(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_release);
 
 static const struct pipe_buf_operations anon_pipe_buf_ops = {
 	.can_merge = 1,
@@ -1112,26 +1123,20 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
  * Allocate a new array of pipe buffers and copy the info over. Returns the
  * pipe size if successful, or return -ERROR on error.
  */
-static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
 {
 	struct pipe_buffer *bufs;
 
 	/*
-	 * Must be a power-of-2 currently
-	 */
-	if (!is_power_of_2(arg))
-		return -EINVAL;
-
-	/*
 	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
 	 * expect a lot of shrink+grow operations, just free and allocate
 	 * again like we would do for growing. If the pipe currently
 	 * contains more buffers than arg, then return busy.
 	 */
-	if (arg < pipe->nrbufs)
+	if (nr_pages < pipe->nrbufs)
 		return -EBUSY;
 
-	bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL);
+	bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL);
 	if (unlikely(!bufs))
 		return -ENOMEM;
 
@@ -1140,20 +1145,56 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
 	 * and adjust the indexes.
 	 */
 	if (pipe->nrbufs) {
-		const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1);
-		const unsigned int head = pipe->nrbufs - tail;
+		unsigned int tail;
+		unsigned int head;
 
+		tail = pipe->curbuf + pipe->nrbufs;
+		if (tail < pipe->buffers)
+			tail = 0;
+		else
+			tail &= (pipe->buffers - 1);
+
+		head = pipe->nrbufs - tail;
 		if (head)
 			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
 		if (tail)
-			memcpy(bufs + head, pipe->bufs + pipe->curbuf, tail * sizeof(struct pipe_buffer));
+			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
 	}
 
 	pipe->curbuf = 0;
 	kfree(pipe->bufs);
 	pipe->bufs = bufs;
-	pipe->buffers = arg;
-	return arg;
+	pipe->buffers = nr_pages;
+	return nr_pages * PAGE_SIZE;
+}
+
+/*
+ * Currently we rely on the pipe array holding a power-of-2 number
+ * of pages.
+ */
+static inline unsigned int round_pipe_size(unsigned int size)
+{
+	unsigned long nr_pages;
+
+	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
+}
+
+/*
+ * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
+ * will return an error.
+ */
+int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
+		 size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
+	if (ret < 0 || !write)
+		return ret;
+
+	pipe_max_size = round_pipe_size(pipe_max_size);
+	return ret;
 }
 
 long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
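
round_pipe_size() converts a byte count into a power-of-two number of pages and returns the result in bytes. A quick standalone check of the arithmetic (hypothetical userspace re-implementation, assuming 4 KiB pages):

#include <stdio.h>

#define PAGE_SIZE  4096u
#define PAGE_SHIFT 12

/* round a nonzero 32-bit value up to the next power of two */
static unsigned int roundup_pow_of_two(unsigned int v)
{
	v--;
	v |= v >> 1;  v |= v >> 2;  v |= v >> 4;
	v |= v >> 8;  v |= v >> 16;
	return v + 1;
}

static unsigned int round_pipe_size(unsigned int size)
{
	unsigned int nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}

int main(void)
{
	/* 100000 bytes -> 25 pages -> 32 pages -> 131072 bytes */
	printf("%u\n", round_pipe_size(100000));
	return 0;
}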
@@ -1168,25 +1209,32 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 	mutex_lock(&pipe->inode->i_mutex);
 
 	switch (cmd) {
-	case F_SETPIPE_SZ:
-		if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages)
-			return -EINVAL;
-		/*
-		 * The pipe needs to be at least 2 pages large to
-		 * guarantee POSIX behaviour.
-		 */
-		if (arg < 2)
-			return -EINVAL;
-		ret = pipe_set_size(pipe, arg);
+	case F_SETPIPE_SZ: {
+		unsigned int size, nr_pages;
+
+		size = round_pipe_size(arg);
+		nr_pages = size >> PAGE_SHIFT;
+
+		ret = -EINVAL;
+		if (!nr_pages)
+			goto out;
+
+		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
+			ret = -EPERM;
+			goto out;
+		}
+		ret = pipe_set_size(pipe, nr_pages);
 		break;
+	}
 	case F_GETPIPE_SZ:
-		ret = pipe->buffers;
+		ret = pipe->buffers * PAGE_SIZE;
 		break;
 	default:
 		ret = -EINVAL;
 		break;
 	}
 
+out:
 	mutex_unlock(&pipe->inode->i_mutex);
 	return ret;
 }
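
With this change F_SETPIPE_SZ takes, and F_GETPIPE_SZ reports, a size in bytes instead of a buffer count; requests are rounded up by round_pipe_size(), and only CAP_SYS_RESOURCE may exceed /proc/sys/fs/pipe-max-size (-EPERM otherwise). A minimal userspace sketch (error handling trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];
	long sz;

	if (pipe(fds) < 0)
		return 1;

	/* ask for 100000 bytes; the kernel rounds up to a power-of-two
	 * number of pages, so expect 131072 with 4 KiB pages */
	sz = fcntl(fds[1], F_SETPIPE_SZ, 100000);
	printf("granted: %ld\n", sz);

	/* read back the current capacity, also in bytes now */
	sz = fcntl(fds[1], F_GETPIPE_SZ);
	printf("current: %ld\n", sz);
	return 0;
}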
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 885ab5513ac5..9b58d38bc911 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -267,7 +267,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 	shpending = p->signal->shared_pending.signal;
 	blocked = p->blocked;
 	collect_sigign_sigcatch(p, &ignored, &caught);
-	num_threads = atomic_read(&p->signal->count);
+	num_threads = get_nr_threads(p);
 	rcu_read_lock();  /* FIXME: is this correct? */
 	qsize = atomic_read(&__task_cred(p)->user->sigpending);
 	rcu_read_unlock();
@@ -410,7 +410,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		tty_nr = new_encode_dev(tty_devnum(sig->tty));
 	}
 
-	num_threads = atomic_read(&sig->count);
+	num_threads = get_nr_threads(task);
 	collect_sigign_sigcatch(task, &sigign, &sigcatch);
 
 	cmin_flt = sig->cmin_flt;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c7f9f23449dc..acb7ef80ea4f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -166,18 +166,6 @@ static int get_fs_path(struct task_struct *task, struct path *path, bool root)
 	return result;
 }
 
-static int get_nr_threads(struct task_struct *tsk)
-{
-	unsigned long flags;
-	int count = 0;
-
-	if (lock_task_sighand(tsk, &flags)) {
-		count = atomic_read(&tsk->signal->count);
-		unlock_task_sighand(tsk, &flags);
-	}
-	return count;
-}
-
 static int proc_cwd_link(struct inode *inode, struct path *path)
 {
 	struct task_struct *task = get_proc_task(inode);
@@ -2444,7 +2432,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
 	const struct pid_entry *p = ptr;
 	struct inode *inode;
 	struct proc_inode *ei;
-	struct dentry *error = ERR_PTR(-EINVAL);
+	struct dentry *error;
 
 	/* Allocate the inode */
 	error = ERR_PTR(-ENOMEM);
@@ -2794,7 +2782,7 @@ out:
 
 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
-	struct dentry *result = ERR_PTR(-ENOENT);
+	struct dentry *result;
 	struct task_struct *task;
 	unsigned tgid;
 	struct pid_namespace *ns;
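
Both proc files now share one get_nr_threads() helper instead of open-coding atomic_read(&sig->count); the private, siglock-taking copy in base.c above is deleted. The value ends up in the "Threads:" line of /proc/<pid>/status and field 20 of /proc/<pid>/stat, which is easy to observe from userspace (sketch):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (strncmp(line, "Threads:", 8) == 0)
			fputs(line, stdout);	/* fed by get_nr_threads() */
	fclose(f);
	return 0;
}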
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 43c127490606..2791907744ed 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -343,21 +343,6 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
 /*
  * Return an inode number between PROC_DYNAMIC_FIRST and
  * 0xffffffff, or zero on failure.
- *
- * Current inode allocations in the proc-fs (hex-numbers):
- *
- * 00000000		reserved
- * 00000001-00000fff	static entries	(goners)
- *      001		root-ino
- *
- * 00001000-00001fff	unused
- * 0001xxxx-7fffxxxx	pid-dir entries for pid 1-7fff
- * 80000000-efffffff	unused
- * f0000000-ffffffff	dynamic entries
- *
- * Goal:
- *	Once we split the thing into several virtual filesystems,
- *	we will get rid of magical ranges (and this comment, BTW).
  */
 static unsigned int get_inode_number(void)
 {
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index c837a77351be..6f37c391468d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -588,7 +588,7 @@ static struct kcore_list kcore_text;
  */
 static void __init proc_kcore_text_init(void)
 {
-	kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT);
+	kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
 }
 #else
 static void __init proc_kcore_text_init(void)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 757c069f2a65..4258384ed22d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -110,7 +110,6 @@ void __init proc_root_init(void)
 	if (err)
 		return;
 	proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
-	err = PTR_ERR(proc_mnt);
 	if (IS_ERR(proc_mnt)) {
 		unregister_filesystem(&proc_fs_type);
 		return;
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6f30c3d5bcbf..6e8fc62b40a8 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -77,9 +77,10 @@ out:
 
 const struct file_operations qnx4_dir_operations =
 {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= qnx4_readdir,
-	.fsync		= simple_fsync,
+	.fsync		= generic_file_fsync,
 };
 
 const struct inode_operations qnx4_dir_inode_operations =
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 655a4c52b8c3..12c233da1b6b 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -228,10 +228,6 @@ static struct hlist_head *dquot_hash;
 
 struct dqstats dqstats;
 EXPORT_SYMBOL(dqstats);
-#ifdef CONFIG_SMP
-struct dqstats *dqstats_pcpu;
-EXPORT_SYMBOL(dqstats_pcpu);
-#endif
 
 static qsize_t inode_get_rsv_space(struct inode *inode);
 static void __dquot_initialize(struct inode *inode, int type);
@@ -584,7 +580,7 @@ out:
 }
 EXPORT_SYMBOL(dquot_scan_active);
 
-int vfs_quota_sync(struct super_block *sb, int type, int wait)
+int dquot_quota_sync(struct super_block *sb, int type, int wait)
 {
 	struct list_head *dirty;
 	struct dquot *dquot;
@@ -656,7 +652,7 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
 
 	return 0;
 }
-EXPORT_SYMBOL(vfs_quota_sync);
+EXPORT_SYMBOL(dquot_quota_sync);
 
 /* Free unused dquots from cache */
 static void prune_dqcache(int count)
@@ -676,27 +672,10 @@ static void prune_dqcache(int count)
 	}
 }
 
-static int dqstats_read(unsigned int type)
-{
-	int count = 0;
-#ifdef CONFIG_SMP
-	int cpu;
-	for_each_possible_cpu(cpu)
-		count += per_cpu_ptr(dqstats_pcpu, cpu)->stat[type];
-	/* Statistics reading is racy, but absolute accuracy isn't required */
-	if (count < 0)
-		count = 0;
-#else
-	count = dqstats.stat[type];
-#endif
-	return count;
-}
-
 /*
  * This is called from kswapd when we think we need some
  * more memory
  */
-
 static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
 {
 	if (nr) {
@@ -704,7 +683,9 @@ static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
 		prune_dqcache(nr);
 		spin_unlock(&dq_list_lock);
 	}
-	return (dqstats_read(DQST_FREE_DQUOTS)/100) * sysctl_vfs_cache_pressure;
+	return ((unsigned)
+		percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])
+		/100) * sysctl_vfs_cache_pressure;
 }
 
 static struct shrinker dqcache_shrinker = {
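
dqstats thus moves from a hand-rolled per-CPU copy to the generic percpu_counter API: the shrinker uses percpu_counter_read_positive() (cheap but approximate), while the sysctl path further down folds the per-CPU deltas with percpu_counter_sum_positive() (exact but slower). The pattern in isolation (hedged sketch; the counter name is illustrative):

#include <linux/percpu_counter.h>

static struct percpu_counter free_dquots;	/* one counter per statistic */

static int __init stats_init(void)
{
	return percpu_counter_init(&free_dquots, 0);
}

static void on_dquot_freed(void)
{
	percpu_counter_inc(&free_dquots);	/* per-CPU, no global lock */
}

static s64 stats_fast_read(void)
{
	/* approximate; good enough for shrinker heuristics */
	return percpu_counter_read_positive(&free_dquots);
}

static s64 stats_exact_read(void)
{
	/* sums all per-CPU deltas; used when reporting to userspace */
	return percpu_counter_sum_positive(&free_dquots);
}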
@@ -1514,11 +1495,13 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
 /*
  * This operation can block, but only after everything is updated
  */
-int __dquot_alloc_space(struct inode *inode, qsize_t number,
-		int warn, int reserve)
+int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
 {
 	int cnt, ret = 0;
 	char warntype[MAXQUOTAS];
+	int warn = flags & DQUOT_SPACE_WARN;
+	int reserve = flags & DQUOT_SPACE_RESERVE;
+	int nofail = flags & DQUOT_SPACE_NOFAIL;
 
 	/*
 	 * First test before acquiring mutex - solves deadlocks when we
@@ -1539,7 +1522,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
 			continue;
 		ret = check_bdq(inode->i_dquot[cnt], number, !warn,
 				warntype+cnt);
-		if (ret) {
+		if (ret && !nofail) {
 			spin_unlock(&dq_data_lock);
 			goto out_flush_warn;
 		}
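
__dquot_alloc_space() now takes one flags word instead of separate warn/reserve ints, and grows DQUOT_SPACE_NOFAIL so that callers which cannot back out (e.g. mid-transaction metadata allocations) still get the usage accounted even over quota. Typical call shapes (sketch; the quotaops.h wrappers presumably build these flags):

/* ordinary allocation: warn the user if this goes over a limit */
err = __dquot_alloc_space(inode, nr_bytes, DQUOT_SPACE_WARN);

/* delayed-allocation reservation: no warning yet */
err = __dquot_alloc_space(inode, nr_bytes, DQUOT_SPACE_RESERVE);

/* must-not-fail path: account usage even if limits are exceeded */
err = __dquot_alloc_space(inode, nr_bytes,
			  DQUOT_SPACE_WARN | DQUOT_SPACE_NOFAIL);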
@@ -1638,10 +1621,11 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
 /*
  * This operation can block, but only after everything is updated
  */
-void __dquot_free_space(struct inode *inode, qsize_t number, int reserve)
+void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
 {
 	unsigned int cnt;
 	char warntype[MAXQUOTAS];
+	int reserve = flags & DQUOT_SPACE_RESERVE;
 
 	/* First test before acquiring mutex - solves deadlocks when we
 	 * re-enter the quota code and are already holding the mutex */
@@ -1812,7 +1796,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 	if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid)
 		transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA);
 	if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)
-		transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_uid, GRPQUOTA);
+		transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_gid, GRPQUOTA);
 
 	ret = __dquot_transfer(inode, transfer_to);
 	dqput_all(transfer_to);
@@ -1847,6 +1831,7 @@ const struct dquot_operations dquot_operations = {
 	.alloc_dquot	= dquot_alloc,
 	.destroy_dquot	= dquot_destroy,
 };
+EXPORT_SYMBOL(dquot_operations);
 
 /*
  * Generic helper for ->open on filesystems supporting disk quotas.
@@ -1865,7 +1850,7 @@ EXPORT_SYMBOL(dquot_file_open);
 /*
  * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
  */
-int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
+int dquot_disable(struct super_block *sb, int type, unsigned int flags)
 {
 	int cnt, ret = 0;
 	struct quota_info *dqopt = sb_dqopt(sb);
@@ -1995,14 +1980,15 @@ put_inodes:
 	}
 	return ret;
 }
-EXPORT_SYMBOL(vfs_quota_disable);
+EXPORT_SYMBOL(dquot_disable);
 
-int vfs_quota_off(struct super_block *sb, int type, int remount)
+int dquot_quota_off(struct super_block *sb, int type)
 {
-	return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED :
-				 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED));
+	return dquot_disable(sb, type,
+			     DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 }
-EXPORT_SYMBOL(vfs_quota_off);
+EXPORT_SYMBOL(dquot_quota_off);
+
 /*
  * Turn quotas on on a device
  */
@@ -2120,36 +2106,43 @@ out_fmt:
 }
 
 /* Reenable quotas on remount RW */
-static int vfs_quota_on_remount(struct super_block *sb, int type)
+int dquot_resume(struct super_block *sb, int type)
 {
 	struct quota_info *dqopt = sb_dqopt(sb);
 	struct inode *inode;
-	int ret;
+	int ret = 0, cnt;
 	unsigned int flags;
 
-	mutex_lock(&dqopt->dqonoff_mutex);
-	if (!sb_has_quota_suspended(sb, type)) {
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (type != -1 && cnt != type)
+			continue;
+
+		mutex_lock(&dqopt->dqonoff_mutex);
+		if (!sb_has_quota_suspended(sb, cnt)) {
+			mutex_unlock(&dqopt->dqonoff_mutex);
+			continue;
+		}
+		inode = dqopt->files[cnt];
+		dqopt->files[cnt] = NULL;
+		spin_lock(&dq_state_lock);
+		flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
+							DQUOT_LIMITS_ENABLED,
+							cnt);
+		dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt);
+		spin_unlock(&dq_state_lock);
 		mutex_unlock(&dqopt->dqonoff_mutex);
-		return 0;
-	}
-	inode = dqopt->files[type];
-	dqopt->files[type] = NULL;
-	spin_lock(&dq_state_lock);
-	flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
-						DQUOT_LIMITS_ENABLED, type);
-	dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
-	spin_unlock(&dq_state_lock);
-	mutex_unlock(&dqopt->dqonoff_mutex);
 
-	flags = dquot_generic_flag(flags, type);
-	ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id,
-				   flags);
-	iput(inode);
+		flags = dquot_generic_flag(flags, cnt);
+		ret = vfs_load_quota_inode(inode, cnt,
+				dqopt->info[cnt].dqi_fmt_id, flags);
+		iput(inode);
+	}
 
 	return ret;
 }
+EXPORT_SYMBOL(dquot_resume);
 
-int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
+int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
 		struct path *path)
 {
 	int error = security_quota_on(path->dentry);
@@ -2164,40 +2157,36 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
 				 DQUOT_LIMITS_ENABLED);
 	return error;
 }
-EXPORT_SYMBOL(vfs_quota_on_path);
+EXPORT_SYMBOL(dquot_quota_on_path);
 
-int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
-		 int remount)
+int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name)
 {
 	struct path path;
 	int error;
 
-	if (remount)
-		return vfs_quota_on_remount(sb, type);
-
 	error = kern_path(name, LOOKUP_FOLLOW, &path);
 	if (!error) {
-		error = vfs_quota_on_path(sb, type, format_id, &path);
+		error = dquot_quota_on_path(sb, type, format_id, &path);
 		path_put(&path);
 	}
 	return error;
 }
-EXPORT_SYMBOL(vfs_quota_on);
+EXPORT_SYMBOL(dquot_quota_on);
 
 /*
  * More powerful function for turning on quotas allowing setting
  * of individual quota flags
  */
-int vfs_quota_enable(struct inode *inode, int type, int format_id,
+int dquot_enable(struct inode *inode, int type, int format_id,
 	unsigned int flags)
 {
 	int ret = 0;
 	struct super_block *sb = inode->i_sb;
 	struct quota_info *dqopt = sb_dqopt(sb);
 
 	/* Just unsuspend quotas? */
-	if (flags & DQUOT_SUSPENDED)
-		return vfs_quota_on_remount(sb, type);
+	BUG_ON(flags & DQUOT_SUSPENDED);
+
 	if (!flags)
 		return 0;
 	/* Just updating flags needed? */
@@ -2229,13 +2218,13 @@ out_lock:
 load_quota:
 	return vfs_load_quota_inode(inode, type, format_id, flags);
 }
-EXPORT_SYMBOL(vfs_quota_enable);
+EXPORT_SYMBOL(dquot_enable);
 
 /*
  * This function is used when filesystem needs to initialize quotas
  * during mount time.
  */
-int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
+int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
 		int format_id, int type)
 {
 	struct dentry *dentry;
@@ -2261,24 +2250,7 @@ out:
 	dput(dentry);
 	return error;
 }
-EXPORT_SYMBOL(vfs_quota_on_mount);
-
-/* Wrapper to turn on quotas when remounting rw */
-int vfs_dq_quota_on_remount(struct super_block *sb)
-{
-	int cnt;
-	int ret = 0, err;
-
-	if (!sb->s_qcop || !sb->s_qcop->quota_on)
-		return -ENOSYS;
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		err = sb->s_qcop->quota_on(sb, cnt, 0, NULL, 1);
-		if (err < 0 && !ret)
-			ret = err;
-	}
-	return ret;
-}
-EXPORT_SYMBOL(vfs_dq_quota_on_remount);
+EXPORT_SYMBOL(dquot_quota_on_mount);
 
 static inline qsize_t qbtos(qsize_t blocks)
 {
@@ -2313,8 +2285,8 @@ static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
 	spin_unlock(&dq_data_lock);
 }
 
-int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
-		  struct fs_disk_quota *di)
+int dquot_get_dqblk(struct super_block *sb, int type, qid_t id,
+		    struct fs_disk_quota *di)
 {
 	struct dquot *dquot;
 
@@ -2326,7 +2298,7 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
 
 	return 0;
 }
-EXPORT_SYMBOL(vfs_get_dqblk);
+EXPORT_SYMBOL(dquot_get_dqblk);
 
 #define VFS_FS_DQ_MASK \
 	(FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \
@@ -2425,7 +2397,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
 	return 0;
 }
 
-int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
-		  struct fs_disk_quota *di)
+int dquot_set_dqblk(struct super_block *sb, int type, qid_t id,
+		    struct fs_disk_quota *di)
 {
 	struct dquot *dquot;
@@ -2441,10 +2413,10 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
 out:
 	return rc;
 }
-EXPORT_SYMBOL(vfs_set_dqblk);
+EXPORT_SYMBOL(dquot_set_dqblk);
 
 /* Generic routine for getting common part of quota file information */
-int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
+int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 {
 	struct mem_dqinfo *mi;
 
@@ -2463,10 +2435,10 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return 0;
 }
-EXPORT_SYMBOL(vfs_get_dqinfo);
+EXPORT_SYMBOL(dquot_get_dqinfo);
 
 /* Generic routine for setting common part of quota file information */
-int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
+int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 {
 	struct mem_dqinfo *mi;
 	int err = 0;
@@ -2493,27 +2465,27 @@ out:
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return err;
 }
-EXPORT_SYMBOL(vfs_set_dqinfo);
+EXPORT_SYMBOL(dquot_set_dqinfo);
 
-const struct quotactl_ops vfs_quotactl_ops = {
-	.quota_on	= vfs_quota_on,
-	.quota_off	= vfs_quota_off,
-	.quota_sync	= vfs_quota_sync,
-	.get_info	= vfs_get_dqinfo,
-	.set_info	= vfs_set_dqinfo,
-	.get_dqblk	= vfs_get_dqblk,
-	.set_dqblk	= vfs_set_dqblk
+const struct quotactl_ops dquot_quotactl_ops = {
+	.quota_on	= dquot_quota_on,
+	.quota_off	= dquot_quota_off,
+	.quota_sync	= dquot_quota_sync,
+	.get_info	= dquot_get_dqinfo,
+	.set_info	= dquot_set_dqinfo,
+	.get_dqblk	= dquot_get_dqblk,
+	.set_dqblk	= dquot_set_dqblk
 };
-
+EXPORT_SYMBOL(dquot_quotactl_ops);
 
 static int do_proc_dqstats(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-#ifdef CONFIG_SMP
-	/* Update global table */
 	unsigned int type = (int *)table->data - dqstats.stat;
-	dqstats.stat[type] = dqstats_read(type);
-#endif
+
+	/* Update global table */
+	dqstats.stat[type] =
+	percpu_counter_sum_positive(&dqstats.counter[type]);
 	return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 
@@ -2606,7 +2578,7 @@ static ctl_table sys_table[] = {
 
 static int __init dquot_init(void)
 {
-	int i;
+	int i, ret;
 	unsigned long nr_hash, order;
 
 	printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);
@@ -2624,12 +2596,11 @@ static int __init dquot_init(void)
 	if (!dquot_hash)
 		panic("Cannot create dquot hash table");
 
-#ifdef CONFIG_SMP
-	dqstats_pcpu = alloc_percpu(struct dqstats);
-	if (!dqstats_pcpu)
-		panic("Cannot create dquot stats table");
-#endif
-	memset(&dqstats, 0, sizeof(struct dqstats));
+	for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
+		ret = percpu_counter_init(&dqstats.counter[i], 0);
+		if (ret)
+			panic("Cannot create dquot stat counters");
+	}
 
 	/* Find power-of-two hlist_heads which can fit into allocation */
 	nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
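
After this rename the generic entry points are all exported, and dquot_suspend()/dquot_resume() replace the old remount special-casing inside quota_on/quota_off (passing type == -1 covers every quota type). A filesystem's remount path then reduces to roughly (sketch; mirrors what the reiserfs hunks below do):

static int example_remount(struct super_block *sb, int *flags, char *data)
{
	int err;

	if (*flags & MS_RDONLY) {
		/* going read-only: park quota state instead of disabling */
		err = dquot_suspend(sb, -1);	/* -1 = all types */
		if (err < 0)
			return err;
	} else {
		/* going read-write: re-enable whatever was suspended */
		dquot_resume(sb, -1);
	}
	return 0;
}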
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index ce3dfd066f59..b299961e1edb 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -73,7 +73,7 @@ static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
 	if (IS_ERR(pathname))
 		return PTR_ERR(pathname);
 	if (sb->s_qcop->quota_on)
-		ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0);
+		ret = sb->s_qcop->quota_on(sb, type, id, pathname);
 	putname(pathname);
 	return ret;
 }
@@ -260,7 +260,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
 	case Q_QUOTAOFF:
 		if (!sb->s_qcop->quota_off)
 			return -ENOSYS;
-		return sb->s_qcop->quota_off(sb, type, 0);
+		return sb->s_qcop->quota_off(sb, type);
 	case Q_GETFMT:
 		return quota_getfmt(sb, type, addr);
 	case Q_GETINFO:
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 78f613cb9c76..4884ac5ae9be 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -43,12 +43,13 @@ const struct file_operations ramfs_file_operations = {
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.fsync		= simple_sync_file,
+	.fsync		= noop_fsync,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
 	.llseek		= generic_file_llseek,
 };
 
 const struct inode_operations ramfs_file_inode_operations = {
+	.setattr	= simple_setattr,
 	.getattr	= simple_getattr,
 };
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 5ea4ad81a429..d532c20fc179 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -42,7 +42,7 @@ const struct file_operations ramfs_file_operations = {
42 .aio_read = generic_file_aio_read, 42 .aio_read = generic_file_aio_read,
43 .write = do_sync_write, 43 .write = do_sync_write,
44 .aio_write = generic_file_aio_write, 44 .aio_write = generic_file_aio_write,
45 .fsync = simple_sync_file, 45 .fsync = noop_fsync,
46 .splice_read = generic_file_splice_read, 46 .splice_read = generic_file_splice_read,
47 .splice_write = generic_file_splice_write, 47 .splice_write = generic_file_splice_write,
48 .llseek = generic_file_llseek, 48 .llseek = generic_file_llseek,
@@ -146,7 +146,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
146 return ret; 146 return ret;
147 } 147 }
148 148
149 ret = vmtruncate(inode, newsize); 149 ret = simple_setsize(inode, newsize);
150 150
151 return ret; 151 return ret;
152} 152}
@@ -169,7 +169,8 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
169 169
170 /* pick out size-changing events */ 170 /* pick out size-changing events */
171 if (ia->ia_valid & ATTR_SIZE) { 171 if (ia->ia_valid & ATTR_SIZE) {
172 loff_t size = i_size_read(inode); 172 loff_t size = inode->i_size;
173
173 if (ia->ia_size != size) { 174 if (ia->ia_size != size) {
174 ret = ramfs_nommu_resize(inode, ia->ia_size, size); 175 ret = ramfs_nommu_resize(inode, ia->ia_size, size);
175 if (ret < 0 || ia->ia_valid == ATTR_SIZE) 176 if (ret < 0 || ia->ia_valid == ATTR_SIZE)
@@ -182,7 +183,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
182 } 183 }
183 } 184 }
184 185
185 ret = inode_setattr(inode, ia); 186 generic_setattr(inode, ia);
186 out: 187 out:
187 ia->ia_valid = old_ia_valid; 188 ia->ia_valid = old_ia_valid;
188 return ret; 189 return ret;
diff --git a/fs/read_write.c b/fs/read_write.c
index 113386d6fd2d..9c0485236e68 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -97,6 +97,23 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
 }
 EXPORT_SYMBOL(generic_file_llseek);
 
+/**
+ * noop_llseek - No Operation Performed llseek implementation
+ * @file:	file structure to seek on
+ * @offset:	file offset to seek to
+ * @origin:	type of seek
+ *
+ * This is an implementation of ->llseek useable for the rare special case when
+ * userspace expects the seek to succeed but the (device) file is actually not
+ * able to perform the seek. In this case you use noop_llseek() instead of
+ * falling back to the default implementation of ->llseek.
+ */
+loff_t noop_llseek(struct file *file, loff_t offset, int origin)
+{
+	return file->f_pos;
+}
+EXPORT_SYMBOL(noop_llseek);
+
 loff_t no_llseek(struct file *file, loff_t offset, int origin)
 {
 	return -ESPIPE;
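
noop_llseek() succeeds while leaving f_pos untouched, for (device) files where userspace expects lseek() to succeed although seeking means nothing; contrast no_llseek() just above, which returns -ESPIPE. Wiring it up in a hypothetical driver:

static const struct file_operations example_dev_fops = {
	.owner	= THIS_MODULE,
	.read	= example_read,		/* hypothetical handlers */
	.write	= example_write,
	/* lseek() "succeeds" but never moves the position; use
	 * no_llseek instead if callers should see -ESPIPE */
	.llseek	= noop_llseek,
};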
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 07930449a958..198dabf1b2bb 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -14,10 +14,10 @@
 extern const struct reiserfs_key MIN_KEY;
 
 static int reiserfs_readdir(struct file *, void *, filldir_t);
-static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
-			      int datasync);
+static int reiserfs_dir_fsync(struct file *filp, int datasync);
 
 const struct file_operations reiserfs_dir_operations = {
+	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
 	.readdir = reiserfs_readdir,
 	.fsync = reiserfs_dir_fsync,
@@ -27,10 +27,9 @@ const struct file_operations reiserfs_dir_operations = {
 #endif
 };
 
-static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
-			      int datasync)
+static int reiserfs_dir_fsync(struct file *filp, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = filp->f_mapping->host;
 	int err;
 	reiserfs_write_lock(inode->i_sb);
 	err = reiserfs_commit_for_inode(inode);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 9977df9f3a54..b82cdd8a45dd 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -134,10 +134,9 @@ static void reiserfs_vfs_truncate_file(struct inode *inode)
134 * be removed... 134 * be removed...
135 */ 135 */
136 136
137static int reiserfs_sync_file(struct file *filp, 137static int reiserfs_sync_file(struct file *filp, int datasync)
138 struct dentry *dentry, int datasync)
139{ 138{
140 struct inode *inode = dentry->d_inode; 139 struct inode *inode = filp->f_mapping->host;
141 int err; 140 int err;
142 int barrier_done; 141 int barrier_done;
143 142
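
These reiserfs hunks (like the smbfs ones further down) track the VFS-wide ->fsync prototype change: the dentry argument is gone, so implementations recover the inode from file->f_mapping->host. The new method shape (sketch):

/* new prototype: int (*fsync)(struct file *, int datasync) */
static int example_fsync(struct file *filp, int datasync)
{
	/* no dentry parameter any more; the mapping knows the inode */
	struct inode *inode = filp->f_mapping->host;

	/* ... write back file data, then metadata; when datasync is
	 * set, non-essential metadata may be skipped ... */
	return 0;
}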
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 59125fb36d42..9822fa15118b 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -158,6 +158,7 @@ static int finish_unfinished(struct super_block *s)
 #ifdef CONFIG_QUOTA
 	int i;
 	int ms_active_set;
+	int quota_enabled[MAXQUOTAS];
 #endif
 
 	/* compose key to look for "save" links */
@@ -179,8 +180,15 @@ static int finish_unfinished(struct super_block *s)
 	}
 	/* Turn on quotas so that they are updated correctly */
 	for (i = 0; i < MAXQUOTAS; i++) {
+		quota_enabled[i] = 1;
 		if (REISERFS_SB(s)->s_qf_names[i]) {
-			int ret = reiserfs_quota_on_mount(s, i);
+			int ret;
+
+			if (sb_has_quota_active(s, i)) {
+				quota_enabled[i] = 0;
+				continue;
+			}
+			ret = reiserfs_quota_on_mount(s, i);
 			if (ret < 0)
 				reiserfs_warning(s, "reiserfs-2500",
 						 "cannot turn on journaled "
@@ -304,8 +312,8 @@ static int finish_unfinished(struct super_block *s)
 #ifdef CONFIG_QUOTA
 	/* Turn quotas off */
 	for (i = 0; i < MAXQUOTAS; i++) {
-		if (sb_dqopt(s)->files[i])
-			vfs_quota_off(s, i, 0);
+		if (sb_dqopt(s)->files[i] && quota_enabled[i])
+			dquot_quota_off(s, i);
 	}
 	if (ms_active_set)
 		/* Restore the flag back */
@@ -466,6 +474,8 @@ static void reiserfs_put_super(struct super_block *s)
 	struct reiserfs_transaction_handle th;
 	th.t_trans_id = 0;
 
+	dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
 	reiserfs_write_lock(s);
 
 	if (s->s_dirt)
@@ -620,7 +630,7 @@ static int reiserfs_acquire_dquot(struct dquot *);
 static int reiserfs_release_dquot(struct dquot *);
 static int reiserfs_mark_dquot_dirty(struct dquot *);
 static int reiserfs_write_info(struct super_block *, int);
-static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
+static int reiserfs_quota_on(struct super_block *, int, int, char *);
 
 static const struct dquot_operations reiserfs_quota_operations = {
 	.write_dquot = reiserfs_write_dquot,
@@ -634,12 +644,12 @@ static const struct dquot_operations reiserfs_quota_operations = {
 
 static const struct quotactl_ops reiserfs_qctl_operations = {
 	.quota_on = reiserfs_quota_on,
-	.quota_off = vfs_quota_off,
-	.quota_sync = vfs_quota_sync,
-	.get_info = vfs_get_dqinfo,
-	.set_info = vfs_set_dqinfo,
-	.get_dqblk = vfs_get_dqblk,
-	.set_dqblk = vfs_set_dqblk,
+	.quota_off = dquot_quota_off,
+	.quota_sync = dquot_quota_sync,
+	.get_info = dquot_get_dqinfo,
+	.set_info = dquot_set_dqinfo,
+	.get_dqblk = dquot_get_dqblk,
+	.set_dqblk = dquot_set_dqblk,
 };
 #endif
 
@@ -1242,6 +1252,11 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 		if (s->s_flags & MS_RDONLY)
 			/* it is read-only already */
 			goto out_ok;
+
+		err = dquot_suspend(s, -1);
+		if (err < 0)
+			goto out_err;
+
 		/* try to remount file system with read-only permissions */
 		if (sb_umount_state(rs) == REISERFS_VALID_FS
 		    || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
@@ -1295,6 +1310,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	s->s_dirt = 0;
 
 	if (!(*mount_flags & MS_RDONLY)) {
+		dquot_resume(s, -1);
 		finish_unfinished(s);
 		reiserfs_xattr_init(s, *mount_flags);
 	}
@@ -2022,15 +2038,15 @@ static int reiserfs_write_info(struct super_block *sb, int type)
  */
 static int reiserfs_quota_on_mount(struct super_block *sb, int type)
 {
-	return vfs_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
-				  REISERFS_SB(sb)->s_jquota_fmt, type);
+	return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
+					REISERFS_SB(sb)->s_jquota_fmt, type);
 }
 
 /*
  * Standard function to be called on quota_on
  */
 static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
-			     char *name, int remount)
+			     char *name)
 {
 	int err;
 	struct path path;
@@ -2039,9 +2055,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 
 	if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
 		return -EINVAL;
-	/* No more checks needed? Path and format_id are bogus anyway... */
-	if (remount)
-		return vfs_quota_on(sb, type, format_id, name, 1);
+
 	err = kern_path(name, LOOKUP_FOLLOW, &path);
 	if (err)
 		return err;
@@ -2085,7 +2099,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 		if (err)
 			goto out;
 	}
-	err = vfs_quota_on_path(sb, type, format_id, &path);
+	err = dquot_quota_on_path(sb, type, format_id, &path);
 out:
 	path_put(&path);
 	return err;
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 6c978428892d..00a70cab1f36 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -37,6 +37,7 @@ static int smb_link(struct dentry *, struct inode *, struct dentry *);
 
 const struct file_operations smb_dir_operations =
 {
+	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
 	.readdir = smb_readdir,
 	.unlocked_ioctl = smb_ioctl,
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index 84ecf0e43f91..8e187a0f94bb 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -28,8 +28,9 @@
28#include "proto.h" 28#include "proto.h"
29 29
30static int 30static int
31smb_fsync(struct file *file, struct dentry * dentry, int datasync) 31smb_fsync(struct file *file, int datasync)
32{ 32{
33 struct dentry *dentry = file->f_path.dentry;
33 struct smb_sb_info *server = server_from_dentry(dentry); 34 struct smb_sb_info *server = server_from_dentry(dentry);
34 int result; 35 int result;
35 36
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index dfa1d67f8fca..9551cb6f7fe4 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -714,7 +714,7 @@ smb_notify_change(struct dentry *dentry, struct iattr *attr)
714 error = server->ops->truncate(inode, attr->ia_size); 714 error = server->ops->truncate(inode, attr->ia_size);
715 if (error) 715 if (error)
716 goto out; 716 goto out;
717 error = vmtruncate(inode, attr->ia_size); 717 error = simple_setsize(inode, attr->ia_size);
718 if (error) 718 if (error)
719 goto out; 719 goto out;
720 refresh = 1; 720 refresh = 1;
diff --git a/fs/splice.c b/fs/splice.c
index ac22b00d86c3..740e6b9faf7a 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -354,7 +354,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
354 break; 354 break;
355 355
356 error = add_to_page_cache_lru(page, mapping, index, 356 error = add_to_page_cache_lru(page, mapping, index,
357 mapping_gfp_mask(mapping)); 357 GFP_KERNEL);
358 if (unlikely(error)) { 358 if (unlikely(error)) {
359 page_cache_release(page); 359 page_cache_release(page);
360 if (error == -EEXIST) 360 if (error == -EEXIST)
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 25a00d19d686..cc6ce8a84c21 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -26,6 +26,17 @@ config SQUASHFS
 
 	  If unsure, say N.
 
+config SQUASHFS_XATTRS
+	bool "Squashfs XATTR support"
+	depends on SQUASHFS
+	default n
+	help
+	  Saying Y here includes support for extended attributes (xattrs).
+	  Xattrs are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page).
+
+	  If unsure, say N.
+
 config SQUASHFS_EMBEDDED
 
 	bool "Additional option for memory-constrained systems"
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index df8a19ef870d..2cee3e9fa452 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,3 +5,5 @@
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
 squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
+squashfs-$(CONFIG_SQUASHFS_XATTRS) += xattr.o xattr_id.o
+
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 49daaf669e41..62e63ad25075 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,11 +40,13 @@
40 40
41#include <linux/fs.h> 41#include <linux/fs.h>
42#include <linux/vfs.h> 42#include <linux/vfs.h>
43#include <linux/xattr.h>
43 44
44#include "squashfs_fs.h" 45#include "squashfs_fs.h"
45#include "squashfs_fs_sb.h" 46#include "squashfs_fs_sb.h"
46#include "squashfs_fs_i.h" 47#include "squashfs_fs_i.h"
47#include "squashfs.h" 48#include "squashfs.h"
49#include "xattr.h"
48 50
49/* 51/*
50 * Initialise VFS inode with the base inode information common to all 52 * Initialise VFS inode with the base inode information common to all
@@ -111,6 +113,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
111 int err, type, offset = SQUASHFS_INODE_OFFSET(ino); 113 int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
112 union squashfs_inode squashfs_ino; 114 union squashfs_inode squashfs_ino;
113 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; 115 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
116 int xattr_id = SQUASHFS_INVALID_XATTR;
114 117
115 TRACE("Entered squashfs_read_inode\n"); 118 TRACE("Entered squashfs_read_inode\n");
116 119
@@ -199,8 +202,10 @@ int squashfs_read_inode(struct inode *inode, long long ino)
199 frag_offset = 0; 202 frag_offset = 0;
200 } 203 }
201 204
205 xattr_id = le32_to_cpu(sqsh_ino->xattr);
202 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 206 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
203 inode->i_size = le64_to_cpu(sqsh_ino->file_size); 207 inode->i_size = le64_to_cpu(sqsh_ino->file_size);
208 inode->i_op = &squashfs_inode_ops;
204 inode->i_fop = &generic_ro_fops; 209 inode->i_fop = &generic_ro_fops;
205 inode->i_mode |= S_IFREG; 210 inode->i_mode |= S_IFREG;
206 inode->i_blocks = ((inode->i_size - 211 inode->i_blocks = ((inode->i_size -
@@ -251,6 +256,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
251 if (err < 0) 256 if (err < 0)
252 goto failed_read; 257 goto failed_read;
253 258
259 xattr_id = le32_to_cpu(sqsh_ino->xattr);
254 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 260 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
255 inode->i_size = le32_to_cpu(sqsh_ino->file_size); 261 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
256 inode->i_op = &squashfs_dir_inode_ops; 262 inode->i_op = &squashfs_dir_inode_ops;
@@ -280,21 +286,33 @@ int squashfs_read_inode(struct inode *inode, long long ino)
280 286
281 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 287 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
282 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); 288 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
283 inode->i_op = &page_symlink_inode_operations; 289 inode->i_op = &squashfs_symlink_inode_ops;
284 inode->i_data.a_ops = &squashfs_symlink_aops; 290 inode->i_data.a_ops = &squashfs_symlink_aops;
285 inode->i_mode |= S_IFLNK; 291 inode->i_mode |= S_IFLNK;
286 squashfs_i(inode)->start = block; 292 squashfs_i(inode)->start = block;
287 squashfs_i(inode)->offset = offset; 293 squashfs_i(inode)->offset = offset;
288 294
295 if (type == SQUASHFS_LSYMLINK_TYPE) {
296 __le32 xattr;
297
298 err = squashfs_read_metadata(sb, NULL, &block,
299 &offset, inode->i_size);
300 if (err < 0)
301 goto failed_read;
302 err = squashfs_read_metadata(sb, &xattr, &block,
303 &offset, sizeof(xattr));
304 if (err < 0)
305 goto failed_read;
306 xattr_id = le32_to_cpu(xattr);
307 }
308
289 TRACE("Symbolic link inode %x:%x, start_block %llx, offset " 309 TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
290 "%x\n", SQUASHFS_INODE_BLK(ino), offset, 310 "%x\n", SQUASHFS_INODE_BLK(ino), offset,
291 block, offset); 311 block, offset);
292 break; 312 break;
293 } 313 }
294 case SQUASHFS_BLKDEV_TYPE: 314 case SQUASHFS_BLKDEV_TYPE:
295 case SQUASHFS_CHRDEV_TYPE: 315 case SQUASHFS_CHRDEV_TYPE: {
296 case SQUASHFS_LBLKDEV_TYPE:
297 case SQUASHFS_LCHRDEV_TYPE: {
298 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; 316 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
299 unsigned int rdev; 317 unsigned int rdev;
300 318
@@ -315,10 +333,32 @@ int squashfs_read_inode(struct inode *inode, long long ino)
315 SQUASHFS_INODE_BLK(ino), offset, rdev); 333 SQUASHFS_INODE_BLK(ino), offset, rdev);
316 break; 334 break;
317 } 335 }
336 case SQUASHFS_LBLKDEV_TYPE:
337 case SQUASHFS_LCHRDEV_TYPE: {
338 struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev;
339 unsigned int rdev;
340
341 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
342 sizeof(*sqsh_ino));
343 if (err < 0)
344 goto failed_read;
345
346 if (type == SQUASHFS_LCHRDEV_TYPE)
347 inode->i_mode |= S_IFCHR;
348 else
349 inode->i_mode |= S_IFBLK;
350 xattr_id = le32_to_cpu(sqsh_ino->xattr);
351 inode->i_op = &squashfs_inode_ops;
352 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
353 rdev = le32_to_cpu(sqsh_ino->rdev);
354 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
355
356 TRACE("Device inode %x:%x, rdev %x\n",
357 SQUASHFS_INODE_BLK(ino), offset, rdev);
358 break;
359 }
318 case SQUASHFS_FIFO_TYPE: 360 case SQUASHFS_FIFO_TYPE:
319 case SQUASHFS_SOCKET_TYPE: 361 case SQUASHFS_SOCKET_TYPE: {
320 case SQUASHFS_LFIFO_TYPE:
321 case SQUASHFS_LSOCKET_TYPE: {
322 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; 362 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
323 363
324 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, 364 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
@@ -334,14 +374,52 @@ int squashfs_read_inode(struct inode *inode, long long ino)
334 init_special_inode(inode, inode->i_mode, 0); 374 init_special_inode(inode, inode->i_mode, 0);
335 break; 375 break;
336 } 376 }
377 case SQUASHFS_LFIFO_TYPE:
378 case SQUASHFS_LSOCKET_TYPE: {
379 struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc;
380
381 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
382 sizeof(*sqsh_ino));
383 if (err < 0)
384 goto failed_read;
385
386 if (type == SQUASHFS_LFIFO_TYPE)
387 inode->i_mode |= S_IFIFO;
388 else
389 inode->i_mode |= S_IFSOCK;
390 xattr_id = le32_to_cpu(sqsh_ino->xattr);
391 inode->i_op = &squashfs_inode_ops;
392 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
393 init_special_inode(inode, inode->i_mode, 0);
394 break;
395 }
337 default: 396 default:
338 ERROR("Unknown inode type %d in squashfs_iget!\n", type); 397 ERROR("Unknown inode type %d in squashfs_iget!\n", type);
339 return -EINVAL; 398 return -EINVAL;
340 } 399 }
341 400
401 if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) {
402 err = squashfs_xattr_lookup(sb, xattr_id,
403 &squashfs_i(inode)->xattr_count,
404 &squashfs_i(inode)->xattr_size,
405 &squashfs_i(inode)->xattr);
406 if (err < 0)
407 goto failed_read;
408 inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9)
409 + 1;
410 } else
411 squashfs_i(inode)->xattr_count = 0;
412
342 return 0; 413 return 0;
343 414
344failed_read: 415failed_read:
345 ERROR("Unable to read inode 0x%llx\n", ino); 416 ERROR("Unable to read inode 0x%llx\n", ino);
346 return err; 417 return err;
347} 418}
419
420
421const struct inode_operations squashfs_inode_ops = {
422 .getxattr = generic_getxattr,
423 .listxattr = squashfs_listxattr
424};
425
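The i_blocks update above charges an inode's xattr data to the inode in
512-byte units, hence the ((xattr_size - 1) >> 9) + 1 rounding. With
hypothetical sizes:

	xattr_size = 1   -> ((1   - 1) >> 9) + 1 = 1 sector
	xattr_size = 512 -> ((512 - 1) >> 9) + 1 = 1 sector
	xattr_size = 513 -> ((513 - 1) >> 9) + 1 = 2 sectors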
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 5266bd8ad932..7a9464d08cf6 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,11 +57,13 @@
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/string.h> 58#include <linux/string.h>
59#include <linux/dcache.h> 59#include <linux/dcache.h>
60#include <linux/xattr.h>
60 61
61#include "squashfs_fs.h" 62#include "squashfs_fs.h"
62#include "squashfs_fs_sb.h" 63#include "squashfs_fs_sb.h"
63#include "squashfs_fs_i.h" 64#include "squashfs_fs_i.h"
64#include "squashfs.h" 65#include "squashfs.h"
66#include "xattr.h"
65 67
66/* 68/*
67 * Lookup name in the directory index, returning the location of the metadata 69 * Lookup name in the directory index, returning the location of the metadata
@@ -237,5 +239,7 @@ failed:
237 239
238 240
239const struct inode_operations squashfs_dir_inode_ops = { 241const struct inode_operations squashfs_dir_inode_ops = {
240 .lookup = squashfs_lookup 242 .lookup = squashfs_lookup,
243 .getxattr = generic_getxattr,
244 .listxattr = squashfs_listxattr
241}; 245};
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index fe2587af5512..733a17c42945 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -73,8 +73,11 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
73 unsigned int); 73 unsigned int);
74extern int squashfs_read_inode(struct inode *, long long); 74extern int squashfs_read_inode(struct inode *, long long);
75 75
76/* xattr.c */
77extern ssize_t squashfs_listxattr(struct dentry *, char *, size_t);
78
76/* 79/*
77 * Inodes, files and decompressor operations 80 * Inodes, files, decompressor and xattr operations
78 */ 81 */
79 82
80/* dir.c */ 83/* dir.c */
@@ -86,11 +89,18 @@ extern const struct export_operations squashfs_export_ops;
86/* file.c */ 89/* file.c */
87extern const struct address_space_operations squashfs_aops; 90extern const struct address_space_operations squashfs_aops;
88 91
92/* inode.c */
93extern const struct inode_operations squashfs_inode_ops;
94
89/* namei.c */ 95/* namei.c */
90extern const struct inode_operations squashfs_dir_inode_ops; 96extern const struct inode_operations squashfs_dir_inode_ops;
91 97
92/* symlink.c */ 98/* symlink.c */
93extern const struct address_space_operations squashfs_symlink_aops; 99extern const struct address_space_operations squashfs_symlink_aops;
100extern const struct inode_operations squashfs_symlink_inode_ops;
101
102/* xattr.c */
103extern const struct xattr_handler *squashfs_xattr_handlers[];
94 104
95/* zlib_wrapper.c */ 105/* zlib_wrapper.c */
96extern const struct squashfs_decompressor squashfs_zlib_comp_ops; 106extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 79024245ea00..8eabb808b78d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -46,6 +46,7 @@
46#define SQUASHFS_NAME_LEN 256 46#define SQUASHFS_NAME_LEN 256
47 47
48#define SQUASHFS_INVALID_FRAG (0xffffffffU) 48#define SQUASHFS_INVALID_FRAG (0xffffffffU)
49#define SQUASHFS_INVALID_XATTR (0xffffffffU)
49#define SQUASHFS_INVALID_BLK (-1LL) 50#define SQUASHFS_INVALID_BLK (-1LL)
50 51
51/* Filesystem flags */ 52/* Filesystem flags */
@@ -96,6 +97,13 @@
96#define SQUASHFS_LFIFO_TYPE 13 97#define SQUASHFS_LFIFO_TYPE 13
97#define SQUASHFS_LSOCKET_TYPE 14 98#define SQUASHFS_LSOCKET_TYPE 14
98 99
100/* Xattr types */
101#define SQUASHFS_XATTR_USER 0
102#define SQUASHFS_XATTR_TRUSTED 1
103#define SQUASHFS_XATTR_SECURITY 2
104#define SQUASHFS_XATTR_VALUE_OOL 256
105#define SQUASHFS_XATTR_PREFIX_MASK 0xff
106
99/* Flag whether block is compressed or uncompressed, bit is set if block is 107/* Flag whether block is compressed or uncompressed, bit is set if block is
100 * uncompressed */ 108 * uncompressed */
101#define SQUASHFS_COMPRESSED_BIT (1 << 15) 109#define SQUASHFS_COMPRESSED_BIT (1 << 15)
@@ -174,6 +182,24 @@
174 182
175#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ 183#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\
176 sizeof(u64)) 184 sizeof(u64))
185/* xattr id lookup table defines */
186#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id))
187
188#define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \
189 SQUASHFS_METADATA_SIZE)
190
191#define SQUASHFS_XATTR_BLOCK_OFFSET(A) (SQUASHFS_XATTR_BYTES(A) % \
192 SQUASHFS_METADATA_SIZE)
193
194#define SQUASHFS_XATTR_BLOCKS(A) ((SQUASHFS_XATTR_BYTES(A) + \
195 SQUASHFS_METADATA_SIZE - 1) / \
196 SQUASHFS_METADATA_SIZE)
197
198#define SQUASHFS_XATTR_BLOCK_BYTES(A) (SQUASHFS_XATTR_BLOCKS(A) *\
199 sizeof(u64))
200#define SQUASHFS_XATTR_BLK(A) ((unsigned int) ((A) >> 16))
201
202#define SQUASHFS_XATTR_OFFSET(A) ((unsigned int) ((A) & 0xffff))
177 203
178/* cached data constants for filesystem */ 204/* cached data constants for filesystem */
179#define SQUASHFS_CACHED_BLKS 8 205#define SQUASHFS_CACHED_BLKS 8
@@ -228,7 +254,7 @@ struct squashfs_super_block {
228 __le64 root_inode; 254 __le64 root_inode;
229 __le64 bytes_used; 255 __le64 bytes_used;
230 __le64 id_table_start; 256 __le64 id_table_start;
231 __le64 xattr_table_start; 257 __le64 xattr_id_table_start;
232 __le64 inode_table_start; 258 __le64 inode_table_start;
233 __le64 directory_table_start; 259 __le64 directory_table_start;
234 __le64 fragment_table_start; 260 __le64 fragment_table_start;
@@ -261,6 +287,17 @@ struct squashfs_ipc_inode {
261 __le32 nlink; 287 __le32 nlink;
262}; 288};
263 289
290struct squashfs_lipc_inode {
291 __le16 inode_type;
292 __le16 mode;
293 __le16 uid;
294 __le16 guid;
295 __le32 mtime;
296 __le32 inode_number;
297 __le32 nlink;
298 __le32 xattr;
299};
300
264struct squashfs_dev_inode { 301struct squashfs_dev_inode {
265 __le16 inode_type; 302 __le16 inode_type;
266 __le16 mode; 303 __le16 mode;
@@ -272,6 +309,18 @@ struct squashfs_dev_inode {
272 __le32 rdev; 309 __le32 rdev;
273}; 310};
274 311
312struct squashfs_ldev_inode {
313 __le16 inode_type;
314 __le16 mode;
315 __le16 uid;
316 __le16 guid;
317 __le32 mtime;
318 __le32 inode_number;
319 __le32 nlink;
320 __le32 rdev;
321 __le32 xattr;
322};
323
275struct squashfs_symlink_inode { 324struct squashfs_symlink_inode {
276 __le16 inode_type; 325 __le16 inode_type;
277 __le16 mode; 326 __le16 mode;
@@ -349,12 +398,14 @@ struct squashfs_ldir_inode {
349union squashfs_inode { 398union squashfs_inode {
350 struct squashfs_base_inode base; 399 struct squashfs_base_inode base;
351 struct squashfs_dev_inode dev; 400 struct squashfs_dev_inode dev;
401 struct squashfs_ldev_inode ldev;
352 struct squashfs_symlink_inode symlink; 402 struct squashfs_symlink_inode symlink;
353 struct squashfs_reg_inode reg; 403 struct squashfs_reg_inode reg;
354 struct squashfs_lreg_inode lreg; 404 struct squashfs_lreg_inode lreg;
355 struct squashfs_dir_inode dir; 405 struct squashfs_dir_inode dir;
356 struct squashfs_ldir_inode ldir; 406 struct squashfs_ldir_inode ldir;
357 struct squashfs_ipc_inode ipc; 407 struct squashfs_ipc_inode ipc;
408 struct squashfs_lipc_inode lipc;
358}; 409};
359 410
360struct squashfs_dir_entry { 411struct squashfs_dir_entry {
@@ -377,4 +428,27 @@ struct squashfs_fragment_entry {
377 unsigned int unused; 428 unsigned int unused;
378}; 429};
379 430
431struct squashfs_xattr_entry {
432 __le16 type;
433 __le16 size;
434 char data[0];
435};
436
437struct squashfs_xattr_val {
438 __le32 vsize;
439 char value[0];
440};
441
442struct squashfs_xattr_id {
443 __le64 xattr;
444 __le32 count;
445 __le32 size;
446};
447
448struct squashfs_xattr_id_table {
449 __le64 xattr_table_start;
450 __le32 xattr_ids;
451 __le32 unused;
452};
453
380#endif 454#endif
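The SQUASHFS_XATTR_BLK/SQUASHFS_XATTR_OFFSET macros added above treat a
64-bit xattr reference as a packed pair: the top 48 bits give the position
of the metadata block (relative to the start of the xattr table, to which
xattr.c later adds msblk->xattr_table), the bottom 16 bits the byte offset
within the uncompressed block. A minimal standalone sketch of that
arithmetic, not kernel code, with a made-up sample value:

	#include <stdint.h>
	#include <stdio.h>

	static unsigned int xattr_blk(uint64_t ref)	/* SQUASHFS_XATTR_BLK */
	{
		return (unsigned int)(ref >> 16);
	}

	static unsigned int xattr_off(uint64_t ref)	/* SQUASHFS_XATTR_OFFSET */
	{
		return (unsigned int)(ref & 0xffff);
	}

	int main(void)
	{
		uint64_t ref = ((uint64_t)0x1a000 << 16) | 0x123; /* hypothetical */

		printf("metadata block %#x, offset %#x\n",
		       xattr_blk(ref), xattr_off(ref));
		return 0;
	}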
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index fbfca30c0c68..d3e3a37f28a1 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -26,6 +26,9 @@
26struct squashfs_inode_info { 26struct squashfs_inode_info {
27 u64 start; 27 u64 start;
28 int offset; 28 int offset;
29 u64 xattr;
30 unsigned int xattr_size;
31 int xattr_count;
29 union { 32 union {
30 struct { 33 struct {
31 u64 fragment_block; 34 u64 fragment_block;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 2e77dc547e25..d9037a5215f0 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -61,6 +61,7 @@ struct squashfs_sb_info {
61 int next_meta_index; 61 int next_meta_index;
62 __le64 *id_table; 62 __le64 *id_table;
63 __le64 *fragment_index; 63 __le64 *fragment_index;
64 __le64 *xattr_id_table;
64 struct mutex read_data_mutex; 65 struct mutex read_data_mutex;
65 struct mutex meta_index_mutex; 66 struct mutex meta_index_mutex;
66 struct meta_index *meta_index; 67 struct meta_index *meta_index;
@@ -68,9 +69,11 @@ struct squashfs_sb_info {
68 __le64 *inode_lookup_table; 69 __le64 *inode_lookup_table;
69 u64 inode_table; 70 u64 inode_table;
70 u64 directory_table; 71 u64 directory_table;
72 u64 xattr_table;
71 unsigned int block_size; 73 unsigned int block_size;
72 unsigned short block_log; 74 unsigned short block_log;
73 long long bytes_used; 75 long long bytes_used;
74 unsigned int inodes; 76 unsigned int inodes;
77 int xattr_ids;
75}; 78};
76#endif 79#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 48b6f4a385a6..88b4f8606652 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -36,12 +36,14 @@
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/module.h> 37#include <linux/module.h>
38#include <linux/magic.h> 38#include <linux/magic.h>
39#include <linux/xattr.h>
39 40
40#include "squashfs_fs.h" 41#include "squashfs_fs.h"
41#include "squashfs_fs_sb.h" 42#include "squashfs_fs_sb.h"
42#include "squashfs_fs_i.h" 43#include "squashfs_fs_i.h"
43#include "squashfs.h" 44#include "squashfs.h"
44#include "decompressor.h" 45#include "decompressor.h"
46#include "xattr.h"
45 47
46static struct file_system_type squashfs_fs_type; 48static struct file_system_type squashfs_fs_type;
47static const struct super_operations squashfs_super_ops; 49static const struct super_operations squashfs_super_ops;
@@ -82,7 +84,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
82 long long root_inode; 84 long long root_inode;
83 unsigned short flags; 85 unsigned short flags;
84 unsigned int fragments; 86 unsigned int fragments;
85 u64 lookup_table_start; 87 u64 lookup_table_start, xattr_id_table_start;
86 int err; 88 int err;
87 89
88 TRACE("Entered squashfs_fill_superblock\n"); 90 TRACE("Entered squashfs_fill_superblock\n");
@@ -139,13 +141,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
139 if (msblk->decompressor == NULL) 141 if (msblk->decompressor == NULL)
140 goto failed_mount; 142 goto failed_mount;
141 143
142 /*
143 * Check if there's xattrs in the filesystem. These are not
144 * supported in this version, so warn that they will be ignored.
145 */
146 if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
147 ERROR("Xattrs in filesystem, these will be ignored\n");
148
149 /* Check the filesystem does not extend beyond the end of the 144 /* Check the filesystem does not extend beyond the end of the
150 block device */ 145 block device */
151 msblk->bytes_used = le64_to_cpu(sblk->bytes_used); 146 msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
@@ -253,7 +248,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
253allocate_lookup_table: 248allocate_lookup_table:
254 lookup_table_start = le64_to_cpu(sblk->lookup_table_start); 249 lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
255 if (lookup_table_start == SQUASHFS_INVALID_BLK) 250 if (lookup_table_start == SQUASHFS_INVALID_BLK)
256 goto allocate_root; 251 goto allocate_xattr_table;
257 252
258 /* Allocate and read inode lookup table */ 253 /* Allocate and read inode lookup table */
259 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, 254 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
@@ -266,6 +261,21 @@ allocate_lookup_table:
266 261
267 sb->s_export_op = &squashfs_export_ops; 262 sb->s_export_op = &squashfs_export_ops;
268 263
264allocate_xattr_table:
265 sb->s_xattr = squashfs_xattr_handlers;
266 xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start);
267 if (xattr_id_table_start == SQUASHFS_INVALID_BLK)
268 goto allocate_root;
269
270 /* Allocate and read xattr id lookup table */
271 msblk->xattr_id_table = squashfs_read_xattr_id_table(sb,
272 xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids);
273 if (IS_ERR(msblk->xattr_id_table)) {
274 err = PTR_ERR(msblk->xattr_id_table);
275 msblk->xattr_id_table = NULL;
276 if (err != -ENOTSUPP)
277 goto failed_mount;
278 }
269allocate_root: 279allocate_root:
270 root = new_inode(sb); 280 root = new_inode(sb);
271 if (!root) { 281 if (!root) {
@@ -301,6 +311,7 @@ failed_mount:
301 kfree(msblk->inode_lookup_table); 311 kfree(msblk->inode_lookup_table);
302 kfree(msblk->fragment_index); 312 kfree(msblk->fragment_index);
303 kfree(msblk->id_table); 313 kfree(msblk->id_table);
314 kfree(msblk->xattr_id_table);
304 kfree(sb->s_fs_info); 315 kfree(sb->s_fs_info);
305 sb->s_fs_info = NULL; 316 sb->s_fs_info = NULL;
306 kfree(sblk); 317 kfree(sblk);
@@ -355,6 +366,7 @@ static void squashfs_put_super(struct super_block *sb)
355 kfree(sbi->fragment_index); 366 kfree(sbi->fragment_index);
356 kfree(sbi->meta_index); 367 kfree(sbi->meta_index);
357 kfree(sbi->inode_lookup_table); 368 kfree(sbi->inode_lookup_table);
369 kfree(sbi->xattr_id_table);
358 kfree(sb->s_fs_info); 370 kfree(sb->s_fs_info);
359 sb->s_fs_info = NULL; 371 sb->s_fs_info = NULL;
360 } 372 }
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 32b911f4ee39..ec86434921e1 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -35,11 +35,13 @@
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/string.h> 36#include <linux/string.h>
37#include <linux/pagemap.h> 37#include <linux/pagemap.h>
38#include <linux/xattr.h>
38 39
39#include "squashfs_fs.h" 40#include "squashfs_fs.h"
40#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
41#include "squashfs_fs_i.h" 42#include "squashfs_fs_i.h"
42#include "squashfs.h" 43#include "squashfs.h"
44#include "xattr.h"
43 45
44static int squashfs_symlink_readpage(struct file *file, struct page *page) 46static int squashfs_symlink_readpage(struct file *file, struct page *page)
45{ 47{
@@ -114,3 +116,12 @@ error_out:
114const struct address_space_operations squashfs_symlink_aops = { 116const struct address_space_operations squashfs_symlink_aops = {
115 .readpage = squashfs_symlink_readpage 117 .readpage = squashfs_symlink_readpage
116}; 118};
119
120const struct inode_operations squashfs_symlink_inode_ops = {
121 .readlink = generic_readlink,
122 .follow_link = page_follow_link_light,
123 .put_link = page_put_link,
124 .getxattr = generic_getxattr,
125 .listxattr = squashfs_listxattr
126};
127
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
new file mode 100644
index 000000000000..c7655e8b31cd
--- /dev/null
+++ b/fs/squashfs/xattr.c
@@ -0,0 +1,323 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr.c
22 */
23
24#include <linux/init.h>
25#include <linux/module.h>
26#include <linux/string.h>
27#include <linux/fs.h>
28#include <linux/vfs.h>
29#include <linux/xattr.h>
30#include <linux/slab.h>
31
32#include "squashfs_fs.h"
33#include "squashfs_fs_sb.h"
34#include "squashfs_fs_i.h"
35#include "squashfs.h"
36
37static const struct xattr_handler *squashfs_xattr_handler(int);
38
39ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
40 size_t buffer_size)
41{
42 struct inode *inode = d->d_inode;
43 struct super_block *sb = inode->i_sb;
44 struct squashfs_sb_info *msblk = sb->s_fs_info;
45 u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
46 + msblk->xattr_table;
47 int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
48 int count = squashfs_i(inode)->xattr_count;
49 size_t rest = buffer_size;
50 int err;
51
52 /* check that the file system has xattrs */
53 if (msblk->xattr_id_table == NULL)
54 return -EOPNOTSUPP;
55
56 /* loop reading each xattr name */
57 while (count--) {
58 struct squashfs_xattr_entry entry;
59 struct squashfs_xattr_val val;
60 const struct xattr_handler *handler;
61 int name_size, prefix_size = 0;
62
63 err = squashfs_read_metadata(sb, &entry, &start, &offset,
64 sizeof(entry));
65 if (err < 0)
66 goto failed;
67
68 name_size = le16_to_cpu(entry.size);
69 handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
70 if (handler)
71 prefix_size = handler->list(d, buffer, rest, NULL,
72 name_size, handler->flags);
73 if (prefix_size) {
74 if (buffer) {
75 if (prefix_size + name_size + 1 > rest) {
76 err = -ERANGE;
77 goto failed;
78 }
79 buffer += prefix_size;
80 }
81 err = squashfs_read_metadata(sb, buffer, &start,
82 &offset, name_size);
83 if (err < 0)
84 goto failed;
85 if (buffer) {
86 buffer[name_size] = '\0';
87 buffer += name_size + 1;
88 }
89 rest -= prefix_size + name_size + 1;
90 } else {
91 /* no handler or insufficient privileges, so skip */
92 err = squashfs_read_metadata(sb, NULL, &start,
93 &offset, name_size);
94 if (err < 0)
95 goto failed;
96 }
97
98
99 /* skip remaining xattr entry */
100 err = squashfs_read_metadata(sb, &val, &start, &offset,
101 sizeof(val));
102 if (err < 0)
103 goto failed;
104
105 err = squashfs_read_metadata(sb, NULL, &start, &offset,
106 le32_to_cpu(val.vsize));
107 if (err < 0)
108 goto failed;
109 }
110 err = buffer_size - rest;
111
112failed:
113 return err;
114}
115
116
117static int squashfs_xattr_get(struct inode *inode, int name_index,
118 const char *name, void *buffer, size_t buffer_size)
119{
120 struct super_block *sb = inode->i_sb;
121 struct squashfs_sb_info *msblk = sb->s_fs_info;
122 u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
123 + msblk->xattr_table;
124 int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
125 int count = squashfs_i(inode)->xattr_count;
126 int name_len = strlen(name);
127 int err, vsize;
128 char *target = kmalloc(name_len, GFP_KERNEL);
129
130 if (target == NULL)
131 return -ENOMEM;
132
133 /* loop reading each xattr name */
134 for (; count; count--) {
135 struct squashfs_xattr_entry entry;
136 struct squashfs_xattr_val val;
137 int type, prefix, name_size;
138
139 err = squashfs_read_metadata(sb, &entry, &start, &offset,
140 sizeof(entry));
141 if (err < 0)
142 goto failed;
143
144 name_size = le16_to_cpu(entry.size);
145 type = le16_to_cpu(entry.type);
146 prefix = type & SQUASHFS_XATTR_PREFIX_MASK;
147
148 if (prefix == name_index && name_size == name_len)
149 err = squashfs_read_metadata(sb, target, &start,
150 &offset, name_size);
151 else
152 err = squashfs_read_metadata(sb, NULL, &start,
153 &offset, name_size);
154 if (err < 0)
155 goto failed;
156
157 if (prefix == name_index && name_size == name_len &&
158 strncmp(target, name, name_size) == 0) {
159 /* found xattr */
160 if (type & SQUASHFS_XATTR_VALUE_OOL) {
161 __le64 xattr;
162 /* val is a reference to the real location */
163 err = squashfs_read_metadata(sb, &val, &start,
164 &offset, sizeof(val));
165 if (err < 0)
166 goto failed;
167 err = squashfs_read_metadata(sb, &xattr, &start,
168 &offset, sizeof(xattr));
169 if (err < 0)
170 goto failed;
171 /* the reference is stored little-endian on disk */
172 start = SQUASHFS_XATTR_BLK(le64_to_cpu(xattr)) +
173 msblk->xattr_table;
174 offset = SQUASHFS_XATTR_OFFSET(le64_to_cpu(xattr));
175 }
176 /* read xattr value */
177 err = squashfs_read_metadata(sb, &val, &start, &offset,
178 sizeof(val));
179 if (err < 0)
180 goto failed;
181
182 vsize = le32_to_cpu(val.vsize);
183 if (buffer) {
184 if (vsize > buffer_size) {
185 err = -ERANGE;
186 goto failed;
187 }
188 err = squashfs_read_metadata(sb, buffer, &start,
189 &offset, vsize);
190 if (err < 0)
191 goto failed;
192 }
193 break;
194 }
195
196 /* no match, skip remaining xattr entry */
197 err = squashfs_read_metadata(sb, &val, &start, &offset,
198 sizeof(val));
199 if (err < 0)
200 goto failed;
201 err = squashfs_read_metadata(sb, NULL, &start, &offset,
202 le32_to_cpu(val.vsize));
203 if (err < 0)
204 goto failed;
205 }
206 err = count ? vsize : -ENODATA;
207
208failed:
209 kfree(target);
210 return err;
211}
212
213
214/*
215 * User namespace support
216 */
217static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size,
218 const char *name, size_t name_len, int type)
219{
220 if (list && XATTR_USER_PREFIX_LEN <= list_size)
221 memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
222 return XATTR_USER_PREFIX_LEN;
223}
224
225static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
226 size_t size, int type)
227{
228 if (name[0] == '\0')
229 return -EINVAL;
230
231 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name,
232 buffer, size);
233}
234
235static const struct xattr_handler squashfs_xattr_user_handler = {
236 .prefix = XATTR_USER_PREFIX,
237 .list = squashfs_user_list,
238 .get = squashfs_user_get
239};
240
241/*
242 * Trusted namespace support
243 */
244static size_t squashfs_trusted_list(struct dentry *d, char *list,
245 size_t list_size, const char *name, size_t name_len, int type)
246{
247 if (!capable(CAP_SYS_ADMIN))
248 return 0;
249
250 if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size)
251 memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
252 return XATTR_TRUSTED_PREFIX_LEN;
253}
254
255static int squashfs_trusted_get(struct dentry *d, const char *name,
256 void *buffer, size_t size, int type)
257{
258 if (name[0] == '\0')
259 return -EINVAL;
260
261 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name,
262 buffer, size);
263}
264
265static const struct xattr_handler squashfs_xattr_trusted_handler = {
266 .prefix = XATTR_TRUSTED_PREFIX,
267 .list = squashfs_trusted_list,
268 .get = squashfs_trusted_get
269};
270
271/*
272 * Security namespace support
273 */
274static size_t squashfs_security_list(struct dentry *d, char *list,
275 size_t list_size, const char *name, size_t name_len, int type)
276{
277 if (list && XATTR_SECURITY_PREFIX_LEN <= list_size)
278 memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
279 return XATTR_SECURITY_PREFIX_LEN;
280}
281
282static int squashfs_security_get(struct dentry *d, const char *name,
283 void *buffer, size_t size, int type)
284{
285 if (name[0] == '\0')
286 return -EINVAL;
287
288 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name,
289 buffer, size);
290}
291
292static const struct xattr_handler squashfs_xattr_security_handler = {
293 .prefix = XATTR_SECURITY_PREFIX,
294 .list = squashfs_security_list,
295 .get = squashfs_security_get
296};
297
298static inline const struct xattr_handler *squashfs_xattr_handler(int type)
299{
300 if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL))
301 /* ignore unrecognised type */
302 return NULL;
303
304 switch (type & SQUASHFS_XATTR_PREFIX_MASK) {
305 case SQUASHFS_XATTR_USER:
306 return &squashfs_xattr_user_handler;
307 case SQUASHFS_XATTR_TRUSTED:
308 return &squashfs_xattr_trusted_handler;
309 case SQUASHFS_XATTR_SECURITY:
310 return &squashfs_xattr_security_handler;
311 default:
312 /* ignore unrecognised type */
313 return NULL;
314 }
315}
316
317const struct xattr_handler *squashfs_xattr_handlers[] = {
318 &squashfs_xattr_user_handler,
319 &squashfs_xattr_trusted_handler,
320 &squashfs_xattr_security_handler,
321 NULL
322};
323
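For context, the handlers above are ultimately driven from userspace
through the listxattr(2) and getxattr(2) syscalls. A minimal user-side
sketch; the mount point is hypothetical and nothing here is part of the
patch:

	#include <stdio.h>
	#include <string.h>
	#include <sys/xattr.h>

	int main(void)
	{
		const char *path = "/mnt/squashfs/file"; /* hypothetical mount */
		char names[1024], value[256];
		ssize_t n, v;
		size_t off = 0;

		n = listxattr(path, names, sizeof(names)); /* -> squashfs_listxattr */
		if (n < 0) {
			perror("listxattr");
			return 1;
		}
		while (off < (size_t)n) {
			const char *name = names + off;

			v = getxattr(path, name, value, sizeof(value)); /* -> handler->get */
			if (v >= 0)
				printf("%s = %.*s\n", name, (int)v, value);
			off += strlen(name) + 1;
		}
		return 0;
	}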
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
new file mode 100644
index 000000000000..9da071ae181c
--- /dev/null
+++ b/fs/squashfs/xattr.h
@@ -0,0 +1,46 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr.h
22 */
23
24#ifdef CONFIG_SQUASHFS_XATTRS
25extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
26 u64 *, int *);
27extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
28 unsigned int *, unsigned long long *);
29#else
30static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
31 u64 start, u64 *xattr_table_start, int *xattr_ids)
32{
33 ERROR("Xattrs in filesystem, these will be ignored\n");
34 return ERR_PTR(-ENOTSUPP);
35}
36
37static inline int squashfs_xattr_lookup(struct super_block *sb,
38 unsigned int index, int *count, unsigned int *size,
39 unsigned long long *xattr)
40{
41 return 0;
42}
43#define squashfs_listxattr NULL
44#define generic_getxattr NULL
45#define squashfs_xattr_handlers NULL
46#endif
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
new file mode 100644
index 000000000000..cfb41106098f
--- /dev/null
+++ b/fs/squashfs/xattr_id.c
@@ -0,0 +1,100 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr_id.c
22 */
23
24/*
25 * This file implements code to map the 32-bit xattr id stored in the inode
26 * into the on-disk location of the xattr data.
27 */
28
29#include <linux/fs.h>
30#include <linux/vfs.h>
31#include <linux/slab.h>
32
33#include "squashfs_fs.h"
34#include "squashfs_fs_sb.h"
35#include "squashfs_fs_i.h"
36#include "squashfs.h"
37
38/*
39 * Map xattr id using the xattr id lookup table
40 */
41int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
42 int *count, unsigned int *size, unsigned long long *xattr)
43{
44 struct squashfs_sb_info *msblk = sb->s_fs_info;
45 int block = SQUASHFS_XATTR_BLOCK(index);
46 int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index);
47 u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]);
48 struct squashfs_xattr_id id;
49 int err;
50
51 err = squashfs_read_metadata(sb, &id, &start_block, &offset,
52 sizeof(id));
53 if (err < 0)
54 return err;
55
56 *xattr = le64_to_cpu(id.xattr);
57 *size = le32_to_cpu(id.size);
58 *count = le32_to_cpu(id.count);
59 return 0;
60}
61
62
63/*
64 * Read uncompressed xattr id lookup table indexes from disk into memory
65 */
66__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
67 u64 *xattr_table_start, int *xattr_ids)
68{
69 unsigned int len;
70 __le64 *xid_table;
71 struct squashfs_xattr_id_table id_table;
72 int err;
73
74 err = squashfs_read_table(sb, &id_table, start, sizeof(id_table));
75 if (err < 0) {
76 ERROR("unable to read xattr id table\n");
77 return ERR_PTR(err);
78 }
79 *xattr_table_start = le64_to_cpu(id_table.xattr_table_start);
80 *xattr_ids = le32_to_cpu(id_table.xattr_ids);
81 len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
82
83 TRACE("In read_xattr_index_table, length %d\n", len);
84
85 /* Allocate xattr id lookup table indexes */
86 xid_table = kmalloc(len, GFP_KERNEL);
87 if (xid_table == NULL) {
88 ERROR("Failed to allocate xattr id index table\n");
89 return ERR_PTR(-ENOMEM);
90 }
91
92 err = squashfs_read_table(sb, xid_table, start + sizeof(id_table), len);
93 if (err < 0) {
94 ERROR("unable to read xattr id index table\n");
95 kfree(xid_table);
96 return ERR_PTR(err);
97 }
98
99 return xid_table;
100}
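The two-step mapping squashfs_xattr_lookup() performs can be followed
with concrete, made-up numbers: each struct squashfs_xattr_id is 16 bytes
and metadata blocks are 8192 bytes (SQUASHFS_METADATA_SIZE), so xattr id
1000 lives 16000 bytes into the table, i.e. in index block 1 at offset
7808. A standalone restatement of that arithmetic:

	#include <stdio.h>

	#define ID_SIZE       16   /* sizeof(struct squashfs_xattr_id) */
	#define METADATA_SIZE 8192 /* SQUASHFS_METADATA_SIZE */

	int main(void)
	{
		unsigned int index = 1000;                   /* hypothetical xattr id */
		unsigned int bytes = index * ID_SIZE;        /* SQUASHFS_XATTR_BYTES */
		unsigned int block = bytes / METADATA_SIZE;  /* SQUASHFS_XATTR_BLOCK */
		unsigned int offset = bytes % METADATA_SIZE; /* SQUASHFS_XATTR_BLOCK_OFFSET */

		/* msblk->xattr_id_table[block] then gives the on-disk start of
		 * that metadata block; the id struct is read at `offset`. */
		printf("id %u -> index block %u, offset %u\n", index, block, offset);
		return 0;
	}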
diff --git a/fs/super.c b/fs/super.c
index 69688b15f1fa..5c35bc7a499e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -24,7 +24,6 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/acct.h> 25#include <linux/acct.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/quotaops.h>
28#include <linux/mount.h> 27#include <linux/mount.h>
29#include <linux/security.h> 28#include <linux/security.h>
30#include <linux/writeback.h> /* for the emergency remount stuff */ 29#include <linux/writeback.h> /* for the emergency remount stuff */
@@ -94,8 +93,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
94 init_rwsem(&s->s_dquot.dqptr_sem); 93 init_rwsem(&s->s_dquot.dqptr_sem);
95 init_waitqueue_head(&s->s_wait_unfrozen); 94 init_waitqueue_head(&s->s_wait_unfrozen);
96 s->s_maxbytes = MAX_NON_LFS; 95 s->s_maxbytes = MAX_NON_LFS;
97 s->dq_op = sb_dquot_ops;
98 s->s_qcop = sb_quotactl_ops;
99 s->s_op = &default_op; 96 s->s_op = &default_op;
100 s->s_time_gran = 1000000000; 97 s->s_time_gran = 1000000000;
101 } 98 }
@@ -160,7 +157,6 @@ void deactivate_locked_super(struct super_block *s)
160{ 157{
161 struct file_system_type *fs = s->s_type; 158 struct file_system_type *fs = s->s_type;
162 if (atomic_dec_and_test(&s->s_active)) { 159 if (atomic_dec_and_test(&s->s_active)) {
163 vfs_dq_off(s, 0);
164 fs->kill_sb(s); 160 fs->kill_sb(s);
165 put_filesystem(fs); 161 put_filesystem(fs);
166 put_super(s); 162 put_super(s);
@@ -524,7 +520,7 @@ rescan:
524int do_remount_sb(struct super_block *sb, int flags, void *data, int force) 520int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
525{ 521{
526 int retval; 522 int retval;
527 int remount_rw, remount_ro; 523 int remount_ro;
528 524
529 if (sb->s_frozen != SB_UNFROZEN) 525 if (sb->s_frozen != SB_UNFROZEN)
530 return -EBUSY; 526 return -EBUSY;
@@ -540,7 +536,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
540 sync_filesystem(sb); 536 sync_filesystem(sb);
541 537
542 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); 538 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
543 remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
544 539
545 /* If we are remounting RDONLY and current sb is read/write, 540 /* If we are remounting RDONLY and current sb is read/write,
546 make sure there are no rw files opened */ 541 make sure there are no rw files opened */
@@ -549,9 +544,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
549 mark_files_ro(sb); 544 mark_files_ro(sb);
550 else if (!fs_may_remount_ro(sb)) 545 else if (!fs_may_remount_ro(sb))
551 return -EBUSY; 546 return -EBUSY;
552 retval = vfs_dq_off(sb, 1);
553 if (retval < 0 && retval != -ENOSYS)
554 return -EBUSY;
555 } 547 }
556 548
557 if (sb->s_op->remount_fs) { 549 if (sb->s_op->remount_fs) {
@@ -560,8 +552,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
560 return retval; 552 return retval;
561 } 553 }
562 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); 554 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
563 if (remount_rw) 555
564 vfs_dq_quota_on_remount(sb);
565 /* 556 /*
566 * Some filesystems modify their metadata via some other path than the 557 * Some filesystems modify their metadata via some other path than the
567 * bdev buffer cache (eg. use a private mapping, or directories in 558 * bdev buffer cache (eg. use a private mapping, or directories in
@@ -946,8 +937,8 @@ out:
946EXPORT_SYMBOL_GPL(vfs_kern_mount); 937EXPORT_SYMBOL_GPL(vfs_kern_mount);
947 938
948/** 939/**
949 * freeze_super -- lock the filesystem and force it into a consistent state 940 * freeze_super - lock the filesystem and force it into a consistent state
950 * @super: the super to lock 941 * @sb: the super to lock
951 * 942 *
952 * Syncs the super to make sure the filesystem is consistent and calls the fs's 943 * Syncs the super to make sure the filesystem is consistent and calls the fs's
953 * freeze_fs. Subsequent calls to this without first thawing the fs will return 944 * freeze_fs. Subsequent calls to this without first thawing the fs will return
diff --git a/fs/sync.c b/fs/sync.c
index e8cbd415e50a..15aa6f03b2da 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
42 if (wait) 42 if (wait)
43 sync_inodes_sb(sb); 43 sync_inodes_sb(sb);
44 else 44 else
45 writeback_inodes_sb_locked(sb); 45 writeback_inodes_sb(sb);
46 46
47 if (sb->s_op->sync_fs) 47 if (sb->s_op->sync_fs)
48 sb->s_op->sync_fs(sb, wait); 48 sb->s_op->sync_fs(sb, wait);
@@ -130,12 +130,10 @@ void emergency_sync(void)
130 130
131/* 131/*
132 * Generic function to fsync a file. 132 * Generic function to fsync a file.
133 *
134 * filp may be NULL if called via the msync of a vma.
135 */ 133 */
136int file_fsync(struct file *filp, struct dentry *dentry, int datasync) 134int file_fsync(struct file *filp, int datasync)
137{ 135{
138 struct inode * inode = dentry->d_inode; 136 struct inode *inode = filp->f_mapping->host;
139 struct super_block * sb; 137 struct super_block * sb;
140 int ret, err; 138 int ret, err;
141 139
@@ -183,7 +181,7 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
183 * livelocks in fsync_buffers_list(). 181 * livelocks in fsync_buffers_list().
184 */ 182 */
185 mutex_lock(&mapping->host->i_mutex); 183 mutex_lock(&mapping->host->i_mutex);
186 err = file->f_op->fsync(file, file->f_path.dentry, datasync); 184 err = file->f_op->fsync(file, datasync);
187 if (!ret) 185 if (!ret)
188 ret = err; 186 ret = err;
189 mutex_unlock(&mapping->host->i_mutex); 187 mutex_unlock(&mapping->host->i_mutex);
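The prototype change visible here, file_fsync() and the ->fsync method
losing their dentry argument, ripples through every filesystem in this
merge: the inode is now derived from the struct file. A sketch of what an
fsync method looks like after the change; the myfs names are invented,
and generic_file_fsync() is the helper the conversions below switch to:

	#include <linux/fs.h>

	/* hypothetical example, not part of this series */
	static int myfs_fsync(struct file *file, int datasync)
	{
		/* the inode now comes from the file, not from a dentry */
		struct inode *inode = file->f_mapping->host;

		mark_inode_dirty(inode);	/* stand-in for fs-specific work */
		return generic_file_fsync(file, datasync);
	}

	static const struct file_operations myfs_file_operations = {
		.fsync = myfs_fsync,
	};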
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index bbd77e95cf7f..0835a3b70e03 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -117,13 +117,13 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
117 if (error) 117 if (error)
118 goto out; 118 goto out;
119 119
120 iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ 120 error = sysfs_sd_setattr(sd, iattr);
121
122 error = inode_setattr(inode, iattr);
123 if (error) 121 if (error)
124 goto out; 122 goto out;
125 123
126 error = sysfs_sd_setattr(sd, iattr); 124 /* this ignores size changes */
125 generic_setattr(inode, iattr);
126
127out: 127out:
128 mutex_unlock(&sysfs_mutex); 128 mutex_unlock(&sysfs_mutex);
129 return error; 129 return error;
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 1dabed286b4c..79941e4964a4 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -24,7 +24,7 @@ const struct file_operations sysv_dir_operations = {
24 .llseek = generic_file_llseek, 24 .llseek = generic_file_llseek,
25 .read = generic_read_dir, 25 .read = generic_read_dir,
26 .readdir = sysv_readdir, 26 .readdir = sysv_readdir,
27 .fsync = simple_fsync, 27 .fsync = generic_file_fsync,
28}; 28};
29 29
30static inline void dir_put_page(struct page *page) 30static inline void dir_put_page(struct page *page)
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 96340c01f4a7..750cc22349bd 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = {
26 .write = do_sync_write, 26 .write = do_sync_write,
27 .aio_write = generic_file_aio_write, 27 .aio_write = generic_file_aio_write,
28 .mmap = generic_file_mmap, 28 .mmap = generic_file_mmap,
29 .fsync = simple_fsync, 29 .fsync = generic_file_fsync,
30 .splice_read = generic_file_splice_read, 30 .splice_read = generic_file_splice_read,
31}; 31};
32 32
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 4573734d723d..d4a5380b5669 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -43,6 +43,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
43 * then attach current time stamp. 43 * then attach current time stamp.
44 * But if the filesystem was marked clean, keep it clean. 44 * But if the filesystem was marked clean, keep it clean.
45 */ 45 */
46 sb->s_dirt = 0;
46 old_time = fs32_to_cpu(sbi, *sbi->s_sb_time); 47 old_time = fs32_to_cpu(sbi, *sbi->s_sb_time);
47 if (sbi->s_type == FSTYPE_SYSV4) { 48 if (sbi->s_type == FSTYPE_SYSV4) {
48 if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time)) 49 if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time))
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5692cf72b807..12f445cee9f7 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -967,12 +967,15 @@ static int do_writepage(struct page *page, int len)
967 * the page locked, and it locks @ui_mutex. However, write-back does take inode 967 * the page locked, and it locks @ui_mutex. However, write-back does take inode
968 * @i_mutex, which means other VFS operations may be run on this inode at the 968 * @i_mutex, which means other VFS operations may be run on this inode at the
969 * same time. And the problematic one is truncation to smaller size, from where 969 * same time. And the problematic one is truncation to smaller size, from where
970 * we have to call 'vmtruncate()', which first changes @inode->i_size, then 970 * we have to call 'simple_setsize()', which first changes @inode->i_size, then
971 * drops the truncated pages. And while dropping the pages, it takes the page 971 * drops the truncated pages. And while dropping the pages, it takes the page
972 * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with 972 * lock. This means that 'do_truncation()' cannot call 'simple_setsize()' with
973 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This 973 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
974 * means that @inode->i_size is changed while @ui_mutex is unlocked. 974 * means that @inode->i_size is changed while @ui_mutex is unlocked.
975 * 975 *
976 * XXX: with the new truncate the above is not true anymore, the simple_setsize
977 * calls can be replaced with the individual components.
978 *
976 * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond 979 * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
977 * inode size. How do we do this if @inode->i_size may became smaller while we 980 * inode size. How do we do this if @inode->i_size may became smaller while we
978 * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the 981 * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the
@@ -1125,7 +1128,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
1125 budgeted = 0; 1128 budgeted = 0;
1126 } 1129 }
1127 1130
1128 err = vmtruncate(inode, new_size); 1131 err = simple_setsize(inode, new_size);
1129 if (err) 1132 if (err)
1130 goto out_budg; 1133 goto out_budg;
1131 1134
@@ -1214,7 +1217,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
1214 1217
1215 if (attr->ia_valid & ATTR_SIZE) { 1218 if (attr->ia_valid & ATTR_SIZE) {
1216 dbg_gen("size %lld -> %lld", inode->i_size, new_size); 1219 dbg_gen("size %lld -> %lld", inode->i_size, new_size);
1217 err = vmtruncate(inode, new_size); 1220 err = simple_setsize(inode, new_size);
1218 if (err) 1221 if (err)
1219 goto out; 1222 goto out;
1220 } 1223 }
@@ -1223,7 +1226,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
1223 if (attr->ia_valid & ATTR_SIZE) { 1226 if (attr->ia_valid & ATTR_SIZE) {
1224 /* Truncation changes inode [mc]time */ 1227 /* Truncation changes inode [mc]time */
1225 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); 1228 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
1226 /* 'vmtruncate()' changed @i_size, update @ui_size */ 1229 /* 'simple_setsize()' changed @i_size, update @ui_size */
1227 ui->ui_size = inode->i_size; 1230 ui->ui_size = inode->i_size;
1228 } 1231 }
1229 1232
@@ -1304,9 +1307,9 @@ static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
1304 return NULL; 1307 return NULL;
1305} 1308}
1306 1309
1307int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync) 1310int ubifs_fsync(struct file *file, int datasync)
1308{ 1311{
1309 struct inode *inode = dentry->d_inode; 1312 struct inode *inode = file->f_mapping->host;
1310 struct ubifs_info *c = inode->i_sb->s_fs_info; 1313 struct ubifs_info *c = inode->i_sb->s_fs_info;
1311 int err; 1314 int err;
1312 1315
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index bd2542dad014..2eef553d50c8 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -379,7 +379,7 @@ struct ubifs_gced_idx_leb {
379 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses 379 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
380 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot 380 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
381 * make sure @inode->i_size is always changed under @ui_mutex, because it 381 * make sure @inode->i_size is always changed under @ui_mutex, because it
382 * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock 382 * cannot call 'simple_setsize()' with @ui_mutex locked, because it would deadlock
383 * with 'ubifs_writepage()' (see file.c). All the other inode fields are 383 * with 'ubifs_writepage()' (see file.c). All the other inode fields are
384 * changed under @ui_mutex, so they do not need "shadow" fields. Note, one 384 * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
385 * could consider to rework locking and base it on "shadow" fields. 385 * could consider to rework locking and base it on "shadow" fields.
@@ -1678,7 +1678,7 @@ const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
1678int ubifs_calc_dark(const struct ubifs_info *c, int spc); 1678int ubifs_calc_dark(const struct ubifs_info *c, int spc);
1679 1679
1680/* file.c */ 1680/* file.c */
1681int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync); 1681int ubifs_fsync(struct file *file, int datasync);
1682int ubifs_setattr(struct dentry *dentry, struct iattr *attr); 1682int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
1683 1683
1684/* dir.c */ 1684/* dir.c */
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 9a9378b4eb5a..b608efaa4cee 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -21,7 +21,6 @@
21 21
22#include "udfdecl.h" 22#include "udfdecl.h"
23 23
24#include <linux/quotaops.h>
25#include <linux/buffer_head.h> 24#include <linux/buffer_head.h>
26#include <linux/bitops.h> 25#include <linux/bitops.h>
27 26
@@ -159,8 +158,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
159 udf_debug("byte=%2x\n", 158 udf_debug("byte=%2x\n",
160 ((char *)bh->b_data)[(bit + i) >> 3]); 159 ((char *)bh->b_data)[(bit + i) >> 3]);
161 } else { 160 } else {
162 if (inode)
163 dquot_free_block(inode, 1);
164 udf_add_free_space(sb, sbi->s_partition, 1); 161 udf_add_free_space(sb, sbi->s_partition, 1);
165 } 162 }
166 } 163 }
@@ -210,15 +207,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
210 bit = block % (sb->s_blocksize << 3); 207 bit = block % (sb->s_blocksize << 3);
211 208
212 while (bit < (sb->s_blocksize << 3) && block_count > 0) { 209 while (bit < (sb->s_blocksize << 3) && block_count > 0) {
213 if (!udf_test_bit(bit, bh->b_data)) 210 if (!udf_clear_bit(bit, bh->b_data))
214 goto out; 211 goto out;
215 else if (dquot_prealloc_block(inode, 1))
216 goto out;
217 else if (!udf_clear_bit(bit, bh->b_data)) {
218 udf_debug("bit already cleared for block %d\n", bit);
219 dquot_free_block(inode, 1);
220 goto out;
221 }
222 block_count--; 212 block_count--;
223 alloc_count++; 213 alloc_count++;
224 bit++; 214 bit++;
@@ -338,20 +328,6 @@ search_back:
338 } 328 }
339 329
340got_block: 330got_block:
341
342 /*
343 * Check quota for allocation of this block.
344 */
345 if (inode) {
346 int ret = dquot_alloc_block(inode, 1);
347
348 if (ret) {
349 mutex_unlock(&sbi->s_alloc_mutex);
350 *err = ret;
351 return 0;
352 }
353 }
354
355 newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) - 331 newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) -
356 (sizeof(struct spaceBitmapDesc) << 3); 332 (sizeof(struct spaceBitmapDesc) << 3);
357 333
@@ -401,10 +377,6 @@ static void udf_table_free_blocks(struct super_block *sb,
401 } 377 }
402 378
403 iinfo = UDF_I(table); 379 iinfo = UDF_I(table);
404 /* We do this up front - There are some error conditions that
405 could occure, but.. oh well */
406 if (inode)
407 dquot_free_block(inode, count);
408 udf_add_free_space(sb, sbi->s_partition, count); 380 udf_add_free_space(sb, sbi->s_partition, count);
409 381
410 start = bloc->logicalBlockNum + offset; 382 start = bloc->logicalBlockNum + offset;
@@ -649,10 +621,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
649 epos.offset -= adsize; 621 epos.offset -= adsize;
650 622
651 alloc_count = (elen >> sb->s_blocksize_bits); 623 alloc_count = (elen >> sb->s_blocksize_bits);
652 if (inode && dquot_prealloc_block(inode, 624 if (alloc_count > block_count) {
653 alloc_count > block_count ? block_count : alloc_count))
654 alloc_count = 0;
655 else if (alloc_count > block_count) {
656 alloc_count = block_count; 625 alloc_count = block_count;
657 eloc.logicalBlockNum += alloc_count; 626 eloc.logicalBlockNum += alloc_count;
658 elen -= (alloc_count << sb->s_blocksize_bits); 627 elen -= (alloc_count << sb->s_blocksize_bits);
@@ -752,14 +721,6 @@ static int udf_table_new_block(struct super_block *sb,
752 newblock = goal_eloc.logicalBlockNum; 721 newblock = goal_eloc.logicalBlockNum;
753 goal_eloc.logicalBlockNum++; 722 goal_eloc.logicalBlockNum++;
754 goal_elen -= sb->s_blocksize; 723 goal_elen -= sb->s_blocksize;
755 if (inode) {
756 *err = dquot_alloc_block(inode, 1);
757 if (*err) {
758 brelse(goal_epos.bh);
759 mutex_unlock(&sbi->s_alloc_mutex);
760 return 0;
761 }
762 }
763 724
764 if (goal_elen) 725 if (goal_elen)
765 udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1); 726 udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 3a84455c2a77..51552bf50225 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -207,8 +207,9 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
207 207
208/* readdir and lookup functions */ 208/* readdir and lookup functions */
209const struct file_operations udf_dir_operations = { 209const struct file_operations udf_dir_operations = {
210 .llseek = generic_file_llseek,
210 .read = generic_read_dir, 211 .read = generic_read_dir,
211 .readdir = udf_readdir, 212 .readdir = udf_readdir,
212 .unlocked_ioctl = udf_ioctl, 213 .unlocked_ioctl = udf_ioctl,
213 .fsync = simple_fsync, 214 .fsync = generic_file_fsync,
214}; 215};
diff --git a/fs/udf/file.c b/fs/udf/file.c
index baae3a723946..94e06d6bddbd 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -34,7 +34,6 @@
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/smp_lock.h> 35#include <linux/smp_lock.h>
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/quotaops.h>
38#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
39#include <linux/aio.h> 38#include <linux/aio.h>
40#include <linux/smp_lock.h> 39#include <linux/smp_lock.h>
@@ -219,39 +218,16 @@ const struct file_operations udf_file_operations = {
219 .read = do_sync_read, 218 .read = do_sync_read,
220 .aio_read = generic_file_aio_read, 219 .aio_read = generic_file_aio_read,
221 .unlocked_ioctl = udf_ioctl, 220 .unlocked_ioctl = udf_ioctl,
222 .open = dquot_file_open, 221 .open = generic_file_open,
223 .mmap = generic_file_mmap, 222 .mmap = generic_file_mmap,
224 .write = do_sync_write, 223 .write = do_sync_write,
225 .aio_write = udf_file_aio_write, 224 .aio_write = udf_file_aio_write,
226 .release = udf_release_file, 225 .release = udf_release_file,
227 .fsync = simple_fsync, 226 .fsync = generic_file_fsync,
228 .splice_read = generic_file_splice_read, 227 .splice_read = generic_file_splice_read,
229 .llseek = generic_file_llseek, 228 .llseek = generic_file_llseek,
230}; 229};
231 230
232int udf_setattr(struct dentry *dentry, struct iattr *iattr)
233{
234 struct inode *inode = dentry->d_inode;
235 int error;
236
237 error = inode_change_ok(inode, iattr);
238 if (error)
239 return error;
240
241 if (is_quota_modification(inode, iattr))
242 dquot_initialize(inode);
243
244 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
245 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
246 error = dquot_transfer(inode, iattr);
247 if (error)
248 return error;
249 }
250
251 return inode_setattr(inode, iattr);
252}
253
254const struct inode_operations udf_file_inode_operations = { 231const struct inode_operations udf_file_inode_operations = {
255 .truncate = udf_truncate, 232 .truncate = udf_truncate,
256 .setattr = udf_setattr,
257}; 233};
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 2b5586c7f02a..18cd7111185d 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -20,7 +20,6 @@
20 20
21#include "udfdecl.h" 21#include "udfdecl.h"
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/quotaops.h>
24#include <linux/sched.h> 23#include <linux/sched.h>
25#include <linux/slab.h> 24#include <linux/slab.h>
26 25
@@ -32,13 +31,6 @@ void udf_free_inode(struct inode *inode)
32 struct super_block *sb = inode->i_sb; 31 struct super_block *sb = inode->i_sb;
33 struct udf_sb_info *sbi = UDF_SB(sb); 32 struct udf_sb_info *sbi = UDF_SB(sb);
34 33
35 /*
36 * Note: we must free any quota before locking the superblock,
37 * as writing the quota to disk may need the lock as well.
38 */
39 dquot_free_inode(inode);
40 dquot_drop(inode);
41
42 clear_inode(inode); 34 clear_inode(inode);
43 35
44 mutex_lock(&sbi->s_alloc_mutex); 36 mutex_lock(&sbi->s_alloc_mutex);
@@ -61,7 +53,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
61 struct super_block *sb = dir->i_sb; 53 struct super_block *sb = dir->i_sb;
62 struct udf_sb_info *sbi = UDF_SB(sb); 54 struct udf_sb_info *sbi = UDF_SB(sb);
63 struct inode *inode; 55 struct inode *inode;
64 int block, ret; 56 int block;
65 uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; 57 uint32_t start = UDF_I(dir)->i_location.logicalBlockNum;
66 struct udf_inode_info *iinfo; 58 struct udf_inode_info *iinfo;
67 struct udf_inode_info *dinfo = UDF_I(dir); 59 struct udf_inode_info *dinfo = UDF_I(dir);
@@ -146,17 +138,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
146 insert_inode_hash(inode); 138 insert_inode_hash(inode);
147 mark_inode_dirty(inode); 139 mark_inode_dirty(inode);
148 140
149 dquot_initialize(inode);
150 ret = dquot_alloc_inode(inode);
151 if (ret) {
152 dquot_drop(inode);
153 inode->i_flags |= S_NOQUOTA;
154 inode->i_nlink = 0;
155 iput(inode);
156 *err = ret;
157 return NULL;
158 }
159
160 *err = 0; 141 *err = 0;
161 return inode; 142 return inode;
162} 143}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 8a3fbd177cab..124852bcf6fe 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -36,7 +36,6 @@
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/quotaops.h>
40#include <linux/slab.h> 39#include <linux/slab.h>
41#include <linux/crc-itu-t.h> 40#include <linux/crc-itu-t.h>
42 41
@@ -71,9 +70,6 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
71 70
72void udf_delete_inode(struct inode *inode) 71void udf_delete_inode(struct inode *inode)
73{ 72{
74 if (!is_bad_inode(inode))
75 dquot_initialize(inode);
76
77 truncate_inode_pages(&inode->i_data, 0); 73 truncate_inode_pages(&inode->i_data, 0);
78 74
79 if (is_bad_inode(inode)) 75 if (is_bad_inode(inode))
@@ -113,7 +109,6 @@ void udf_clear_inode(struct inode *inode)
113 (unsigned long long)iinfo->i_lenExtents); 109 (unsigned long long)iinfo->i_lenExtents);
114 } 110 }
115 111
116 dquot_drop(inode);
117 kfree(iinfo->i_ext.i_data); 112 kfree(iinfo->i_ext.i_data);
118 iinfo->i_ext.i_data = NULL; 113 iinfo->i_ext.i_data = NULL;
119} 114}
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 585f733615dc..bf5fc674193c 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -27,7 +27,6 @@
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/quotaops.h>
31#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
32#include <linux/buffer_head.h> 31#include <linux/buffer_head.h>
33#include <linux/sched.h> 32#include <linux/sched.h>
@@ -563,8 +562,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
563 int err; 562 int err;
564 struct udf_inode_info *iinfo; 563 struct udf_inode_info *iinfo;
565 564
566 dquot_initialize(dir);
567
568 lock_kernel(); 565 lock_kernel();
569 inode = udf_new_inode(dir, mode, &err); 566 inode = udf_new_inode(dir, mode, &err);
570 if (!inode) { 567 if (!inode) {
@@ -617,8 +614,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
617 if (!old_valid_dev(rdev)) 614 if (!old_valid_dev(rdev))
618 return -EINVAL; 615 return -EINVAL;
619 616
620 dquot_initialize(dir);
621
622 lock_kernel(); 617 lock_kernel();
623 err = -EIO; 618 err = -EIO;
624 inode = udf_new_inode(dir, mode, &err); 619 inode = udf_new_inode(dir, mode, &err);
@@ -664,8 +659,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
664 struct udf_inode_info *dinfo = UDF_I(dir); 659 struct udf_inode_info *dinfo = UDF_I(dir);
665 struct udf_inode_info *iinfo; 660 struct udf_inode_info *iinfo;
666 661
667 dquot_initialize(dir);
668
669 lock_kernel(); 662 lock_kernel();
670 err = -EMLINK; 663 err = -EMLINK;
671 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 664 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
@@ -800,8 +793,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
800 struct fileIdentDesc *fi, cfi; 793 struct fileIdentDesc *fi, cfi;
801 struct kernel_lb_addr tloc; 794 struct kernel_lb_addr tloc;
802 795
803 dquot_initialize(dir);
804
805 retval = -ENOENT; 796 retval = -ENOENT;
806 lock_kernel(); 797 lock_kernel();
807 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 798 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -848,8 +839,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
848 struct fileIdentDesc cfi; 839 struct fileIdentDesc cfi;
849 struct kernel_lb_addr tloc; 840 struct kernel_lb_addr tloc;
850 841
851 dquot_initialize(dir);
852
853 retval = -ENOENT; 842 retval = -ENOENT;
854 lock_kernel(); 843 lock_kernel();
855 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 844 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -904,8 +893,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
904 struct buffer_head *bh; 893 struct buffer_head *bh;
905 struct udf_inode_info *iinfo; 894 struct udf_inode_info *iinfo;
906 895
907 dquot_initialize(dir);
908
909 lock_kernel(); 896 lock_kernel();
910 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); 897 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
911 if (!inode) 898 if (!inode)
@@ -1075,8 +1062,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1075 int err; 1062 int err;
1076 struct buffer_head *bh; 1063 struct buffer_head *bh;
1077 1064
1078 dquot_initialize(dir);
1079
1080 lock_kernel(); 1065 lock_kernel();
1081 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1066 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
1082 unlock_kernel(); 1067 unlock_kernel();
@@ -1139,9 +1124,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1139 struct kernel_lb_addr tloc; 1124 struct kernel_lb_addr tloc;
1140 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1125 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1141 1126
1142 dquot_initialize(old_dir);
1143 dquot_initialize(new_dir);
1144
1145 lock_kernel(); 1127 lock_kernel();
1146 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1128 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1147 if (ofi) { 1129 if (ofi) {
@@ -1387,7 +1369,6 @@ const struct export_operations udf_export_ops = {
1387const struct inode_operations udf_dir_inode_operations = { 1369const struct inode_operations udf_dir_inode_operations = {
1388 .lookup = udf_lookup, 1370 .lookup = udf_lookup,
1389 .create = udf_create, 1371 .create = udf_create,
1390 .setattr = udf_setattr,
1391 .link = udf_link, 1372 .link = udf_link,
1392 .unlink = udf_unlink, 1373 .unlink = udf_unlink,
1393 .symlink = udf_symlink, 1374 .symlink = udf_symlink,
@@ -1400,5 +1381,4 @@ const struct inode_operations udf_symlink_inode_operations = {
1400 .readlink = generic_readlink, 1381 .readlink = generic_readlink,
1401 .follow_link = page_follow_link_light, 1382 .follow_link = page_follow_link_light,
1402 .put_link = page_put_link, 1383 .put_link = page_put_link,
1403 .setattr = udf_setattr,
1404}; 1384};
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 1e4543cbcd27..612d1e2e285a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -557,6 +557,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
557{ 557{
558 struct udf_options uopt; 558 struct udf_options uopt;
559 struct udf_sb_info *sbi = UDF_SB(sb); 559 struct udf_sb_info *sbi = UDF_SB(sb);
560 int error = 0;
560 561
561 uopt.flags = sbi->s_flags; 562 uopt.flags = sbi->s_flags;
562 uopt.uid = sbi->s_uid; 563 uopt.uid = sbi->s_uid;
@@ -582,17 +583,17 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
582 *flags |= MS_RDONLY; 583 *flags |= MS_RDONLY;
583 } 584 }
584 585
585 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 586 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
586 unlock_kernel(); 587 goto out_unlock;
587 return 0; 588
588 }
589 if (*flags & MS_RDONLY) 589 if (*flags & MS_RDONLY)
590 udf_close_lvid(sb); 590 udf_close_lvid(sb);
591 else 591 else
592 udf_open_lvid(sb); 592 udf_open_lvid(sb);
593 593
594out_unlock:
594 unlock_kernel(); 595 unlock_kernel();
595 return 0; 596 return error;
596} 597}
597 598
598/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */ 599/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
@@ -1939,7 +1940,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1939 /* Fill in the rest of the superblock */ 1940 /* Fill in the rest of the superblock */
1940 sb->s_op = &udf_sb_ops; 1941 sb->s_op = &udf_sb_ops;
1941 sb->s_export_op = &udf_export_ops; 1942 sb->s_export_op = &udf_export_ops;
1942 sb->dq_op = NULL; 1943
1943 sb->s_dirt = 0; 1944 sb->s_dirt = 0;
1944 sb->s_magic = UDF_SUPER_MAGIC; 1945 sb->s_magic = UDF_SUPER_MAGIC;
1945 sb->s_time_gran = 1000; 1946 sb->s_time_gran = 1000;
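
The remount rework above replaces the early "unlock_kernel(); return 0;" with
a single labelled exit, so the big kernel lock is dropped in exactly one place
and the new error variable gives later changes somewhere to report failures.
A minimal userspace sketch of the same single-exit pattern, all names
illustrative:

	#include <pthread.h>

	static pthread_mutex_t remount_lock = PTHREAD_MUTEX_INITIALIZER;

	static int remount(int want_ro, int is_ro)
	{
		int error = 0;

		pthread_mutex_lock(&remount_lock);

		if (want_ro == is_ro)		/* nothing to change */
			goto out_unlock;

		/* ... switch the volume between read-only and read-write ... */

	out_unlock:
		pthread_mutex_unlock(&remount_lock);	/* one unlock site for every path */
		return error;
	}
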
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 9079ff7d6255..2bac0354891f 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -131,7 +131,6 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
131 131
132/* file.c */ 132/* file.c */
133extern long udf_ioctl(struct file *, unsigned int, unsigned long); 133extern long udf_ioctl(struct file *, unsigned int, unsigned long);
134extern int udf_setattr(struct dentry *dentry, struct iattr *iattr);
135/* inode.c */ 134/* inode.c */
136extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 135extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
137extern int udf_sync_inode(struct inode *); 136extern int udf_sync_inode(struct inode *);
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 5cfa4d85ccf2..048484fb10d2 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -12,7 +12,6 @@
12#include <linux/stat.h> 12#include <linux/stat.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/string.h> 14#include <linux/string.h>
15#include <linux/quotaops.h>
16#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
17#include <linux/capability.h> 16#include <linux/capability.h>
18#include <linux/bitops.h> 17#include <linux/bitops.h>
@@ -85,9 +84,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
85 "bit already cleared for fragment %u", i); 84 "bit already cleared for fragment %u", i);
86 } 85 }
87 86
88 dquot_free_block(inode, count);
89
90
91 fs32_add(sb, &ucg->cg_cs.cs_nffree, count); 87 fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
92 uspi->cs_total.cs_nffree += count; 88 uspi->cs_total.cs_nffree += count;
93 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); 89 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -195,7 +191,6 @@ do_more:
195 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 191 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
196 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 192 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
197 ufs_clusteracct (sb, ucpi, blkno, 1); 193 ufs_clusteracct (sb, ucpi, blkno, 1);
198 dquot_free_block(inode, uspi->s_fpb);
199 194
200 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); 195 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
201 uspi->cs_total.cs_nbfree++; 196 uspi->cs_total.cs_nbfree++;
@@ -511,7 +506,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
511 struct ufs_cg_private_info * ucpi; 506 struct ufs_cg_private_info * ucpi;
512 struct ufs_cylinder_group * ucg; 507 struct ufs_cylinder_group * ucg;
513 unsigned cgno, fragno, fragoff, count, fragsize, i; 508 unsigned cgno, fragno, fragoff, count, fragsize, i;
514 int ret;
515 509
516 UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", 510 UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n",
517 (unsigned long long)fragment, oldcount, newcount); 511 (unsigned long long)fragment, oldcount, newcount);
@@ -557,11 +551,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
557 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); 551 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
558 for (i = oldcount; i < newcount; i++) 552 for (i = oldcount; i < newcount; i++)
559 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); 553 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
560 ret = dquot_alloc_block(inode, count);
561 if (ret) {
562 *err = ret;
563 return 0;
564 }
565 554
566 fs32_sub(sb, &ucg->cg_cs.cs_nffree, count); 555 fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
567 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); 556 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -598,7 +587,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
598 struct ufs_cylinder_group * ucg; 587 struct ufs_cylinder_group * ucg;
599 unsigned oldcg, i, j, k, allocsize; 588 unsigned oldcg, i, j, k, allocsize;
600 u64 result; 589 u64 result;
601 int ret;
602 590
603 UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", 591 UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n",
604 inode->i_ino, cgno, (unsigned long long)goal, count); 592 inode->i_ino, cgno, (unsigned long long)goal, count);
@@ -667,7 +655,6 @@ cg_found:
667 for (i = count; i < uspi->s_fpb; i++) 655 for (i = count; i < uspi->s_fpb; i++)
668 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); 656 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
669 i = uspi->s_fpb - count; 657 i = uspi->s_fpb - count;
670 dquot_free_block(inode, i);
671 658
672 fs32_add(sb, &ucg->cg_cs.cs_nffree, i); 659 fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
673 uspi->cs_total.cs_nffree += i; 660 uspi->cs_total.cs_nffree += i;
@@ -679,11 +666,6 @@ cg_found:
679 result = ufs_bitmap_search (sb, ucpi, goal, allocsize); 666 result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
680 if (result == INVBLOCK) 667 if (result == INVBLOCK)
681 return 0; 668 return 0;
682 ret = dquot_alloc_block(inode, count);
683 if (ret) {
684 *err = ret;
685 return 0;
686 }
687 for (i = 0; i < count; i++) 669 for (i = 0; i < count; i++)
688 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i); 670 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
689 671
@@ -718,7 +700,6 @@ static u64 ufs_alloccg_block(struct inode *inode,
718 struct ufs_super_block_first * usb1; 700 struct ufs_super_block_first * usb1;
719 struct ufs_cylinder_group * ucg; 701 struct ufs_cylinder_group * ucg;
720 u64 result, blkno; 702 u64 result, blkno;
721 int ret;
722 703
723 UFSD("ENTER, goal %llu\n", (unsigned long long)goal); 704 UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
724 705
@@ -752,11 +733,6 @@ gotit:
752 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 733 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
753 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 734 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
754 ufs_clusteracct (sb, ucpi, blkno, -1); 735 ufs_clusteracct (sb, ucpi, blkno, -1);
755 ret = dquot_alloc_block(inode, uspi->s_fpb);
756 if (ret) {
757 *err = ret;
758 return INVBLOCK;
759 }
760 736
761 fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1); 737 fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
762 uspi->cs_total.cs_nbfree--; 738 uspi->cs_total.cs_nbfree--;
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 317a0d444f6b..ec784756dc65 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -666,6 +666,6 @@ not_empty:
666const struct file_operations ufs_dir_operations = { 666const struct file_operations ufs_dir_operations = {
667 .read = generic_read_dir, 667 .read = generic_read_dir,
668 .readdir = ufs_readdir, 668 .readdir = ufs_readdir,
669 .fsync = simple_fsync, 669 .fsync = generic_file_fsync,
670 .llseek = generic_file_llseek, 670 .llseek = generic_file_llseek,
671}; 671};
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index a8962cecde5b..33afa20d4509 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,7 +24,6 @@
24 */ 24 */
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/quotaops.h>
28 27
29#include "ufs_fs.h" 28#include "ufs_fs.h"
30#include "ufs.h" 29#include "ufs.h"
@@ -41,7 +40,7 @@ const struct file_operations ufs_file_operations = {
41 .write = do_sync_write, 40 .write = do_sync_write,
42 .aio_write = generic_file_aio_write, 41 .aio_write = generic_file_aio_write,
43 .mmap = generic_file_mmap, 42 .mmap = generic_file_mmap,
44 .open = dquot_file_open, 43 .open = generic_file_open,
45 .fsync = simple_fsync, 44 .fsync = generic_file_fsync,
46 .splice_read = generic_file_splice_read, 45 .splice_read = generic_file_splice_read,
47}; 46};
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 3a959d55084d..594480e537d2 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -27,7 +27,6 @@
27#include <linux/time.h> 27#include <linux/time.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/string.h> 29#include <linux/string.h>
30#include <linux/quotaops.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/sched.h> 31#include <linux/sched.h>
33#include <linux/bitops.h> 32#include <linux/bitops.h>
@@ -95,9 +94,6 @@ void ufs_free_inode (struct inode * inode)
95 94
96 is_directory = S_ISDIR(inode->i_mode); 95 is_directory = S_ISDIR(inode->i_mode);
97 96
98 dquot_free_inode(inode);
99 dquot_drop(inode);
100
101 clear_inode (inode); 97 clear_inode (inode);
102 98
103 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit)) 99 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
@@ -347,21 +343,12 @@ cg_found:
347 343
348 unlock_super (sb); 344 unlock_super (sb);
349 345
350 dquot_initialize(inode);
351 err = dquot_alloc_inode(inode);
352 if (err) {
353 dquot_drop(inode);
354 goto fail_without_unlock;
355 }
356
357 UFSD("allocating inode %lu\n", inode->i_ino); 346 UFSD("allocating inode %lu\n", inode->i_ino);
358 UFSD("EXIT\n"); 347 UFSD("EXIT\n");
359 return inode; 348 return inode;
360 349
361fail_remove_inode: 350fail_remove_inode:
362 unlock_super(sb); 351 unlock_super(sb);
363fail_without_unlock:
364 inode->i_flags |= S_NOQUOTA;
365 inode->i_nlink = 0; 352 inode->i_nlink = 0;
366 iput(inode); 353 iput(inode);
367 UFSD("EXIT (FAILED): err %d\n", err); 354 UFSD("EXIT (FAILED): err %d\n", err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index cffa756f1047..73fe773aa034 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -37,7 +37,6 @@
37#include <linux/smp_lock.h> 37#include <linux/smp_lock.h>
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40#include <linux/quotaops.h>
41 40
42#include "ufs_fs.h" 41#include "ufs_fs.h"
43#include "ufs.h" 42#include "ufs.h"
@@ -910,9 +909,6 @@ void ufs_delete_inode (struct inode * inode)
910{ 909{
911 loff_t old_i_size; 910 loff_t old_i_size;
912 911
913 if (!is_bad_inode(inode))
914 dquot_initialize(inode);
915
916 truncate_inode_pages(&inode->i_data, 0); 912 truncate_inode_pages(&inode->i_data, 0);
917 if (is_bad_inode(inode)) 913 if (is_bad_inode(inode))
918 goto no_delete; 914 goto no_delete;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index eabc02eb1294..b056f02b1fb3 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -30,7 +30,6 @@
30#include <linux/time.h> 30#include <linux/time.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/smp_lock.h> 32#include <linux/smp_lock.h>
33#include <linux/quotaops.h>
34 33
35#include "ufs_fs.h" 34#include "ufs_fs.h"
36#include "ufs.h" 35#include "ufs.h"
@@ -86,8 +85,6 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
86 85
87 UFSD("BEGIN\n"); 86 UFSD("BEGIN\n");
88 87
89 dquot_initialize(dir);
90
91 inode = ufs_new_inode(dir, mode); 88 inode = ufs_new_inode(dir, mode);
92 err = PTR_ERR(inode); 89 err = PTR_ERR(inode);
93 90
@@ -112,8 +109,6 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
112 if (!old_valid_dev(rdev)) 109 if (!old_valid_dev(rdev))
113 return -EINVAL; 110 return -EINVAL;
114 111
115 dquot_initialize(dir);
116
117 inode = ufs_new_inode(dir, mode); 112 inode = ufs_new_inode(dir, mode);
118 err = PTR_ERR(inode); 113 err = PTR_ERR(inode);
119 if (!IS_ERR(inode)) { 114 if (!IS_ERR(inode)) {
@@ -138,8 +133,6 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
138 if (l > sb->s_blocksize) 133 if (l > sb->s_blocksize)
139 goto out_notlocked; 134 goto out_notlocked;
140 135
141 dquot_initialize(dir);
142
143 lock_kernel(); 136 lock_kernel();
144 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); 137 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
145 err = PTR_ERR(inode); 138 err = PTR_ERR(inode);
@@ -185,8 +178,6 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
185 return -EMLINK; 178 return -EMLINK;
186 } 179 }
187 180
188 dquot_initialize(dir);
189
190 inode->i_ctime = CURRENT_TIME_SEC; 181 inode->i_ctime = CURRENT_TIME_SEC;
191 inode_inc_link_count(inode); 182 inode_inc_link_count(inode);
192 atomic_inc(&inode->i_count); 183 atomic_inc(&inode->i_count);
@@ -204,8 +195,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
204 if (dir->i_nlink >= UFS_LINK_MAX) 195 if (dir->i_nlink >= UFS_LINK_MAX)
205 goto out; 196 goto out;
206 197
207 dquot_initialize(dir);
208
209 lock_kernel(); 198 lock_kernel();
210 inode_inc_link_count(dir); 199 inode_inc_link_count(dir);
211 200
@@ -250,8 +239,6 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
250 struct page *page; 239 struct page *page;
251 int err = -ENOENT; 240 int err = -ENOENT;
252 241
253 dquot_initialize(dir);
254
255 de = ufs_find_entry(dir, &dentry->d_name, &page); 242 de = ufs_find_entry(dir, &dentry->d_name, &page);
256 if (!de) 243 if (!de)
257 goto out; 244 goto out;
@@ -296,9 +283,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
296 struct ufs_dir_entry *old_de; 283 struct ufs_dir_entry *old_de;
297 int err = -ENOENT; 284 int err = -ENOENT;
298 285
299 dquot_initialize(old_dir);
300 dquot_initialize(new_dir);
301
302 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); 286 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
303 if (!old_de) 287 if (!old_de)
304 goto out; 288 goto out;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 14743d935a93..3ec5a9eb6efb 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -77,7 +77,6 @@
77 77
78#include <linux/errno.h> 78#include <linux/errno.h>
79#include <linux/fs.h> 79#include <linux/fs.h>
80#include <linux/quotaops.h>
81#include <linux/slab.h> 80#include <linux/slab.h>
82#include <linux/time.h> 81#include <linux/time.h>
83#include <linux/stat.h> 82#include <linux/stat.h>
@@ -918,6 +917,7 @@ again:
918 sbi->s_bytesex = BYTESEX_LE; 917 sbi->s_bytesex = BYTESEX_LE;
919 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { 918 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
920 case UFS_MAGIC: 919 case UFS_MAGIC:
920 case UFS_MAGIC_BW:
921 case UFS2_MAGIC: 921 case UFS2_MAGIC:
922 case UFS_MAGIC_LFN: 922 case UFS_MAGIC_LFN:
923 case UFS_MAGIC_FEA: 923 case UFS_MAGIC_FEA:
@@ -927,6 +927,7 @@ again:
927 sbi->s_bytesex = BYTESEX_BE; 927 sbi->s_bytesex = BYTESEX_BE;
928 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { 928 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
929 case UFS_MAGIC: 929 case UFS_MAGIC:
930 case UFS_MAGIC_BW:
930 case UFS2_MAGIC: 931 case UFS2_MAGIC:
931 case UFS_MAGIC_LFN: 932 case UFS_MAGIC_LFN:
932 case UFS_MAGIC_FEA: 933 case UFS_MAGIC_FEA:
@@ -1045,7 +1046,7 @@ magic_found:
1045 */ 1046 */
1046 sb->s_op = &ufs_super_ops; 1047 sb->s_op = &ufs_super_ops;
1047 sb->s_export_op = &ufs_export_ops; 1048 sb->s_export_op = &ufs_export_ops;
1048 sb->dq_op = NULL; /***/ 1049
1049 sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic); 1050 sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic);
1050 1051
1051 uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno); 1052 uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno);
@@ -1435,126 +1436,19 @@ static void destroy_inodecache(void)
1435 kmem_cache_destroy(ufs_inode_cachep); 1436 kmem_cache_destroy(ufs_inode_cachep);
1436} 1437}
1437 1438
1438static void ufs_clear_inode(struct inode *inode)
1439{
1440 dquot_drop(inode);
1441}
1442
1443#ifdef CONFIG_QUOTA
1444static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t);
1445static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t);
1446#endif
1447
1448static const struct super_operations ufs_super_ops = { 1439static const struct super_operations ufs_super_ops = {
1449 .alloc_inode = ufs_alloc_inode, 1440 .alloc_inode = ufs_alloc_inode,
1450 .destroy_inode = ufs_destroy_inode, 1441 .destroy_inode = ufs_destroy_inode,
1451 .write_inode = ufs_write_inode, 1442 .write_inode = ufs_write_inode,
1452 .delete_inode = ufs_delete_inode, 1443 .delete_inode = ufs_delete_inode,
1453 .clear_inode = ufs_clear_inode,
1454 .put_super = ufs_put_super, 1444 .put_super = ufs_put_super,
1455 .write_super = ufs_write_super, 1445 .write_super = ufs_write_super,
1456 .sync_fs = ufs_sync_fs, 1446 .sync_fs = ufs_sync_fs,
1457 .statfs = ufs_statfs, 1447 .statfs = ufs_statfs,
1458 .remount_fs = ufs_remount, 1448 .remount_fs = ufs_remount,
1459 .show_options = ufs_show_options, 1449 .show_options = ufs_show_options,
1460#ifdef CONFIG_QUOTA
1461 .quota_read = ufs_quota_read,
1462 .quota_write = ufs_quota_write,
1463#endif
1464}; 1450};
1465 1451
1466#ifdef CONFIG_QUOTA
1467
1468/* Read data from quotafile - avoid pagecache and such because we cannot afford
1469 * acquiring the locks... As quota files are never truncated and quota code
1470 * itself serializes the operations (and no one else should touch the files)
1471 * we don't have to be afraid of races */
1472static ssize_t ufs_quota_read(struct super_block *sb, int type, char *data,
1473 size_t len, loff_t off)
1474{
1475 struct inode *inode = sb_dqopt(sb)->files[type];
1476 sector_t blk = off >> sb->s_blocksize_bits;
1477 int err = 0;
1478 int offset = off & (sb->s_blocksize - 1);
1479 int tocopy;
1480 size_t toread;
1481 struct buffer_head *bh;
1482 loff_t i_size = i_size_read(inode);
1483
1484 if (off > i_size)
1485 return 0;
1486 if (off+len > i_size)
1487 len = i_size-off;
1488 toread = len;
1489 while (toread > 0) {
1490 tocopy = sb->s_blocksize - offset < toread ?
1491 sb->s_blocksize - offset : toread;
1492
1493 bh = ufs_bread(inode, blk, 0, &err);
1494 if (err)
1495 return err;
1496 if (!bh) /* A hole? */
1497 memset(data, 0, tocopy);
1498 else {
1499 memcpy(data, bh->b_data+offset, tocopy);
1500 brelse(bh);
1501 }
1502 offset = 0;
1503 toread -= tocopy;
1504 data += tocopy;
1505 blk++;
1506 }
1507 return len;
1508}
1509
1510/* Write to quotafile */
1511static ssize_t ufs_quota_write(struct super_block *sb, int type,
1512 const char *data, size_t len, loff_t off)
1513{
1514 struct inode *inode = sb_dqopt(sb)->files[type];
1515 sector_t blk = off >> sb->s_blocksize_bits;
1516 int err = 0;
1517 int offset = off & (sb->s_blocksize - 1);
1518 int tocopy;
1519 size_t towrite = len;
1520 struct buffer_head *bh;
1521
1522 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
1523 while (towrite > 0) {
1524 tocopy = sb->s_blocksize - offset < towrite ?
1525 sb->s_blocksize - offset : towrite;
1526
1527 bh = ufs_bread(inode, blk, 1, &err);
1528 if (!bh)
1529 goto out;
1530 lock_buffer(bh);
1531 memcpy(bh->b_data+offset, data, tocopy);
1532 flush_dcache_page(bh->b_page);
1533 set_buffer_uptodate(bh);
1534 mark_buffer_dirty(bh);
1535 unlock_buffer(bh);
1536 brelse(bh);
1537 offset = 0;
1538 towrite -= tocopy;
1539 data += tocopy;
1540 blk++;
1541 }
1542out:
1543 if (len == towrite) {
1544 mutex_unlock(&inode->i_mutex);
1545 return err;
1546 }
1547 if (inode->i_size < off+len-towrite)
1548 i_size_write(inode, off+len-towrite);
1549 inode->i_version++;
1550 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1551 mark_inode_dirty(inode);
1552 mutex_unlock(&inode->i_mutex);
1553 return len - towrite;
1554}
1555
1556#endif
1557
1558static int ufs_get_sb(struct file_system_type *fs_type, 1452static int ufs_get_sb(struct file_system_type *fs_type,
1559 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1453 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
1560{ 1454{
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index f294c44577dc..589e01a465ba 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -44,7 +44,6 @@
44#include <linux/buffer_head.h> 44#include <linux/buffer_head.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/quotaops.h>
48 47
49#include "ufs_fs.h" 48#include "ufs_fs.h"
50#include "ufs.h" 49#include "ufs.h"
@@ -501,12 +500,10 @@ out:
501 return err; 500 return err;
502} 501}
503 502
504
505/* 503/*
506 * We don't define our `inode->i_op->truncate', and call it here, 504 * TODO:
507 * because of: 505 * - truncate case should use proper ordering instead of using
508 * - there is no way to know old size 506 * simple_setsize
509 * - there is no way to inform the user about an error, if it happens in `truncate'
510 */ 507 */
511int ufs_setattr(struct dentry *dentry, struct iattr *attr) 508int ufs_setattr(struct dentry *dentry, struct iattr *attr)
512{ 509{
@@ -518,19 +515,10 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
518 if (error) 515 if (error)
519 return error; 516 return error;
520 517
521 if (is_quota_modification(inode, attr))
522 dquot_initialize(inode);
523
524 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
525 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
526 error = dquot_transfer(inode, attr);
527 if (error)
528 return error;
529 }
530 if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { 518 if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
531 loff_t old_i_size = inode->i_size; 519 loff_t old_i_size = inode->i_size;
532 520
533 error = vmtruncate(inode, attr->ia_size); 521 error = simple_setsize(inode, attr->ia_size);
534 if (error) 522 if (error)
535 return error; 523 return error;
536 error = ufs_truncate(inode, old_i_size); 524 error = ufs_truncate(inode, old_i_size);
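
ufs_setattr() thus shrinks to the generic checks plus the size change, and
vmtruncate() is replaced by simple_setsize(), which validates the new size
before touching anything. Roughly what that helper does in this era, as a
sketch (inode_newsize_ok, i_size_write and truncate_pagecache are the real
API; the body is an approximation, not the exact implementation):

	static int setsize_sketch(struct inode *inode, loff_t newsize)
	{
		loff_t oldsize;
		int error;

		error = inode_newsize_ok(inode, newsize);	/* rlimit/s_maxbytes */
		if (error)
			return error;

		oldsize = inode->i_size;
		i_size_write(inode, newsize);			/* publish new size */
		truncate_pagecache(inode, oldsize, newsize);	/* drop pages past EOF */
		return 0;
	}
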
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 6943ec677c0b..8aba544f9fad 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -48,6 +48,7 @@ typedef __u16 __bitwise __fs16;
48#define UFS_SECTOR_SIZE 512 48#define UFS_SECTOR_SIZE 512
49#define UFS_SECTOR_BITS 9 49#define UFS_SECTOR_BITS 9
50#define UFS_MAGIC 0x00011954 50#define UFS_MAGIC 0x00011954
51#define UFS_MAGIC_BW 0x0f242697
51#define UFS2_MAGIC 0x19540119 52#define UFS2_MAGIC 0x19540119
52#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */ 53#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */
53 54
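
UFS_MAGIC_BW feeds both byte-order probes in the super.c hunks above. The
dual-endian magic check, as a standalone toy sketch (the constants come from
this header; everything else is illustrative):

	#include <stdbool.h>
	#include <stdint.h>

	#define UFS_MAGIC	0x00011954u
	#define UFS_MAGIC_BW	0x0f242697u
	#define UFS2_MAGIC	0x19540119u

	static uint32_t swab32(uint32_t v)
	{
		return (v >> 24) | ((v >> 8) & 0xff00) |
		       ((v << 8) & 0xff0000) | (v << 24);
	}

	static bool magic_ok(uint32_t m)
	{
		return m == UFS_MAGIC || m == UFS_MAGIC_BW || m == UFS2_MAGIC;
	}

	/* Accept the magic in either byte order, as the mount probe does. */
	static bool probe_magic(uint32_t raw)
	{
		return magic_ok(raw) || magic_ok(swab32(raw));
	}
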
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 089eaca860b4..34640d6dbdcb 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1333,6 +1333,21 @@ xfs_vm_writepage(
1333 trace_xfs_writepage(inode, page, 0); 1333 trace_xfs_writepage(inode, page, 0);
1334 1334
1335 /* 1335 /*
1336 * Refuse to write the page out if we are called from reclaim context.
1337 *
 1338 * This is primarily to avoid stack overflows when called from deeply
 1339 * nested stacks in random callers doing direct reclaim, but disabling
 1340 * reclaim for kswapd is a nice side effect too, as kswapd causes
 1341 * rather suboptimal I/O patterns.
1342 *
1343 * This should really be done by the core VM, but until that happens
1344 * filesystems like XFS, btrfs and ext4 have to take care of this
1345 * by themselves.
1346 */
1347 if (current->flags & PF_MEMALLOC)
1348 goto out_fail;
1349
1350 /*
1336 * We need a transaction if: 1351 * We need a transaction if:
1337 * 1. There are delalloc buffers on the page 1352 * 1. There are delalloc buffers on the page
1338 * 2. The page is uptodate and we have unmapped buffers 1353 * 2. The page is uptodate and we have unmapped buffers
@@ -1366,14 +1381,6 @@ xfs_vm_writepage(
1366 if (!page_has_buffers(page)) 1381 if (!page_has_buffers(page))
1367 create_empty_buffers(page, 1 << inode->i_blkbits, 0); 1382 create_empty_buffers(page, 1 << inode->i_blkbits, 0);
1368 1383
1369
1370 /*
1371 * VM calculation for nr_to_write seems off. Bump it way
1372 * up, this gets simple streaming writes zippy again.
1373 * To be reviewed again after Jens' writeback changes.
1374 */
1375 wbc->nr_to_write *= 4;
1376
1377 /* 1384 /*
1378 * Convert delayed allocate, unwritten or unmapped space 1385 * Convert delayed allocate, unwritten or unmapped space
1379 * to real space and flush out to disk. 1386 * to real space and flush out to disk.
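
The PF_MEMALLOC check added above is the standard shape for refusing
writeback when entered from memory reclaim. What the refused path
conventionally does in a ->writepage implementation, as a sketch
(redirty_page_for_writepage and unlock_page are the real page-cache helpers;
the surrounding function is illustrative):

	static int writepage_sketch(struct page *page,
				    struct writeback_control *wbc)
	{
		if (current->flags & PF_MEMALLOC) {
			redirty_page_for_writepage(wbc, page);	/* keep it queued */
			unlock_page(page);			/* hand it back */
			return 0;				/* refused, not an error */
		}

		/* ... convert delalloc/unwritten space and submit the I/O ... */
		return 0;
	}
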
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index d8fb1b5d6cb5..257a56b127cf 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -100,10 +100,10 @@ xfs_iozero(
100STATIC int 100STATIC int
101xfs_file_fsync( 101xfs_file_fsync(
102 struct file *file, 102 struct file *file,
103 struct dentry *dentry,
104 int datasync) 103 int datasync)
105{ 104{
106 struct xfs_inode *ip = XFS_I(dentry->d_inode); 105 struct inode *inode = file->f_mapping->host;
106 struct xfs_inode *ip = XFS_I(inode);
107 struct xfs_trans *tp; 107 struct xfs_trans *tp;
108 int error = 0; 108 int error = 0;
109 int log_flushed = 0; 109 int log_flushed = 0;
@@ -140,8 +140,8 @@ xfs_file_fsync(
 140 * might get cleared when the inode gets written out via the AIL 140 * might get cleared when the inode gets written out via the AIL
141 * or xfs_iflush_cluster. 141 * or xfs_iflush_cluster.
142 */ 142 */
143 if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) || 143 if (((inode->i_state & I_DIRTY_DATASYNC) ||
144 ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) && 144 ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
145 ip->i_update_core) { 145 ip->i_update_core) {
146 /* 146 /*
147 * Kick off a transaction to log the inode core to get the 147 * Kick off a transaction to log the inode core to get the
@@ -868,7 +868,7 @@ write_retry:
868 mutex_lock(&inode->i_mutex); 868 mutex_lock(&inode->i_mutex);
869 xfs_ilock(ip, iolock); 869 xfs_ilock(ip, iolock);
870 870
871 error2 = -xfs_file_fsync(file, file->f_path.dentry, 871 error2 = -xfs_file_fsync(file,
872 (file->f_flags & __O_SYNC) ? 0 : 1); 872 (file->f_flags & __O_SYNC) ? 0 : 1);
873 if (!error) 873 if (!error)
874 error = error2; 874 error = error2;
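
Both hunks track the VFS change that dropped the dentry argument from
->fsync; the inode is now reached through the file itself. The minimal shape,
as a sketch:

	static int fsync_sketch(struct file *file, int datasync)
	{
		struct inode *inode = file->f_mapping->host;	/* backing inode */

		/* ... flush dirty data, then log the inode if it is dirty ... */
		return 0;
	}
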
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 9c8019c78c92..44f0b2de153e 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -585,11 +585,20 @@ xfs_vn_fallocate(
585 bf.l_len = len; 585 bf.l_len = len;
586 586
587 xfs_ilock(ip, XFS_IOLOCK_EXCL); 587 xfs_ilock(ip, XFS_IOLOCK_EXCL);
588
589 /* check the new inode size is valid before allocating */
590 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
591 offset + len > i_size_read(inode)) {
592 new_size = offset + len;
593 error = inode_newsize_ok(inode, new_size);
594 if (error)
595 goto out_unlock;
596 }
597
588 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, 598 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
589 0, XFS_ATTR_NOLOCK); 599 0, XFS_ATTR_NOLOCK);
590 if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && 600 if (error)
591 offset + len > i_size_read(inode)) 601 goto out_unlock;
592 new_size = offset + len;
593 602
594 /* Change file size if needed */ 603 /* Change file size if needed */
595 if (new_size) { 604 if (new_size) {
@@ -600,6 +609,7 @@ xfs_vn_fallocate(
600 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); 609 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
601 } 610 }
602 611
612out_unlock:
603 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 613 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
604out_error: 614out_error:
605 return error; 615 return error;
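
The reordering above runs inode_newsize_ok() before any blocks are reserved,
so a request past the rlimit or s_maxbytes fails cleanly instead of
allocating space that must then be backed out. The essential ordering, as a
sketch (i_size_read and inode_newsize_ok are the real API; reserve_space and
update_isize are hypothetical stand-ins):

	static int fallocate_sketch(struct inode *inode, int keep_size,
				    loff_t offset, loff_t len)
	{
		loff_t new_size = 0;
		int error;

		if (!keep_size && offset + len > i_size_read(inode)) {
			new_size = offset + len;
			error = inode_newsize_ok(inode, new_size); /* limits first */
			if (error)
				return error;	/* nothing reserved yet */
		}

		error = reserve_space(inode, offset, len);	/* hypothetical */
		if (error)
			return error;

		if (new_size)
			error = update_isize(inode, new_size);	/* hypothetical */
		return error;
	}
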
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 9ac8aea91529..067cafbfc635 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -23,7 +23,6 @@
23#include "xfs_ag.h" 23#include "xfs_ag.h"
24#include "xfs_mount.h" 24#include "xfs_mount.h"
25#include "xfs_quota.h" 25#include "xfs_quota.h"
26#include "xfs_log.h"
27#include "xfs_trans.h" 26#include "xfs_trans.h"
28#include "xfs_bmap_btree.h" 27#include "xfs_bmap_btree.h"
29#include "xfs_inode.h" 28#include "xfs_inode.h"
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 3884e20bc14e..ef7f0218bccb 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -164,10 +164,6 @@ xfs_inode_ag_iterator(
164 struct xfs_perag *pag; 164 struct xfs_perag *pag;
165 165
166 pag = xfs_perag_get(mp, ag); 166 pag = xfs_perag_get(mp, ag);
167 if (!pag->pag_ici_init) {
168 xfs_perag_put(pag);
169 continue;
170 }
171 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, 167 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
172 exclusive, &nr); 168 exclusive, &nr);
173 xfs_perag_put(pag); 169 xfs_perag_put(pag);
@@ -867,12 +863,7 @@ xfs_reclaim_inode_shrink(
867 down_read(&xfs_mount_list_lock); 863 down_read(&xfs_mount_list_lock);
868 list_for_each_entry(mp, &xfs_mount_list, m_mplist) { 864 list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
869 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 865 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
870
871 pag = xfs_perag_get(mp, ag); 866 pag = xfs_perag_get(mp, ag);
872 if (!pag->pag_ici_init) {
873 xfs_perag_put(pag);
874 continue;
875 }
876 reclaimable += pag->pag_ici_reclaimable; 867 reclaimable += pag->pag_ici_reclaimable;
877 xfs_perag_put(pag); 868 xfs_perag_put(pag);
878 } 869 }
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 207fa77f63ae..d12be8470cba 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -50,7 +50,6 @@
50#include "quota/xfs_dquot_item.h" 50#include "quota/xfs_dquot_item.h"
51#include "quota/xfs_dquot.h" 51#include "quota/xfs_dquot.h"
52#include "xfs_log_recover.h" 52#include "xfs_log_recover.h"
53#include "xfs_buf_item.h"
54#include "xfs_inode_item.h" 53#include "xfs_inode_item.h"
55 54
56/* 55/*
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index ff6bc797baf2..73d5aa117384 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -82,33 +82,6 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class,
82 ) 82 )
83) 83)
84 84
85#define DEFINE_PERAG_REF_EVENT(name) \
86TRACE_EVENT(name, \
87 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
88 unsigned long caller_ip), \
89 TP_ARGS(mp, agno, refcount, caller_ip), \
90 TP_STRUCT__entry( \
91 __field(dev_t, dev) \
92 __field(xfs_agnumber_t, agno) \
93 __field(int, refcount) \
94 __field(unsigned long, caller_ip) \
95 ), \
96 TP_fast_assign( \
97 __entry->dev = mp->m_super->s_dev; \
98 __entry->agno = agno; \
99 __entry->refcount = refcount; \
100 __entry->caller_ip = caller_ip; \
101 ), \
102 TP_printk("dev %d:%d agno %u refcount %d caller %pf", \
103 MAJOR(__entry->dev), MINOR(__entry->dev), \
104 __entry->agno, \
105 __entry->refcount, \
106 (char *)__entry->caller_ip) \
107);
108
109DEFINE_PERAG_REF_EVENT(xfs_perag_get)
110DEFINE_PERAG_REF_EVENT(xfs_perag_put)
111
112#define DEFINE_ATTR_LIST_EVENT(name) \ 85#define DEFINE_ATTR_LIST_EVENT(name) \
113DEFINE_EVENT(xfs_attr_list_class, name, \ 86DEFINE_EVENT(xfs_attr_list_class, name, \
114 TP_PROTO(struct xfs_attr_list_context *ctx), \ 87 TP_PROTO(struct xfs_attr_list_context *ctx), \
@@ -122,6 +95,37 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
122DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); 95DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
123DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); 96DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
124 97
98DECLARE_EVENT_CLASS(xfs_perag_class,
99 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
100 unsigned long caller_ip),
101 TP_ARGS(mp, agno, refcount, caller_ip),
102 TP_STRUCT__entry(
103 __field(dev_t, dev)
104 __field(xfs_agnumber_t, agno)
105 __field(int, refcount)
106 __field(unsigned long, caller_ip)
107 ),
108 TP_fast_assign(
109 __entry->dev = mp->m_super->s_dev;
110 __entry->agno = agno;
111 __entry->refcount = refcount;
112 __entry->caller_ip = caller_ip;
113 ),
114 TP_printk("dev %d:%d agno %u refcount %d caller %pf",
115 MAJOR(__entry->dev), MINOR(__entry->dev),
116 __entry->agno,
117 __entry->refcount,
118 (char *)__entry->caller_ip)
119);
120
121#define DEFINE_PERAG_REF_EVENT(name) \
122DEFINE_EVENT(xfs_perag_class, name, \
123 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
124 unsigned long caller_ip), \
125 TP_ARGS(mp, agno, refcount, caller_ip))
126DEFINE_PERAG_REF_EVENT(xfs_perag_get);
127DEFINE_PERAG_REF_EVENT(xfs_perag_put);
128
125TRACE_EVENT(xfs_attr_list_node_descend, 129TRACE_EVENT(xfs_attr_list_node_descend,
126 TP_PROTO(struct xfs_attr_list_context *ctx, 130 TP_PROTO(struct xfs_attr_list_context *ctx,
127 struct xfs_da_node_entry *btree), 131 struct xfs_da_node_entry *btree),
@@ -775,165 +779,181 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
775DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); 779DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
776DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); 780DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
777 781
778#define DEFINE_RW_EVENT(name) \ 782DECLARE_EVENT_CLASS(xfs_file_class,
779TRACE_EVENT(name, \ 783 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
780 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ 784 TP_ARGS(ip, count, offset, flags),
781 TP_ARGS(ip, count, offset, flags), \ 785 TP_STRUCT__entry(
782 TP_STRUCT__entry( \ 786 __field(dev_t, dev)
783 __field(dev_t, dev) \ 787 __field(xfs_ino_t, ino)
784 __field(xfs_ino_t, ino) \ 788 __field(xfs_fsize_t, size)
785 __field(xfs_fsize_t, size) \ 789 __field(xfs_fsize_t, new_size)
786 __field(xfs_fsize_t, new_size) \ 790 __field(loff_t, offset)
787 __field(loff_t, offset) \ 791 __field(size_t, count)
788 __field(size_t, count) \ 792 __field(int, flags)
789 __field(int, flags) \ 793 ),
790 ), \ 794 TP_fast_assign(
791 TP_fast_assign( \ 795 __entry->dev = VFS_I(ip)->i_sb->s_dev;
792 __entry->dev = VFS_I(ip)->i_sb->s_dev; \ 796 __entry->ino = ip->i_ino;
793 __entry->ino = ip->i_ino; \ 797 __entry->size = ip->i_d.di_size;
794 __entry->size = ip->i_d.di_size; \ 798 __entry->new_size = ip->i_new_size;
795 __entry->new_size = ip->i_new_size; \ 799 __entry->offset = offset;
796 __entry->offset = offset; \ 800 __entry->count = count;
797 __entry->count = count; \ 801 __entry->flags = flags;
798 __entry->flags = flags; \ 802 ),
799 ), \ 803 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
800 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ 804 "offset 0x%llx count 0x%zx ioflags %s",
801 "offset 0x%llx count 0x%zx ioflags %s", \ 805 MAJOR(__entry->dev), MINOR(__entry->dev),
802 MAJOR(__entry->dev), MINOR(__entry->dev), \ 806 __entry->ino,
803 __entry->ino, \ 807 __entry->size,
804 __entry->size, \ 808 __entry->new_size,
805 __entry->new_size, \ 809 __entry->offset,
806 __entry->offset, \ 810 __entry->count,
807 __entry->count, \ 811 __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
808 __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) \
809) 812)
813
814#define DEFINE_RW_EVENT(name) \
815DEFINE_EVENT(xfs_file_class, name, \
816 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
817 TP_ARGS(ip, count, offset, flags))
810DEFINE_RW_EVENT(xfs_file_read); 818DEFINE_RW_EVENT(xfs_file_read);
811DEFINE_RW_EVENT(xfs_file_buffered_write); 819DEFINE_RW_EVENT(xfs_file_buffered_write);
812DEFINE_RW_EVENT(xfs_file_direct_write); 820DEFINE_RW_EVENT(xfs_file_direct_write);
813DEFINE_RW_EVENT(xfs_file_splice_read); 821DEFINE_RW_EVENT(xfs_file_splice_read);
814DEFINE_RW_EVENT(xfs_file_splice_write); 822DEFINE_RW_EVENT(xfs_file_splice_write);
815 823
816 824DECLARE_EVENT_CLASS(xfs_page_class,
817#define DEFINE_PAGE_EVENT(name) \ 825 TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
818TRACE_EVENT(name, \ 826 TP_ARGS(inode, page, off),
819 TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \ 827 TP_STRUCT__entry(
820 TP_ARGS(inode, page, off), \ 828 __field(dev_t, dev)
821 TP_STRUCT__entry( \ 829 __field(xfs_ino_t, ino)
822 __field(dev_t, dev) \ 830 __field(pgoff_t, pgoff)
823 __field(xfs_ino_t, ino) \ 831 __field(loff_t, size)
824 __field(pgoff_t, pgoff) \ 832 __field(unsigned long, offset)
825 __field(loff_t, size) \ 833 __field(int, delalloc)
826 __field(unsigned long, offset) \ 834 __field(int, unmapped)
827 __field(int, delalloc) \ 835 __field(int, unwritten)
828 __field(int, unmapped) \ 836 ),
829 __field(int, unwritten) \ 837 TP_fast_assign(
830 ), \ 838 int delalloc = -1, unmapped = -1, unwritten = -1;
831 TP_fast_assign( \ 839
832 int delalloc = -1, unmapped = -1, unwritten = -1; \ 840 if (page_has_buffers(page))
833 \ 841 xfs_count_page_state(page, &delalloc,
834 if (page_has_buffers(page)) \ 842 &unmapped, &unwritten);
835 xfs_count_page_state(page, &delalloc, \ 843 __entry->dev = inode->i_sb->s_dev;
836 &unmapped, &unwritten); \ 844 __entry->ino = XFS_I(inode)->i_ino;
837 __entry->dev = inode->i_sb->s_dev; \ 845 __entry->pgoff = page_offset(page);
838 __entry->ino = XFS_I(inode)->i_ino; \ 846 __entry->size = i_size_read(inode);
839 __entry->pgoff = page_offset(page); \ 847 __entry->offset = off;
840 __entry->size = i_size_read(inode); \ 848 __entry->delalloc = delalloc;
841 __entry->offset = off; \ 849 __entry->unmapped = unmapped;
842 __entry->delalloc = delalloc; \ 850 __entry->unwritten = unwritten;
843 __entry->unmapped = unmapped; \ 851 ),
844 __entry->unwritten = unwritten; \ 852 TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
845 ), \ 853 "delalloc %d unmapped %d unwritten %d",
846 TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " \ 854 MAJOR(__entry->dev), MINOR(__entry->dev),
847 "delalloc %d unmapped %d unwritten %d", \ 855 __entry->ino,
848 MAJOR(__entry->dev), MINOR(__entry->dev), \ 856 __entry->pgoff,
849 __entry->ino, \ 857 __entry->size,
850 __entry->pgoff, \ 858 __entry->offset,
851 __entry->size, \ 859 __entry->delalloc,
852 __entry->offset, \ 860 __entry->unmapped,
853 __entry->delalloc, \ 861 __entry->unwritten)
854 __entry->unmapped, \
855 __entry->unwritten) \
856) 862)
863
864#define DEFINE_PAGE_EVENT(name) \
865DEFINE_EVENT(xfs_page_class, name, \
866 TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
867 TP_ARGS(inode, page, off))
857DEFINE_PAGE_EVENT(xfs_writepage); 868DEFINE_PAGE_EVENT(xfs_writepage);
858DEFINE_PAGE_EVENT(xfs_releasepage); 869DEFINE_PAGE_EVENT(xfs_releasepage);
859DEFINE_PAGE_EVENT(xfs_invalidatepage); 870DEFINE_PAGE_EVENT(xfs_invalidatepage);
860 871
861#define DEFINE_IOMAP_EVENT(name) \ 872DECLARE_EVENT_CLASS(xfs_iomap_class,
862TRACE_EVENT(name, \ 873 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
863 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ 874 int flags, struct xfs_bmbt_irec *irec),
864 int flags, struct xfs_bmbt_irec *irec), \ 875 TP_ARGS(ip, offset, count, flags, irec),
865 TP_ARGS(ip, offset, count, flags, irec), \ 876 TP_STRUCT__entry(
866 TP_STRUCT__entry( \ 877 __field(dev_t, dev)
867 __field(dev_t, dev) \ 878 __field(xfs_ino_t, ino)
868 __field(xfs_ino_t, ino) \ 879 __field(loff_t, size)
869 __field(loff_t, size) \ 880 __field(loff_t, new_size)
870 __field(loff_t, new_size) \ 881 __field(loff_t, offset)
871 __field(loff_t, offset) \ 882 __field(size_t, count)
872 __field(size_t, count) \ 883 __field(int, flags)
873 __field(int, flags) \ 884 __field(xfs_fileoff_t, startoff)
874 __field(xfs_fileoff_t, startoff) \ 885 __field(xfs_fsblock_t, startblock)
875 __field(xfs_fsblock_t, startblock) \ 886 __field(xfs_filblks_t, blockcount)
876 __field(xfs_filblks_t, blockcount) \ 887 ),
877 ), \ 888 TP_fast_assign(
878 TP_fast_assign( \ 889 __entry->dev = VFS_I(ip)->i_sb->s_dev;
879 __entry->dev = VFS_I(ip)->i_sb->s_dev; \ 890 __entry->ino = ip->i_ino;
880 __entry->ino = ip->i_ino; \ 891 __entry->size = ip->i_d.di_size;
881 __entry->size = ip->i_d.di_size; \ 892 __entry->new_size = ip->i_new_size;
882 __entry->new_size = ip->i_new_size; \ 893 __entry->offset = offset;
883 __entry->offset = offset; \ 894 __entry->count = count;
884 __entry->count = count; \ 895 __entry->flags = flags;
885 __entry->flags = flags; \ 896 __entry->startoff = irec ? irec->br_startoff : 0;
886 __entry->startoff = irec ? irec->br_startoff : 0; \ 897 __entry->startblock = irec ? irec->br_startblock : 0;
887 __entry->startblock = irec ? irec->br_startblock : 0; \ 898 __entry->blockcount = irec ? irec->br_blockcount : 0;
888 __entry->blockcount = irec ? irec->br_blockcount : 0; \ 899 ),
889 ), \ 900 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
890 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ 901 "offset 0x%llx count %zd flags %s "
891 "offset 0x%llx count %zd flags %s " \ 902 "startoff 0x%llx startblock %lld blockcount 0x%llx",
892 "startoff 0x%llx startblock %lld blockcount 0x%llx", \ 903 MAJOR(__entry->dev), MINOR(__entry->dev),
893 MAJOR(__entry->dev), MINOR(__entry->dev), \ 904 __entry->ino,
894 __entry->ino, \ 905 __entry->size,
895 __entry->size, \ 906 __entry->new_size,
896 __entry->new_size, \ 907 __entry->offset,
897 __entry->offset, \ 908 __entry->count,
898 __entry->count, \ 909 __print_flags(__entry->flags, "|", BMAPI_FLAGS),
899 __print_flags(__entry->flags, "|", BMAPI_FLAGS), \ 910 __entry->startoff,
900 __entry->startoff, \ 911 (__int64_t)__entry->startblock,
901 (__int64_t)__entry->startblock, \ 912 __entry->blockcount)
902 __entry->blockcount) \
903) 913)
914
915#define DEFINE_IOMAP_EVENT(name) \
916DEFINE_EVENT(xfs_iomap_class, name, \
917 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
918 int flags, struct xfs_bmbt_irec *irec), \
919 TP_ARGS(ip, offset, count, flags, irec))
904DEFINE_IOMAP_EVENT(xfs_iomap_enter); 920DEFINE_IOMAP_EVENT(xfs_iomap_enter);
905DEFINE_IOMAP_EVENT(xfs_iomap_found); 921DEFINE_IOMAP_EVENT(xfs_iomap_found);
906DEFINE_IOMAP_EVENT(xfs_iomap_alloc); 922DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
907 923
908#define DEFINE_SIMPLE_IO_EVENT(name) \ 924DECLARE_EVENT_CLASS(xfs_simple_io_class,
909TRACE_EVENT(name, \ 925 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
910 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \ 926 TP_ARGS(ip, offset, count),
911 TP_ARGS(ip, offset, count), \ 927 TP_STRUCT__entry(
912 TP_STRUCT__entry( \ 928 __field(dev_t, dev)
913 __field(dev_t, dev) \ 929 __field(xfs_ino_t, ino)
914 __field(xfs_ino_t, ino) \ 930 __field(loff_t, size)
915 __field(loff_t, size) \ 931 __field(loff_t, new_size)
916 __field(loff_t, new_size) \ 932 __field(loff_t, offset)
917 __field(loff_t, offset) \ 933 __field(size_t, count)
918 __field(size_t, count) \ 934 ),
919 ), \ 935 TP_fast_assign(
920 TP_fast_assign( \ 936 __entry->dev = VFS_I(ip)->i_sb->s_dev;
921 __entry->dev = VFS_I(ip)->i_sb->s_dev; \ 937 __entry->ino = ip->i_ino;
922 __entry->ino = ip->i_ino; \ 938 __entry->size = ip->i_d.di_size;
923 __entry->size = ip->i_d.di_size; \ 939 __entry->new_size = ip->i_new_size;
924 __entry->new_size = ip->i_new_size; \ 940 __entry->offset = offset;
925 __entry->offset = offset; \ 941 __entry->count = count;
926 __entry->count = count; \ 942 ),
927 ), \ 943 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
928 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ 944 "offset 0x%llx count %zd",
929 "offset 0x%llx count %zd", \ 945 MAJOR(__entry->dev), MINOR(__entry->dev),
930 MAJOR(__entry->dev), MINOR(__entry->dev), \ 946 __entry->ino,
931 __entry->ino, \ 947 __entry->size,
932 __entry->size, \ 948 __entry->new_size,
933 __entry->new_size, \ 949 __entry->offset,
934 __entry->offset, \ 950 __entry->count)
935 __entry->count) \
936); 951);
952
953#define DEFINE_SIMPLE_IO_EVENT(name) \
954DEFINE_EVENT(xfs_simple_io_class, name, \
955 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
956 TP_ARGS(ip, offset, count))
937DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); 957DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
938DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); 958DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
939 959
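
The four conversions above are one mechanical transform: the TRACE_EVENT body
moves into a DECLARE_EVENT_CLASS that is emitted once, and each tracepoint
becomes a cheap DEFINE_EVENT stamp, shrinking the generated tracing code. The
bare shape of the idiom, with made-up names:

	DECLARE_EVENT_CLASS(demo_class,
		TP_PROTO(int value),
		TP_ARGS(value),
		TP_STRUCT__entry(
			__field(int, value)
		),
		TP_fast_assign(
			__entry->value = value;
		),
		TP_printk("value %d", __entry->value)
	);

	#define DEFINE_DEMO_EVENT(name) \
	DEFINE_EVENT(demo_class, name, \
		TP_PROTO(int value), \
		TP_ARGS(value))
	DEFINE_DEMO_EVENT(demo_event_start);
	DEFINE_DEMO_EVENT(demo_event_done);
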
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 38e764146644..2d8b7bc792c9 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -249,8 +249,10 @@ xfs_qm_hold_quotafs_ref(
249 249
250 if (!xfs_Gqm) { 250 if (!xfs_Gqm) {
251 xfs_Gqm = xfs_Gqm_init(); 251 xfs_Gqm = xfs_Gqm_init();
252 if (!xfs_Gqm) 252 if (!xfs_Gqm) {
253 mutex_unlock(&xfs_Gqm_lock);
253 return ENOMEM; 254 return ENOMEM;
255 }
254 } 256 }
255 257
256 /* 258 /*
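
The xfs_qm change above is the classic lock-leak repair: the ENOMEM early
return inside the critical section previously left xfs_Gqm_lock held. The bug
class in a self-contained userspace sketch, all names illustrative:

	#include <errno.h>
	#include <pthread.h>
	#include <stddef.h>

	static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;
	static void *g_state;

	static int hold_ref(void *(*init_once)(void))
	{
		pthread_mutex_lock(&g_lock);
		if (!g_state) {
			g_state = init_once();
			if (!g_state) {
				/* the previously missing unlock */
				pthread_mutex_unlock(&g_lock);
				return ENOMEM;
			}
		}
		/* ... take the reference while still holding the lock ... */
		pthread_mutex_unlock(&g_lock);
		return 0;
	}
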
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 401f364ad36c..4917d4eed4ed 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -227,7 +227,6 @@ typedef struct xfs_perag {
227 227
228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
229 229
230 int pag_ici_init; /* incore inode cache initialised */
231 rwlock_t pag_ici_lock; /* incore inode lock */ 230 rwlock_t pag_ici_lock; /* incore inode lock */
232 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 231 struct radix_tree_root pag_ici_root; /* incore inode cache root */
233 int pag_ici_reclaimable; /* reclaimable inodes */ 232 int pag_ici_reclaimable; /* reclaimable inodes */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 6845db90818f..75df75f43d48 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -382,9 +382,6 @@ xfs_iget(
382 382
383 /* get the perag structure and ensure that it's inode capable */ 383 /* get the perag structure and ensure that it's inode capable */
384 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); 384 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
385 if (!pag->pagi_inodeok)
386 return EINVAL;
387 ASSERT(pag->pag_ici_init);
388 agino = XFS_INO_TO_AGINO(mp, ino); 385 agino = XFS_INO_TO_AGINO(mp, ino);
389 386
390again: 387again:
@@ -744,30 +741,24 @@ xfs_ilock_demote(
744} 741}
745 742
746#ifdef DEBUG 743#ifdef DEBUG
747/*
748 * Debug-only routine, without additional rw_semaphore APIs, we can
749 * now only answer requests regarding whether we hold the lock for write
750 * (reader state is outside our visibility, we only track writer state).
751 *
752 * Note: this means !xfs_isilocked would give false positives, so don't do that.
753 */
754int 744int
755xfs_isilocked( 745xfs_isilocked(
756 xfs_inode_t *ip, 746 xfs_inode_t *ip,
757 uint lock_flags) 747 uint lock_flags)
758{ 748{
759 if ((lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) == 749 if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
760 XFS_ILOCK_EXCL) { 750 if (!(lock_flags & XFS_ILOCK_SHARED))
761 if (!ip->i_lock.mr_writer) 751 return !!ip->i_lock.mr_writer;
762 return 0; 752 return rwsem_is_locked(&ip->i_lock.mr_lock);
763 } 753 }
764 754
765 if ((lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) == 755 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
766 XFS_IOLOCK_EXCL) { 756 if (!(lock_flags & XFS_IOLOCK_SHARED))
767 if (!ip->i_iolock.mr_writer) 757 return !!ip->i_iolock.mr_writer;
768 return 0; 758 return rwsem_is_locked(&ip->i_iolock.mr_lock);
769 } 759 }
770 760
771 return 1; 761 ASSERT(0);
762 return 0;
772} 763}
773#endif 764#endif
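
The rewritten xfs_isilocked() can finally answer shared queries: an
exclusive-only request still consults the mrlock writer flag, while a
shared-or-exclusive request falls back to rwsem_is_locked(), which is true
for any holder. The decision logic in a standalone toy form (the struct is a
stand-in, not the kernel's mrlock_t):

	#include <stdbool.h>

	struct mrlock_toy {
		bool	writer;		/* stand-in for mr_writer */
		int	readers;	/* reader state the old code never saw */
	};

	static bool toy_rwsem_is_locked(const struct mrlock_toy *l)
	{
		return l->writer || l->readers > 0;	/* any holder counts */
	}

	/* EXCL-only wants a writer; if SHARED is acceptable, any holder will do. */
	static bool toy_isilocked(const struct mrlock_toy *l, bool shared_ok)
	{
		if (!shared_ok)
			return l->writer;
		return toy_rwsem_is_locked(l);
	}
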
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8cd6e8d8fe9c..d53c39de7d05 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1940,10 +1940,10 @@ xfs_ifree_cluster(
1940 int blks_per_cluster; 1940 int blks_per_cluster;
1941 int nbufs; 1941 int nbufs;
1942 int ninodes; 1942 int ninodes;
1943 int i, j, found, pre_flushed; 1943 int i, j;
1944 xfs_daddr_t blkno; 1944 xfs_daddr_t blkno;
1945 xfs_buf_t *bp; 1945 xfs_buf_t *bp;
1946 xfs_inode_t *ip, **ip_found; 1946 xfs_inode_t *ip;
1947 xfs_inode_log_item_t *iip; 1947 xfs_inode_log_item_t *iip;
1948 xfs_log_item_t *lip; 1948 xfs_log_item_t *lip;
1949 struct xfs_perag *pag; 1949 struct xfs_perag *pag;
@@ -1960,114 +1960,97 @@ xfs_ifree_cluster(
1960 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; 1960 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
1961 } 1961 }
1962 1962
1963 ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
1964
1965 for (j = 0; j < nbufs; j++, inum += ninodes) { 1963 for (j = 0; j < nbufs; j++, inum += ninodes) {
1964 int found = 0;
1965
1966 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 1966 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
1967 XFS_INO_TO_AGBNO(mp, inum)); 1967 XFS_INO_TO_AGBNO(mp, inum));
1968 1968
1969 /*
1970 * We obtain and lock the backing buffer first in the process
1971 * here, as we have to ensure that any dirty inode that we
1972 * can't get the flush lock on is attached to the buffer.
1973 * If we scan the in-memory inodes first, then buffer IO can
1974 * complete before we get a lock on it, and hence we may fail
1975 * to mark all the active inodes on the buffer stale.
1976 */
1977 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
1978 mp->m_bsize * blks_per_cluster,
1979 XBF_LOCK);
1980
1981 /*
1982 * Walk the inodes already attached to the buffer and mark them
1983 * stale. These will all have the flush locks held, so an
1984 * in-memory inode walk can't lock them.
1985 */
1986 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1987 while (lip) {
1988 if (lip->li_type == XFS_LI_INODE) {
1989 iip = (xfs_inode_log_item_t *)lip;
1990 ASSERT(iip->ili_logged == 1);
1991 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
1992 xfs_trans_ail_copy_lsn(mp->m_ail,
1993 &iip->ili_flush_lsn,
1994 &iip->ili_item.li_lsn);
1995 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
1996 found++;
1997 }
1998 lip = lip->li_bio_list;
1999 }
1969 2000
1970 /* 2001 /*
1971 * Look for each inode in memory and attempt to lock it, 2002 * For each inode in memory attempt to add it to the inode
 1972 * we can be racing with flush and tail pushing here; 2003 * buffer and set it up for being staled on buffer IO
1973 * any inode we get the locks on, add to an array of 2004 * completion. This is safe as we've locked out tail pushing
1974 * inode items to process later. 2005 * and flushing by locking the buffer.
1975 * 2006 *
 1976 * If we get the buffer lock, we could beat a flush 2007 * We have already marked every inode that was part of a
1977 * or tail pushing thread to the lock here, in which 2008 * transaction stale above, which means there is no point in
1978 * case they will go looking for the inode buffer 2009 * even trying to lock them.
1979 * and fail, we need some other form of interlock
1980 * here.
1981 */ 2010 */
1982 found = 0;
1983 for (i = 0; i < ninodes; i++) { 2011 for (i = 0; i < ninodes; i++) {
1984 read_lock(&pag->pag_ici_lock); 2012 read_lock(&pag->pag_ici_lock);
1985 ip = radix_tree_lookup(&pag->pag_ici_root, 2013 ip = radix_tree_lookup(&pag->pag_ici_root,
1986 XFS_INO_TO_AGINO(mp, (inum + i))); 2014 XFS_INO_TO_AGINO(mp, (inum + i)));
1987 2015
1988 /* Inode not in memory or we found it already, 2016 /* Inode not in memory or stale, nothing to do */
1989 * nothing to do
1990 */
1991 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { 2017 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
1992 read_unlock(&pag->pag_ici_lock); 2018 read_unlock(&pag->pag_ici_lock);
1993 continue; 2019 continue;
1994 } 2020 }
1995 2021
1996 if (xfs_inode_clean(ip)) { 2022 /* don't try to lock/unlock the current inode */
1997 read_unlock(&pag->pag_ici_lock); 2023 if (ip != free_ip &&
1998 continue; 2024 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1999 }
2000
2001 /* If we can get the locks then add it to the
2002 * list, otherwise by the time we get the bp lock
2003 * below it will already be attached to the
2004 * inode buffer.
2005 */
2006
2007 /* This inode will already be locked - by us, lets
2008 * keep it that way.
2009 */
2010
2011 if (ip == free_ip) {
2012 if (xfs_iflock_nowait(ip)) {
2013 xfs_iflags_set(ip, XFS_ISTALE);
2014 if (xfs_inode_clean(ip)) {
2015 xfs_ifunlock(ip);
2016 } else {
2017 ip_found[found++] = ip;
2018 }
2019 }
2020 read_unlock(&pag->pag_ici_lock); 2025 read_unlock(&pag->pag_ici_lock);
2021 continue; 2026 continue;
2022 } 2027 }
2028 read_unlock(&pag->pag_ici_lock);
2023 2029
2024 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2030 if (!xfs_iflock_nowait(ip)) {
2025 if (xfs_iflock_nowait(ip)) { 2031 if (ip != free_ip)
2026 xfs_iflags_set(ip, XFS_ISTALE);
2027
2028 if (xfs_inode_clean(ip)) {
2029 xfs_ifunlock(ip);
2030 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2031 } else {
2032 ip_found[found++] = ip;
2033 }
2034 } else {
2035 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2032 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2036 } 2033 continue;
2037 } 2034 }
2038 read_unlock(&pag->pag_ici_lock);
2039 }
2040 2035
2041 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 2036 xfs_iflags_set(ip, XFS_ISTALE);
2042 mp->m_bsize * blks_per_cluster, 2037 if (xfs_inode_clean(ip)) {
2043 XBF_LOCK); 2038 ASSERT(ip != free_ip);
2044 2039 xfs_ifunlock(ip);
2045 pre_flushed = 0; 2040 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2046 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 2041 continue;
2047 while (lip) {
2048 if (lip->li_type == XFS_LI_INODE) {
2049 iip = (xfs_inode_log_item_t *)lip;
2050 ASSERT(iip->ili_logged == 1);
2051 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
2052 xfs_trans_ail_copy_lsn(mp->m_ail,
2053 &iip->ili_flush_lsn,
2054 &iip->ili_item.li_lsn);
2055 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
2056 pre_flushed++;
2057 } 2042 }
2058 lip = lip->li_bio_list;
2059 }
2060 2043
2061 for (i = 0; i < found; i++) {
2062 ip = ip_found[i];
2063 iip = ip->i_itemp; 2044 iip = ip->i_itemp;
2064
2065 if (!iip) { 2045 if (!iip) {
2046 /* inode with unlogged changes only */
2047 ASSERT(ip != free_ip);
2066 ip->i_update_core = 0; 2048 ip->i_update_core = 0;
2067 xfs_ifunlock(ip); 2049 xfs_ifunlock(ip);
2068 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2050 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2069 continue; 2051 continue;
2070 } 2052 }
2053 found++;
2071 2054
2072 iip->ili_last_fields = iip->ili_format.ilf_fields; 2055 iip->ili_last_fields = iip->ili_format.ilf_fields;
2073 iip->ili_format.ilf_fields = 0; 2056 iip->ili_format.ilf_fields = 0;
@@ -2078,17 +2061,16 @@ xfs_ifree_cluster(
2078 xfs_buf_attach_iodone(bp, 2061 xfs_buf_attach_iodone(bp,
2079 (void(*)(xfs_buf_t*,xfs_log_item_t*)) 2062 (void(*)(xfs_buf_t*,xfs_log_item_t*))
2080 xfs_istale_done, (xfs_log_item_t *)iip); 2063 xfs_istale_done, (xfs_log_item_t *)iip);
2081 if (ip != free_ip) { 2064
2065 if (ip != free_ip)
2082 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2066 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2083 }
2084 } 2067 }
2085 2068
2086 if (found || pre_flushed) 2069 if (found)
2087 xfs_trans_stale_inode_buf(tp, bp); 2070 xfs_trans_stale_inode_buf(tp, bp);
2088 xfs_trans_binval(tp, bp); 2071 xfs_trans_binval(tp, bp);
2089 } 2072 }
2090 2073
2091 kmem_free(ip_found);
2092 xfs_perag_put(pag); 2074 xfs_perag_put(pag);
2093} 2075}
2094 2076
@@ -2649,8 +2631,6 @@ xfs_iflush_cluster(
2649 int i; 2631 int i;
2650 2632
2651 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2633 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2652 ASSERT(pag->pagi_inodeok);
2653 ASSERT(pag->pag_ici_init);
2654 2634
2655 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; 2635 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
2656 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2636 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 14a69aec2c0b..ed0684cc50ee 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -132,15 +132,10 @@ xlog_align(
132 int nbblks, 132 int nbblks,
133 xfs_buf_t *bp) 133 xfs_buf_t *bp)
134{ 134{
135 xfs_daddr_t offset; 135 xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
136 xfs_caddr_t ptr;
137 136
138 offset = blk_no & ((xfs_daddr_t) log->l_sectBBsize - 1); 137 ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp));
139 ptr = XFS_BUF_PTR(bp) + BBTOB(offset); 138 return XFS_BUF_PTR(bp) + BBTOB(offset);
140
141 ASSERT(ptr + BBTOB(nbblks) <= XFS_BUF_PTR(bp) + XFS_BUF_SIZE(bp));
142
143 return ptr;
144} 139}
145 140
146 141
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d7bf38c8cd1c..d59f4e8bedcf 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -268,10 +268,10 @@ xfs_sb_validate_fsb_count(
268 268
269#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */ 269#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */
270 if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) 270 if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
271 return E2BIG; 271 return EFBIG;
272#else /* Limited by UINT_MAX of sectors */ 272#else /* Limited by UINT_MAX of sectors */
273 if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX) 273 if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
274 return E2BIG; 274 return EFBIG;
275#endif 275#endif
276 return 0; 276 return 0;
277} 277}
@@ -393,7 +393,7 @@ xfs_mount_validate_sb(
393 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 393 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
394 xfs_fs_mount_cmn_err(flags, 394 xfs_fs_mount_cmn_err(flags,
395 "file system too large to be mounted on this system."); 395 "file system too large to be mounted on this system.");
396 return XFS_ERROR(E2BIG); 396 return XFS_ERROR(EFBIG);
397 } 397 }
398 398
399 if (unlikely(sbp->sb_inprogress)) { 399 if (unlikely(sbp->sb_inprogress)) {
@@ -413,17 +413,6 @@ xfs_mount_validate_sb(
413 return 0; 413 return 0;
414} 414}
415 415
416STATIC void
417xfs_initialize_perag_icache(
418 xfs_perag_t *pag)
419{
420 if (!pag->pag_ici_init) {
421 rwlock_init(&pag->pag_ici_lock);
422 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
423 pag->pag_ici_init = 1;
424 }
425}
426
427int 416int
428xfs_initialize_perag( 417xfs_initialize_perag(
429 xfs_mount_t *mp, 418 xfs_mount_t *mp,
@@ -436,13 +425,8 @@ xfs_initialize_perag(
436 xfs_agino_t agino; 425 xfs_agino_t agino;
437 xfs_ino_t ino; 426 xfs_ino_t ino;
438 xfs_sb_t *sbp = &mp->m_sb; 427 xfs_sb_t *sbp = &mp->m_sb;
439 xfs_ino_t max_inum = XFS_MAXINUMBER_32;
440 int error = -ENOMEM; 428 int error = -ENOMEM;
441 429
442 /* Check to see if the filesystem can overflow 32 bit inodes */
443 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
444 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
445
446 /* 430 /*
447 * Walk the current per-ag tree so we don't try to initialise AGs 431 * Walk the current per-ag tree so we don't try to initialise AGs
448 * that already exist (growfs case). Allocate and insert all the 432 * that already exist (growfs case). Allocate and insert all the
@@ -456,11 +440,18 @@ xfs_initialize_perag(
456 } 440 }
457 if (!first_initialised) 441 if (!first_initialised)
458 first_initialised = index; 442 first_initialised = index;
443
459 pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); 444 pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
460 if (!pag) 445 if (!pag)
461 goto out_unwind; 446 goto out_unwind;
447 pag->pag_agno = index;
448 pag->pag_mount = mp;
449 rwlock_init(&pag->pag_ici_lock);
450 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
451
462 if (radix_tree_preload(GFP_NOFS)) 452 if (radix_tree_preload(GFP_NOFS))
463 goto out_unwind; 453 goto out_unwind;
454
464 spin_lock(&mp->m_perag_lock); 455 spin_lock(&mp->m_perag_lock);
465 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { 456 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
466 BUG(); 457 BUG();
@@ -469,25 +460,26 @@ xfs_initialize_perag(
469 error = -EEXIST; 460 error = -EEXIST;
470 goto out_unwind; 461 goto out_unwind;
471 } 462 }
472 pag->pag_agno = index;
473 pag->pag_mount = mp;
474 spin_unlock(&mp->m_perag_lock); 463 spin_unlock(&mp->m_perag_lock);
475 radix_tree_preload_end(); 464 radix_tree_preload_end();
476 } 465 }
477 466
478 /* Clear the mount flag if no inode can overflow 32 bits 467 /*
479 * on this filesystem, or if specifically requested.. 468 * If we mount with the inode64 option, or no inode overflows
469 * the legacy 32-bit address space clear the inode32 option.
480 */ 470 */
481 if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > max_inum) { 471 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
472 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
473
474 if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
482 mp->m_flags |= XFS_MOUNT_32BITINODES; 475 mp->m_flags |= XFS_MOUNT_32BITINODES;
483 } else { 476 else
484 mp->m_flags &= ~XFS_MOUNT_32BITINODES; 477 mp->m_flags &= ~XFS_MOUNT_32BITINODES;
485 }
486 478
487 /* If we can overflow then setup the ag headers accordingly */
488 if (mp->m_flags & XFS_MOUNT_32BITINODES) { 479 if (mp->m_flags & XFS_MOUNT_32BITINODES) {
489 /* Calculate how much should be reserved for inodes to 480 /*
490 * meet the max inode percentage. 481 * Calculate how much should be reserved for inodes to meet
482 * the max inode percentage.
491 */ 483 */
492 if (mp->m_maxicount) { 484 if (mp->m_maxicount) {
493 __uint64_t icount; 485 __uint64_t icount;
@@ -500,30 +492,28 @@ xfs_initialize_perag(
500 } else { 492 } else {
501 max_metadata = agcount; 493 max_metadata = agcount;
502 } 494 }
495
503 for (index = 0; index < agcount; index++) { 496 for (index = 0; index < agcount; index++) {
504 ino = XFS_AGINO_TO_INO(mp, index, agino); 497 ino = XFS_AGINO_TO_INO(mp, index, agino);
505 if (ino > max_inum) { 498 if (ino > XFS_MAXINUMBER_32) {
506 index++; 499 index++;
507 break; 500 break;
508 } 501 }
509 502
510 /* This ag is preferred for inodes */
511 pag = xfs_perag_get(mp, index); 503 pag = xfs_perag_get(mp, index);
512 pag->pagi_inodeok = 1; 504 pag->pagi_inodeok = 1;
513 if (index < max_metadata) 505 if (index < max_metadata)
514 pag->pagf_metadata = 1; 506 pag->pagf_metadata = 1;
515 xfs_initialize_perag_icache(pag);
516 xfs_perag_put(pag); 507 xfs_perag_put(pag);
517 } 508 }
518 } else { 509 } else {
519 /* Setup default behavior for smaller filesystems */
520 for (index = 0; index < agcount; index++) { 510 for (index = 0; index < agcount; index++) {
521 pag = xfs_perag_get(mp, index); 511 pag = xfs_perag_get(mp, index);
522 pag->pagi_inodeok = 1; 512 pag->pagi_inodeok = 1;
523 xfs_initialize_perag_icache(pag);
524 xfs_perag_put(pag); 513 xfs_perag_put(pag);
525 } 514 }
526 } 515 }
516
527 if (maxagi) 517 if (maxagi)
528 *maxagi = index; 518 *maxagi = index;
529 return 0; 519 return 0;
@@ -1009,7 +999,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1009 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 999 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
1010 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 1000 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
1011 cmn_err(CE_WARN, "XFS: size check 1 failed"); 1001 cmn_err(CE_WARN, "XFS: size check 1 failed");
1012 return XFS_ERROR(E2BIG); 1002 return XFS_ERROR(EFBIG);
1013 } 1003 }
1014 error = xfs_read_buf(mp, mp->m_ddev_targp, 1004 error = xfs_read_buf(mp, mp->m_ddev_targp,
1015 d - XFS_FSS_TO_BB(mp, 1), 1005 d - XFS_FSS_TO_BB(mp, 1),
@@ -1019,7 +1009,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1019 } else { 1009 } else {
1020 cmn_err(CE_WARN, "XFS: size check 2 failed"); 1010 cmn_err(CE_WARN, "XFS: size check 2 failed");
1021 if (error == ENOSPC) 1011 if (error == ENOSPC)
1022 error = XFS_ERROR(E2BIG); 1012 error = XFS_ERROR(EFBIG);
1023 return error; 1013 return error;
1024 } 1014 }
1025 1015
@@ -1027,7 +1017,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1027 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 1017 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
1028 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 1018 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
1029 cmn_err(CE_WARN, "XFS: size check 3 failed"); 1019 cmn_err(CE_WARN, "XFS: size check 3 failed");
1030 return XFS_ERROR(E2BIG); 1020 return XFS_ERROR(EFBIG);
1031 } 1021 }
1032 error = xfs_read_buf(mp, mp->m_logdev_targp, 1022 error = xfs_read_buf(mp, mp->m_logdev_targp,
1033 d - XFS_FSB_TO_BB(mp, 1), 1023 d - XFS_FSB_TO_BB(mp, 1),
@@ -1037,7 +1027,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1037 } else { 1027 } else {
1038 cmn_err(CE_WARN, "XFS: size check 3 failed"); 1028 cmn_err(CE_WARN, "XFS: size check 3 failed");
1039 if (error == ENOSPC) 1029 if (error == ENOSPC)
1040 error = XFS_ERROR(E2BIG); 1030 error = XFS_ERROR(EFBIG);
1041 return error; 1031 return error;
1042 } 1032 }
1043 } 1033 }
@@ -1254,7 +1244,7 @@ xfs_mountfs(
1254 * Allocate and initialize the per-ag data. 1244 * Allocate and initialize the per-ag data.
1255 */ 1245 */
1256 spin_lock_init(&mp->m_perag_lock); 1246 spin_lock_init(&mp->m_perag_lock);
1257 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_NOFS); 1247 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
1258 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); 1248 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
1259 if (error) { 1249 if (error) {
1260 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error); 1250 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6be05f756d59..16445518506d 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -2247,7 +2247,7 @@ xfs_rtmount_init(
2247 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu", 2247 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu",
2248 (unsigned long long) XFS_BB_TO_FSB(mp, d), 2248 (unsigned long long) XFS_BB_TO_FSB(mp, d),
2249 (unsigned long long) mp->m_sb.sb_rblocks); 2249 (unsigned long long) mp->m_sb.sb_rblocks);
2250 return XFS_ERROR(E2BIG); 2250 return XFS_ERROR(EFBIG);
2251 } 2251 }
2252 error = xfs_read_buf(mp, mp->m_rtdev_targp, 2252 error = xfs_read_buf(mp, mp->m_rtdev_targp,
2253 d - XFS_FSB_TO_BB(mp, 1), 2253 d - XFS_FSB_TO_BB(mp, 1),
@@ -2256,7 +2256,7 @@ xfs_rtmount_init(
2256 cmn_err(CE_WARN, 2256 cmn_err(CE_WARN,
2257 "XFS: realtime mount -- xfs_read_buf failed, returned %d", error); 2257 "XFS: realtime mount -- xfs_read_buf failed, returned %d", error);
2258 if (error == ENOSPC) 2258 if (error == ENOSPC)
2259 return XFS_ERROR(E2BIG); 2259 return XFS_ERROR(EFBIG);
2260 return error; 2260 return error;
2261 } 2261 }
2262 xfs_buf_relse(bp); 2262 xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index b2d67adb6a08..ff614c29b441 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -147,7 +147,16 @@ xfs_growfs_rt(
147# define xfs_rtfree_extent(t,b,l) (ENOSYS) 147# define xfs_rtfree_extent(t,b,l) (ENOSYS)
148# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) 148# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
149# define xfs_growfs_rt(mp,in) (ENOSYS) 149# define xfs_growfs_rt(mp,in) (ENOSYS)
150# define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 150static inline int /* error */
151xfs_rtmount_init(
152 xfs_mount_t *mp) /* file system mount structure */
153{
154 if (mp->m_sb.sb_rblocks == 0)
155 return 0;
156
157 cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT");
158 return ENOSYS;
159}
151# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 160# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
152# define xfs_rtunmount_inodes(m) 161# define xfs_rtunmount_inodes(m)
153#endif /* CONFIG_XFS_RT */ 162#endif /* CONFIG_XFS_RT */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index ce558efa2ea0..28547dfce037 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -48,134 +48,489 @@
48 48
49kmem_zone_t *xfs_trans_zone; 49kmem_zone_t *xfs_trans_zone;
50 50
51
51/* 52/*
52 * Reservation functions here avoid a huge stack in xfs_trans_init 53 * Various log reservation values.
53 * due to register overflow from temporaries in the calculations. 54 *
55 * These are based on the size of the file system block because that is what
56 * most transactions manipulate. Each adds in an additional 128 bytes per
57 * item logged to try to account for the overhead of the transaction mechanism.
58 *
59 * Note: Most of the reservations underestimate the number of allocation
60 * groups into which they could free extents in the xfs_bmap_finish() call.
61 * This is because the number in the worst case is quite high and quite
62 * unusual. In order to fix this we need to change xfs_bmap_finish() to free
63 * extents in only a single AG at a time. This will require changes to the
64 * EFI code as well, however, so that the EFI for the extents not freed is
65 * logged again in each transaction. See SGI PV #261917.
66 *
67 * Reservation functions here avoid a huge stack in xfs_trans_init due to
68 * register overflow from temporaries in the calculations.
69 */
70
71
72/*
73 * In a write transaction we can allocate a maximum of 2
74 * extents. This gives:
75 * the inode getting the new extents: inode size
76 * the inode's bmap btree: max depth * block size
77 * the agfs of the ags from which the extents are allocated: 2 * sector
78 * the superblock free block counter: sector size
79 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
80 * And the bmap_finish transaction can free bmap blocks in a join:
81 * the agfs of the ags containing the blocks: 2 * sector size
82 * the agfls of the ags containing the blocks: 2 * sector size
83 * the super block free block counter: sector size
84 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
54 */ 85 */
55STATIC uint 86STATIC uint
56xfs_calc_write_reservation(xfs_mount_t *mp) 87xfs_calc_write_reservation(
88 struct xfs_mount *mp)
57{ 89{
58 return XFS_CALC_WRITE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 90 return XFS_DQUOT_LOGRES(mp) +
91 MAX((mp->m_sb.sb_inodesize +
92 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
93 2 * mp->m_sb.sb_sectsize +
94 mp->m_sb.sb_sectsize +
95 XFS_ALLOCFREE_LOG_RES(mp, 2) +
96 128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
97 XFS_ALLOCFREE_LOG_COUNT(mp, 2))),
98 (2 * mp->m_sb.sb_sectsize +
99 2 * mp->m_sb.sb_sectsize +
100 mp->m_sb.sb_sectsize +
101 XFS_ALLOCFREE_LOG_RES(mp, 2) +
102 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
59} 103}
60 104
105/*
106 * In truncating a file we free up to two extents at once. We can modify:
107 * the inode being truncated: inode size
108 * the inode's bmap btree: (max depth + 1) * block size
109 * And the bmap_finish transaction can free the blocks and bmap blocks:
110 * the agf for each of the ags: 4 * sector size
111 * the agfl for each of the ags: 4 * sector size
112 * the super block to reflect the freed blocks: sector size
113 * worst case split in allocation btrees per extent assuming 4 extents:
114 * 4 exts * 2 trees * (2 * max depth - 1) * block size
115 * the inode btree: max depth * blocksize
116 * the allocation btrees: 2 trees * (max depth - 1) * block size
117 */
61STATIC uint 118STATIC uint
62xfs_calc_itruncate_reservation(xfs_mount_t *mp) 119xfs_calc_itruncate_reservation(
120 struct xfs_mount *mp)
63{ 121{
64 return XFS_CALC_ITRUNCATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 122 return XFS_DQUOT_LOGRES(mp) +
123 MAX((mp->m_sb.sb_inodesize +
124 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) +
125 128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
126 (4 * mp->m_sb.sb_sectsize +
127 4 * mp->m_sb.sb_sectsize +
128 mp->m_sb.sb_sectsize +
129 XFS_ALLOCFREE_LOG_RES(mp, 4) +
130 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) +
131 128 * 5 +
132 XFS_ALLOCFREE_LOG_RES(mp, 1) +
133 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
134 XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
65} 135}
66 136
137/*
138 * In renaming a files we can modify:
139 * the four inodes involved: 4 * inode size
140 * the two directory btrees: 2 * (max depth + v2) * dir block size
141 * the two directory bmap btrees: 2 * max depth * block size
142 * And the bmap_finish transaction can free dir and bmap blocks (two sets
143 * of bmap blocks) giving:
144 * the agf for the ags in which the blocks live: 3 * sector size
145 * the agfl for the ags in which the blocks live: 3 * sector size
146 * the superblock for the free block count: sector size
147 * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
148 */
67STATIC uint 149STATIC uint
68xfs_calc_rename_reservation(xfs_mount_t *mp) 150xfs_calc_rename_reservation(
151 struct xfs_mount *mp)
69{ 152{
70 return XFS_CALC_RENAME_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 153 return XFS_DQUOT_LOGRES(mp) +
154 MAX((4 * mp->m_sb.sb_inodesize +
155 2 * XFS_DIROP_LOG_RES(mp) +
156 128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))),
157 (3 * mp->m_sb.sb_sectsize +
158 3 * mp->m_sb.sb_sectsize +
159 mp->m_sb.sb_sectsize +
160 XFS_ALLOCFREE_LOG_RES(mp, 3) +
161 128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))));
71} 162}
72 163
164/*
165 * For creating a link to an inode:
166 * the parent directory inode: inode size
167 * the linked inode: inode size
168 * the directory btree could split: (max depth + v2) * dir block size
169 * the directory bmap btree could join or split: (max depth + v2) * blocksize
170 * And the bmap_finish transaction can free some bmap blocks giving:
171 * the agf for the ag in which the blocks live: sector size
172 * the agfl for the ag in which the blocks live: sector size
173 * the superblock for the free block count: sector size
174 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
175 */
73STATIC uint 176STATIC uint
74xfs_calc_link_reservation(xfs_mount_t *mp) 177xfs_calc_link_reservation(
178 struct xfs_mount *mp)
75{ 179{
76 return XFS_CALC_LINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 180 return XFS_DQUOT_LOGRES(mp) +
181 MAX((mp->m_sb.sb_inodesize +
182 mp->m_sb.sb_inodesize +
183 XFS_DIROP_LOG_RES(mp) +
184 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
185 (mp->m_sb.sb_sectsize +
186 mp->m_sb.sb_sectsize +
187 mp->m_sb.sb_sectsize +
188 XFS_ALLOCFREE_LOG_RES(mp, 1) +
189 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
77} 190}
78 191
192/*
193 * For removing a directory entry we can modify:
194 * the parent directory inode: inode size
195 * the removed inode: inode size
196 * the directory btree could join: (max depth + v2) * dir block size
197 * the directory bmap btree could join or split: (max depth + v2) * blocksize
198 * And the bmap_finish transaction can free the dir and bmap blocks giving:
199 * the agf for the ag in which the blocks live: 2 * sector size
200 * the agfl for the ag in which the blocks live: 2 * sector size
201 * the superblock for the free block count: sector size
202 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
203 */
79STATIC uint 204STATIC uint
80xfs_calc_remove_reservation(xfs_mount_t *mp) 205xfs_calc_remove_reservation(
206 struct xfs_mount *mp)
81{ 207{
82 return XFS_CALC_REMOVE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 208 return XFS_DQUOT_LOGRES(mp) +
209 MAX((mp->m_sb.sb_inodesize +
210 mp->m_sb.sb_inodesize +
211 XFS_DIROP_LOG_RES(mp) +
212 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
213 (2 * mp->m_sb.sb_sectsize +
214 2 * mp->m_sb.sb_sectsize +
215 mp->m_sb.sb_sectsize +
216 XFS_ALLOCFREE_LOG_RES(mp, 2) +
217 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
83} 218}
84 219
220/*
221 * For symlink we can modify:
222 * the parent directory inode: inode size
223 * the new inode: inode size
224 * the inode btree entry: 1 block
225 * the directory btree: (max depth + v2) * dir block size
226 * the directory inode's bmap btree: (max depth + v2) * block size
227 * the blocks for the symlink: 1 kB
228 * Or in the first xact we allocate some inodes giving:
229 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
230 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
231 * the inode btree: max depth * blocksize
232 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
233 */
85STATIC uint 234STATIC uint
86xfs_calc_symlink_reservation(xfs_mount_t *mp) 235xfs_calc_symlink_reservation(
236 struct xfs_mount *mp)
87{ 237{
88 return XFS_CALC_SYMLINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 238 return XFS_DQUOT_LOGRES(mp) +
239 MAX((mp->m_sb.sb_inodesize +
240 mp->m_sb.sb_inodesize +
241 XFS_FSB_TO_B(mp, 1) +
242 XFS_DIROP_LOG_RES(mp) +
243 1024 +
244 128 * (4 + XFS_DIROP_LOG_COUNT(mp))),
245 (2 * mp->m_sb.sb_sectsize +
246 XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
247 XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
248 XFS_ALLOCFREE_LOG_RES(mp, 1) +
249 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
250 XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
89} 251}
90 252
253/*
254 * For create we can modify:
255 * the parent directory inode: inode size
256 * the new inode: inode size
257 * the inode btree entry: block size
258 * the superblock for the nlink flag: sector size
259 * the directory btree: (max depth + v2) * dir block size
260 * the directory inode's bmap btree: (max depth + v2) * block size
261 * Or in the first xact we allocate some inodes giving:
262 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
263 * the superblock for the nlink flag: sector size
264 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
265 * the inode btree: max depth * blocksize
266 * the allocation btrees: 2 trees * (max depth - 1) * block size
267 */
91STATIC uint 268STATIC uint
92xfs_calc_create_reservation(xfs_mount_t *mp) 269xfs_calc_create_reservation(
270 struct xfs_mount *mp)
93{ 271{
94 return XFS_CALC_CREATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 272 return XFS_DQUOT_LOGRES(mp) +
273 MAX((mp->m_sb.sb_inodesize +
274 mp->m_sb.sb_inodesize +
275 mp->m_sb.sb_sectsize +
276 XFS_FSB_TO_B(mp, 1) +
277 XFS_DIROP_LOG_RES(mp) +
278 128 * (3 + XFS_DIROP_LOG_COUNT(mp))),
279 (3 * mp->m_sb.sb_sectsize +
280 XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
281 XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
282 XFS_ALLOCFREE_LOG_RES(mp, 1) +
283 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
284 XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
95} 285}
96 286
287/*
288 * Making a new directory is the same as creating a new file.
289 */
97STATIC uint 290STATIC uint
98xfs_calc_mkdir_reservation(xfs_mount_t *mp) 291xfs_calc_mkdir_reservation(
292 struct xfs_mount *mp)
99{ 293{
100 return XFS_CALC_MKDIR_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 294 return xfs_calc_create_reservation(mp);
101} 295}
102 296
297/*
298 * In freeing an inode we can modify:
299 * the inode being freed: inode size
300 * the super block free inode counter: sector size
301 * the agi hash list and counters: sector size
302 * the inode btree entry: block size
303 * the on disk inode before ours in the agi hash list: inode cluster size
304 * the inode btree: max depth * blocksize
305 * the allocation btrees: 2 trees * (max depth - 1) * block size
306 */
103STATIC uint 307STATIC uint
104xfs_calc_ifree_reservation(xfs_mount_t *mp) 308xfs_calc_ifree_reservation(
309 struct xfs_mount *mp)
105{ 310{
106 return XFS_CALC_IFREE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 311 return XFS_DQUOT_LOGRES(mp) +
312 mp->m_sb.sb_inodesize +
313 mp->m_sb.sb_sectsize +
314 mp->m_sb.sb_sectsize +
315 XFS_FSB_TO_B(mp, 1) +
316 MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
317 XFS_INODE_CLUSTER_SIZE(mp)) +
318 128 * 5 +
319 XFS_ALLOCFREE_LOG_RES(mp, 1) +
320 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
321 XFS_ALLOCFREE_LOG_COUNT(mp, 1));
107} 322}
108 323
324/*
325 * When only changing the inode we log the inode and possibly the superblock
326 * We also add a bit of slop for the transaction stuff.
327 */
109STATIC uint 328STATIC uint
110xfs_calc_ichange_reservation(xfs_mount_t *mp) 329xfs_calc_ichange_reservation(
330 struct xfs_mount *mp)
111{ 331{
112 return XFS_CALC_ICHANGE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 332 return XFS_DQUOT_LOGRES(mp) +
333 mp->m_sb.sb_inodesize +
334 mp->m_sb.sb_sectsize +
335 512;
336
113} 337}
114 338
339/*
340 * Growing the data section of the filesystem.
341 * superblock
342 * agi and agf
343 * allocation btrees
344 */
115STATIC uint 345STATIC uint
116xfs_calc_growdata_reservation(xfs_mount_t *mp) 346xfs_calc_growdata_reservation(
347 struct xfs_mount *mp)
117{ 348{
118 return XFS_CALC_GROWDATA_LOG_RES(mp); 349 return mp->m_sb.sb_sectsize * 3 +
350 XFS_ALLOCFREE_LOG_RES(mp, 1) +
351 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1));
119} 352}
120 353
354/*
355 * Growing the rt section of the filesystem.
356 * In the first set of transactions (ALLOC) we allocate space to the
357 * bitmap or summary files.
358 * superblock: sector size
359 * agf of the ag from which the extent is allocated: sector size
360 * bmap btree for bitmap/summary inode: max depth * blocksize
361 * bitmap/summary inode: inode size
362 * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
363 */
121STATIC uint 364STATIC uint
122xfs_calc_growrtalloc_reservation(xfs_mount_t *mp) 365xfs_calc_growrtalloc_reservation(
366 struct xfs_mount *mp)
123{ 367{
124 return XFS_CALC_GROWRTALLOC_LOG_RES(mp); 368 return 2 * mp->m_sb.sb_sectsize +
369 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
370 mp->m_sb.sb_inodesize +
371 XFS_ALLOCFREE_LOG_RES(mp, 1) +
372 128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
373 XFS_ALLOCFREE_LOG_COUNT(mp, 1));
125} 374}
126 375
376/*
377 * Growing the rt section of the filesystem.
378 * In the second set of transactions (ZERO) we zero the new metadata blocks.
379 * one bitmap/summary block: blocksize
380 */
127STATIC uint 381STATIC uint
128xfs_calc_growrtzero_reservation(xfs_mount_t *mp) 382xfs_calc_growrtzero_reservation(
383 struct xfs_mount *mp)
129{ 384{
130 return XFS_CALC_GROWRTZERO_LOG_RES(mp); 385 return mp->m_sb.sb_blocksize + 128;
131} 386}
132 387
388/*
389 * Growing the rt section of the filesystem.
390 * In the third set of transactions (FREE) we update metadata without
391 * allocating any new blocks.
392 * superblock: sector size
393 * bitmap inode: inode size
394 * summary inode: inode size
395 * one bitmap block: blocksize
396 * summary blocks: new summary size
397 */
133STATIC uint 398STATIC uint
134xfs_calc_growrtfree_reservation(xfs_mount_t *mp) 399xfs_calc_growrtfree_reservation(
400 struct xfs_mount *mp)
135{ 401{
136 return XFS_CALC_GROWRTFREE_LOG_RES(mp); 402 return mp->m_sb.sb_sectsize +
403 2 * mp->m_sb.sb_inodesize +
404 mp->m_sb.sb_blocksize +
405 mp->m_rsumsize +
406 128 * 5;
137} 407}
138 408
409/*
410 * Logging the inode modification timestamp on a synchronous write.
411 * inode
412 */
139STATIC uint 413STATIC uint
140xfs_calc_swrite_reservation(xfs_mount_t *mp) 414xfs_calc_swrite_reservation(
415 struct xfs_mount *mp)
141{ 416{
142 return XFS_CALC_SWRITE_LOG_RES(mp); 417 return mp->m_sb.sb_inodesize + 128;
143} 418}
144 419
420/*
421 * Logging the inode mode bits when writing a setuid/setgid file
422 * inode
423 */
145STATIC uint 424STATIC uint
146xfs_calc_writeid_reservation(xfs_mount_t *mp) 425xfs_calc_writeid_reservation(xfs_mount_t *mp)
147{ 426{
148 return XFS_CALC_WRITEID_LOG_RES(mp); 427 return mp->m_sb.sb_inodesize + 128;
149} 428}
150 429
430/*
431 * Converting the inode from non-attributed to attributed.
432 * the inode being converted: inode size
433 * agf block and superblock (for block allocation)
434 * the new block (directory sized)
435 * bmap blocks for the new directory block
436 * allocation btrees
437 */
151STATIC uint 438STATIC uint
152xfs_calc_addafork_reservation(xfs_mount_t *mp) 439xfs_calc_addafork_reservation(
440 struct xfs_mount *mp)
153{ 441{
154 return XFS_CALC_ADDAFORK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 442 return XFS_DQUOT_LOGRES(mp) +
443 mp->m_sb.sb_inodesize +
444 mp->m_sb.sb_sectsize * 2 +
445 mp->m_dirblksize +
446 XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) +
447 XFS_ALLOCFREE_LOG_RES(mp, 1) +
448 128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 +
449 XFS_ALLOCFREE_LOG_COUNT(mp, 1));
155} 450}
156 451
452/*
453 * Removing the attribute fork of a file
454 * the inode being truncated: inode size
455 * the inode's bmap btree: max depth * block size
456 * And the bmap_finish transaction can free the blocks and bmap blocks:
457 * the agf for each of the ags: 4 * sector size
458 * the agfl for each of the ags: 4 * sector size
459 * the super block to reflect the freed blocks: sector size
460 * worst case split in allocation btrees per extent assuming 4 extents:
461 * 4 exts * 2 trees * (2 * max depth - 1) * block size
462 */
157STATIC uint 463STATIC uint
158xfs_calc_attrinval_reservation(xfs_mount_t *mp) 464xfs_calc_attrinval_reservation(
465 struct xfs_mount *mp)
159{ 466{
160 return XFS_CALC_ATTRINVAL_LOG_RES(mp); 467 return MAX((mp->m_sb.sb_inodesize +
468 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
469 128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))),
470 (4 * mp->m_sb.sb_sectsize +
471 4 * mp->m_sb.sb_sectsize +
472 mp->m_sb.sb_sectsize +
473 XFS_ALLOCFREE_LOG_RES(mp, 4) +
474 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))));
161} 475}
162 476
477/*
478 * Setting an attribute.
479 * the inode getting the attribute
480 * the superblock for allocations
481 * the agfs extents are allocated from
482 * the attribute btree * max depth
483 * the inode allocation btree
484 * Since attribute transaction space is dependent on the size of the attribute,
485 * the calculation is done partially at mount time and partially at runtime.
486 */
163STATIC uint 487STATIC uint
164xfs_calc_attrset_reservation(xfs_mount_t *mp) 488xfs_calc_attrset_reservation(
489 struct xfs_mount *mp)
165{ 490{
166 return XFS_CALC_ATTRSET_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 491 return XFS_DQUOT_LOGRES(mp) +
492 mp->m_sb.sb_inodesize +
493 mp->m_sb.sb_sectsize +
494 XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
495 128 * (2 + XFS_DA_NODE_MAXDEPTH);
167} 496}
168 497
498/*
499 * Removing an attribute.
500 * the inode: inode size
501 * the attribute btree could join: max depth * block size
502 * the inode bmap btree could join or split: max depth * block size
503 * And the bmap_finish transaction can free the attr blocks freed giving:
504 * the agf for the ag in which the blocks live: 2 * sector size
505 * the agfl for the ag in which the blocks live: 2 * sector size
506 * the superblock for the free block count: sector size
507 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
508 */
169STATIC uint 509STATIC uint
170xfs_calc_attrrm_reservation(xfs_mount_t *mp) 510xfs_calc_attrrm_reservation(
511 struct xfs_mount *mp)
171{ 512{
172 return XFS_CALC_ATTRRM_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 513 return XFS_DQUOT_LOGRES(mp) +
514 MAX((mp->m_sb.sb_inodesize +
515 XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
516 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
517 128 * (1 + XFS_DA_NODE_MAXDEPTH +
518 XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
519 (2 * mp->m_sb.sb_sectsize +
520 2 * mp->m_sb.sb_sectsize +
521 mp->m_sb.sb_sectsize +
522 XFS_ALLOCFREE_LOG_RES(mp, 2) +
523 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
173} 524}
174 525
526/*
527 * Clearing a bad agino number in an agi hash bucket.
528 */
175STATIC uint 529STATIC uint
176xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp) 530xfs_calc_clear_agi_bucket_reservation(
531 struct xfs_mount *mp)
177{ 532{
178 return XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp); 533 return mp->m_sb.sb_sectsize + 128;
179} 534}
180 535
181/* 536/*
@@ -184,11 +539,10 @@ xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp)
184 */ 539 */
185void 540void
186xfs_trans_init( 541xfs_trans_init(
187 xfs_mount_t *mp) 542 struct xfs_mount *mp)
188{ 543{
189 xfs_trans_reservations_t *resp; 544 struct xfs_trans_reservations *resp = &mp->m_reservations;
190 545
191 resp = &(mp->m_reservations);
192 resp->tr_write = xfs_calc_write_reservation(mp); 546 resp->tr_write = xfs_calc_write_reservation(mp);
193 resp->tr_itruncate = xfs_calc_itruncate_reservation(mp); 547 resp->tr_itruncate = xfs_calc_itruncate_reservation(mp);
194 resp->tr_rename = xfs_calc_rename_reservation(mp); 548 resp->tr_rename = xfs_calc_rename_reservation(mp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 8c69e7824f68..e639e8e9a2a9 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -300,24 +300,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
300 300
301 301
302/* 302/*
303 * Various log reservation values.
304 * These are based on the size of the file system block
305 * because that is what most transactions manipulate.
306 * Each adds in an additional 128 bytes per item logged to
307 * try to account for the overhead of the transaction mechanism.
308 *
309 * Note:
310 * Most of the reservations underestimate the number of allocation
311 * groups into which they could free extents in the xfs_bmap_finish()
312 * call. This is because the number in the worst case is quite high
313 * and quite unusual. In order to fix this we need to change
314 * xfs_bmap_finish() to free extents in only a single AG at a time.
315 * This will require changes to the EFI code as well, however, so that
316 * the EFI for the extents not freed is logged again in each transaction.
317 * See bug 261917.
318 */
319
320/*
321 * Per-extent log reservation for the allocation btree changes 303 * Per-extent log reservation for the allocation btree changes
322 * involved in freeing or allocating an extent. 304 * involved in freeing or allocating an extent.
323 * 2 trees * (2 blocks/level * max depth - 1) * block size 305 * 2 trees * (2 blocks/level * max depth - 1) * block size
@@ -341,429 +323,36 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
341 (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ 323 (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
342 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) 324 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
343 325
344/*
345 * In a write transaction we can allocate a maximum of 2
346 * extents. This gives:
347 * the inode getting the new extents: inode size
348 * the inode's bmap btree: max depth * block size
349 * the agfs of the ags from which the extents are allocated: 2 * sector
350 * the superblock free block counter: sector size
351 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
352 * And the bmap_finish transaction can free bmap blocks in a join:
353 * the agfs of the ags containing the blocks: 2 * sector size
354 * the agfls of the ags containing the blocks: 2 * sector size
355 * the super block free block counter: sector size
356 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
357 */
358#define XFS_CALC_WRITE_LOG_RES(mp) \
359 (MAX( \
360 ((mp)->m_sb.sb_inodesize + \
361 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
362 (2 * (mp)->m_sb.sb_sectsize) + \
363 (mp)->m_sb.sb_sectsize + \
364 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
365 (128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))),\
366 ((2 * (mp)->m_sb.sb_sectsize) + \
367 (2 * (mp)->m_sb.sb_sectsize) + \
368 (mp)->m_sb.sb_sectsize + \
369 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
370 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
371 326
372#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write) 327#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write)
373
374/*
375 * In truncating a file we free up to two extents at once. We can modify:
376 * the inode being truncated: inode size
377 * the inode's bmap btree: (max depth + 1) * block size
378 * And the bmap_finish transaction can free the blocks and bmap blocks:
379 * the agf for each of the ags: 4 * sector size
380 * the agfl for each of the ags: 4 * sector size
381 * the super block to reflect the freed blocks: sector size
382 * worst case split in allocation btrees per extent assuming 4 extents:
383 * 4 exts * 2 trees * (2 * max depth - 1) * block size
384 * the inode btree: max depth * blocksize
385 * the allocation btrees: 2 trees * (max depth - 1) * block size
386 */
387#define XFS_CALC_ITRUNCATE_LOG_RES(mp) \
388 (MAX( \
389 ((mp)->m_sb.sb_inodesize + \
390 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + \
391 (128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
392 ((4 * (mp)->m_sb.sb_sectsize) + \
393 (4 * (mp)->m_sb.sb_sectsize) + \
394 (mp)->m_sb.sb_sectsize + \
395 XFS_ALLOCFREE_LOG_RES(mp, 4) + \
396 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \
397 (128 * 5) + \
398 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
399 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
400 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
401
402#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate) 328#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate)
403
404/*
405 * In renaming a files we can modify:
406 * the four inodes involved: 4 * inode size
407 * the two directory btrees: 2 * (max depth + v2) * dir block size
408 * the two directory bmap btrees: 2 * max depth * block size
409 * And the bmap_finish transaction can free dir and bmap blocks (two sets
410 * of bmap blocks) giving:
411 * the agf for the ags in which the blocks live: 3 * sector size
412 * the agfl for the ags in which the blocks live: 3 * sector size
413 * the superblock for the free block count: sector size
414 * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
415 */
416#define XFS_CALC_RENAME_LOG_RES(mp) \
417 (MAX( \
418 ((4 * (mp)->m_sb.sb_inodesize) + \
419 (2 * XFS_DIROP_LOG_RES(mp)) + \
420 (128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp)))), \
421 ((3 * (mp)->m_sb.sb_sectsize) + \
422 (3 * (mp)->m_sb.sb_sectsize) + \
423 (mp)->m_sb.sb_sectsize + \
424 XFS_ALLOCFREE_LOG_RES(mp, 3) + \
425 (128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))))))
426
427#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename) 329#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename)
428
429/*
430 * For creating a link to an inode:
431 * the parent directory inode: inode size
432 * the linked inode: inode size
433 * the directory btree could split: (max depth + v2) * dir block size
434 * the directory bmap btree could join or split: (max depth + v2) * blocksize
435 * And the bmap_finish transaction can free some bmap blocks giving:
436 * the agf for the ag in which the blocks live: sector size
437 * the agfl for the ag in which the blocks live: sector size
438 * the superblock for the free block count: sector size
439 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
440 */
441#define XFS_CALC_LINK_LOG_RES(mp) \
442 (MAX( \
443 ((mp)->m_sb.sb_inodesize + \
444 (mp)->m_sb.sb_inodesize + \
445 XFS_DIROP_LOG_RES(mp) + \
446 (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
447 ((mp)->m_sb.sb_sectsize + \
448 (mp)->m_sb.sb_sectsize + \
449 (mp)->m_sb.sb_sectsize + \
450 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
451 (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
452
453#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link) 330#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link)
454
455/*
456 * For removing a directory entry we can modify:
457 * the parent directory inode: inode size
458 * the removed inode: inode size
459 * the directory btree could join: (max depth + v2) * dir block size
460 * the directory bmap btree could join or split: (max depth + v2) * blocksize
461 * And the bmap_finish transaction can free the dir and bmap blocks giving:
462 * the agf for the ag in which the blocks live: 2 * sector size
463 * the agfl for the ag in which the blocks live: 2 * sector size
464 * the superblock for the free block count: sector size
465 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
466 */
467#define XFS_CALC_REMOVE_LOG_RES(mp) \
468 (MAX( \
469 ((mp)->m_sb.sb_inodesize + \
470 (mp)->m_sb.sb_inodesize + \
471 XFS_DIROP_LOG_RES(mp) + \
472 (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
473 ((2 * (mp)->m_sb.sb_sectsize) + \
474 (2 * (mp)->m_sb.sb_sectsize) + \
475 (mp)->m_sb.sb_sectsize + \
476 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
477 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
478
479#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove) 331#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove)
480
481/*
482 * For symlink we can modify:
483 * the parent directory inode: inode size
484 * the new inode: inode size
485 * the inode btree entry: 1 block
486 * the directory btree: (max depth + v2) * dir block size
487 * the directory inode's bmap btree: (max depth + v2) * block size
488 * the blocks for the symlink: 1 kB
489 * Or in the first xact we allocate some inodes giving:
490 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
491 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
492 * the inode btree: max depth * blocksize
493 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
494 */
495#define XFS_CALC_SYMLINK_LOG_RES(mp) \
496 (MAX( \
497 ((mp)->m_sb.sb_inodesize + \
498 (mp)->m_sb.sb_inodesize + \
499 XFS_FSB_TO_B(mp, 1) + \
500 XFS_DIROP_LOG_RES(mp) + \
501 1024 + \
502 (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
503 (2 * (mp)->m_sb.sb_sectsize + \
504 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
505 XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
506 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
507 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
508 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
509
510#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink) 332#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
511
512/*
513 * For create we can modify:
514 * the parent directory inode: inode size
515 * the new inode: inode size
516 * the inode btree entry: block size
517 * the superblock for the nlink flag: sector size
518 * the directory btree: (max depth + v2) * dir block size
519 * the directory inode's bmap btree: (max depth + v2) * block size
520 * Or in the first xact we allocate some inodes giving:
521 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
522 * the superblock for the nlink flag: sector size
523 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
524 * the inode btree: max depth * blocksize
525 * the allocation btrees: 2 trees * (max depth - 1) * block size
526 */
527#define XFS_CALC_CREATE_LOG_RES(mp) \
528 (MAX( \
529 ((mp)->m_sb.sb_inodesize + \
530 (mp)->m_sb.sb_inodesize + \
531 (mp)->m_sb.sb_sectsize + \
532 XFS_FSB_TO_B(mp, 1) + \
533 XFS_DIROP_LOG_RES(mp) + \
534 (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
535 (3 * (mp)->m_sb.sb_sectsize + \
536 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
537 XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
538 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
539 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
540 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
541
542#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create) 333#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create)
543
544/*
545 * Making a new directory is the same as creating a new file.
546 */
547#define XFS_CALC_MKDIR_LOG_RES(mp) XFS_CALC_CREATE_LOG_RES(mp)
548
549#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir) 334#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir)
550
551/*
552 * In freeing an inode we can modify:
553 * the inode being freed: inode size
554 * the super block free inode counter: sector size
555 * the agi hash list and counters: sector size
556 * the inode btree entry: block size
557 * the on disk inode before ours in the agi hash list: inode cluster size
558 * the inode btree: max depth * blocksize
559 * the allocation btrees: 2 trees * (max depth - 1) * block size
560 */
561#define XFS_CALC_IFREE_LOG_RES(mp) \
562 ((mp)->m_sb.sb_inodesize + \
563 (mp)->m_sb.sb_sectsize + \
564 (mp)->m_sb.sb_sectsize + \
565 XFS_FSB_TO_B((mp), 1) + \
566 MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
567 (128 * 5) + \
568 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
569 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
570 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
571
572
573#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree) 335#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree)
574
575/*
576 * When only changing the inode we log the inode and possibly the superblock
577 * We also add a bit of slop for the transaction stuff.
578 */
579#define XFS_CALC_ICHANGE_LOG_RES(mp) ((mp)->m_sb.sb_inodesize + \
580 (mp)->m_sb.sb_sectsize + 512)
581
582#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange) 336#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange)
583
584/*
585 * Growing the data section of the filesystem.
586 * superblock
587 * agi and agf
588 * allocation btrees
589 */
590#define XFS_CALC_GROWDATA_LOG_RES(mp) \
591 ((mp)->m_sb.sb_sectsize * 3 + \
592 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
593 (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
594
595#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata) 337#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata)
596
597/*
598 * Growing the rt section of the filesystem.
599 * In the first set of transactions (ALLOC) we allocate space to the
600 * bitmap or summary files.
601 * superblock: sector size
602 * agf of the ag from which the extent is allocated: sector size
603 * bmap btree for bitmap/summary inode: max depth * blocksize
604 * bitmap/summary inode: inode size
605 * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
606 */
607#define XFS_CALC_GROWRTALLOC_LOG_RES(mp) \
608 (2 * (mp)->m_sb.sb_sectsize + \
609 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
610 (mp)->m_sb.sb_inodesize + \
611 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
612 (128 * \
613 (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + \
614 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
615
616#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc) 338#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc)
617
618/*
619 * Growing the rt section of the filesystem.
620 * In the second set of transactions (ZERO) we zero the new metadata blocks.
621 * one bitmap/summary block: blocksize
622 */
623#define XFS_CALC_GROWRTZERO_LOG_RES(mp) \
624 ((mp)->m_sb.sb_blocksize + 128)
625
626#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero) 339#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero)
627
628/*
629 * Growing the rt section of the filesystem.
630 * In the third set of transactions (FREE) we update metadata without
631 * allocating any new blocks.
632 * superblock: sector size
633 * bitmap inode: inode size
634 * summary inode: inode size
635 * one bitmap block: blocksize
636 * summary blocks: new summary size
637 */
638#define XFS_CALC_GROWRTFREE_LOG_RES(mp) \
639 ((mp)->m_sb.sb_sectsize + \
640 2 * (mp)->m_sb.sb_inodesize + \
641 (mp)->m_sb.sb_blocksize + \
642 (mp)->m_rsumsize + \
643 (128 * 5))
644
645#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree) 340#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree)
646
647/*
648 * Logging the inode modification timestamp on a synchronous write.
649 * inode
650 */
651#define XFS_CALC_SWRITE_LOG_RES(mp) \
652 ((mp)->m_sb.sb_inodesize + 128)
653
654#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) 341#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
655
656/* 342/*
657 * Logging the inode timestamps on an fsync -- same as SWRITE 343 * Logging the inode timestamps on an fsync -- same as SWRITE
658 * as long as SWRITE logs the entire inode core 344 * as long as SWRITE logs the entire inode core
659 */ 345 */
660#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) 346#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
661
662/*
663 * Logging the inode mode bits when writing a setuid/setgid file
664 * inode
665 */
666#define XFS_CALC_WRITEID_LOG_RES(mp) \
667 ((mp)->m_sb.sb_inodesize + 128)
668
669#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) 347#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
670
671/*
672 * Converting the inode from non-attributed to attributed.
673 * the inode being converted: inode size
674 * agf block and superblock (for block allocation)
675 * the new block (directory sized)
676 * bmap blocks for the new directory block
677 * allocation btrees
678 */
679#define XFS_CALC_ADDAFORK_LOG_RES(mp) \
680 ((mp)->m_sb.sb_inodesize + \
681 (mp)->m_sb.sb_sectsize * 2 + \
682 (mp)->m_dirblksize + \
683 XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1)) + \
684 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
685 (128 * (4 + (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
686 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
687
688#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork) 348#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork)
689
690/*
691 * Removing the attribute fork of a file
692 * the inode being truncated: inode size
693 * the inode's bmap btree: max depth * block size
694 * And the bmap_finish transaction can free the blocks and bmap blocks:
695 * the agf for each of the ags: 4 * sector size
696 * the agfl for each of the ags: 4 * sector size
697 * the super block to reflect the freed blocks: sector size
698 * worst case split in allocation btrees per extent assuming 4 extents:
699 * 4 exts * 2 trees * (2 * max depth - 1) * block size
700 */
701#define XFS_CALC_ATTRINVAL_LOG_RES(mp) \
702 (MAX( \
703 ((mp)->m_sb.sb_inodesize + \
704 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
705 (128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))), \
706 ((4 * (mp)->m_sb.sb_sectsize) + \
707 (4 * (mp)->m_sb.sb_sectsize) + \
708 (mp)->m_sb.sb_sectsize + \
709 XFS_ALLOCFREE_LOG_RES(mp, 4) + \
710 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))))))
711
712#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval) 349#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval)
713
714/*
715 * Setting an attribute.
716 * the inode getting the attribute
717 * the superblock for allocations
718 * the agfs extents are allocated from
719 * the attribute btree * max depth
720 * the inode allocation btree
721 * Since attribute transaction space is dependent on the size of the attribute,
722 * the calculation is done partially at mount time and partially at runtime.
723 */
724#define XFS_CALC_ATTRSET_LOG_RES(mp) \
725 ((mp)->m_sb.sb_inodesize + \
726 (mp)->m_sb.sb_sectsize + \
727 XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
728 (128 * (2 + XFS_DA_NODE_MAXDEPTH)))
729
730#define XFS_ATTRSET_LOG_RES(mp, ext) \ 350#define XFS_ATTRSET_LOG_RES(mp, ext) \
731 ((mp)->m_reservations.tr_attrset + \ 351 ((mp)->m_reservations.tr_attrset + \
732 (ext * (mp)->m_sb.sb_sectsize) + \ 352 (ext * (mp)->m_sb.sb_sectsize) + \
733 (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \ 353 (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \
734 (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))))) 354 (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))))
735
736/*
737 * Removing an attribute.
738 * the inode: inode size
739 * the attribute btree could join: max depth * block size
740 * the inode bmap btree could join or split: max depth * block size
741 * And the bmap_finish transaction can free the attr blocks freed giving:
742 * the agf for the ag in which the blocks live: 2 * sector size
743 * the agfl for the ag in which the blocks live: 2 * sector size
744 * the superblock for the free block count: sector size
745 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
746 */
747#define XFS_CALC_ATTRRM_LOG_RES(mp) \
748 (MAX( \
749 ((mp)->m_sb.sb_inodesize + \
750 XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
751 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
752 (128 * (1 + XFS_DA_NODE_MAXDEPTH + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
753 ((2 * (mp)->m_sb.sb_sectsize) + \
754 (2 * (mp)->m_sb.sb_sectsize) + \
755 (mp)->m_sb.sb_sectsize + \
756 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
757 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
758
759#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm) 355#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm)
760
761/*
762 * Clearing a bad agino number in an agi hash bucket.
763 */
764#define XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp) \
765 ((mp)->m_sb.sb_sectsize + 128)
766
767#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi) 356#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi)
768 357
769 358
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 9d376be0ea38..a06bd62504fc 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -267,7 +267,7 @@ xfs_setattr(
267 if (code) { 267 if (code) {
268 ASSERT(tp == NULL); 268 ASSERT(tp == NULL);
269 lock_flags &= ~XFS_ILOCK_EXCL; 269 lock_flags &= ~XFS_ILOCK_EXCL;
270 ASSERT(lock_flags == XFS_IOLOCK_EXCL); 270 ASSERT(lock_flags == XFS_IOLOCK_EXCL || !need_iolock);
271 goto error_return; 271 goto error_return;
272 } 272 }
273 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); 273 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);