aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorJens Axboe <jaxboe@fusionio.com>2010-06-01 06:42:12 -0400
committerJens Axboe <jaxboe@fusionio.com>2010-06-01 06:42:12 -0400
commitb4ca761577535b2b4d153689ee97342797dfff05 (patch)
tree29054d55508f1faa22ec32acf7c245751af03348 /fs
parent28f4197e5d4707311febeec8a0eb97cb5fd93c97 (diff)
parent67a3e12b05e055c0415c556a315a3d3eb637e29e (diff)
Merge branch 'master' into for-linus
Conflicts: fs/pipe.c Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs_vfs.h2
-rw-r--r--fs/9p/vfs_dir.c8
-rw-r--r--fs/9p/vfs_file.c17
-rw-r--r--fs/9p/vfs_inode.c107
-rw-r--r--fs/9p/vfs_super.c55
-rw-r--r--fs/adfs/dir.c2
-rw-r--r--fs/adfs/file.c2
-rw-r--r--fs/adfs/inode.c3
-rw-r--r--fs/affs/affs.h2
-rw-r--r--fs/affs/file.c4
-rw-r--r--fs/affs/namei.c2
-rw-r--r--fs/afs/internal.h2
-rw-r--r--fs/afs/write.c3
-rw-r--r--fs/aio.c71
-rw-r--r--fs/anon_inodes.c2
-rw-r--r--fs/attr.c50
-rw-r--r--fs/autofs/root.c1
-rw-r--r--fs/autofs4/dev-ioctl.c18
-rw-r--r--fs/autofs4/root.c22
-rw-r--r--fs/bad_inode.c3
-rw-r--r--fs/bfs/dir.c2
-rw-r--r--fs/block_dev.c16
-rw-r--r--fs/btrfs/async-thread.c1
-rw-r--r--fs/btrfs/btrfs_inode.h3
-rw-r--r--fs/btrfs/ctree.c109
-rw-r--r--fs/btrfs/ctree.h165
-rw-r--r--fs/btrfs/delayed-ref.c101
-rw-r--r--fs/btrfs/delayed-ref.h3
-rw-r--r--fs/btrfs/disk-io.c169
-rw-r--r--fs/btrfs/disk-io.h4
-rw-r--r--fs/btrfs/extent-tree.c2255
-rw-r--r--fs/btrfs/extent_io.c85
-rw-r--r--fs/btrfs/extent_io.h14
-rw-r--r--fs/btrfs/file-item.c28
-rw-r--r--fs/btrfs/file.c169
-rw-r--r--fs/btrfs/inode-item.c27
-rw-r--r--fs/btrfs/inode.c1713
-rw-r--r--fs/btrfs/ioctl.c206
-rw-r--r--fs/btrfs/ordered-data.c82
-rw-r--r--fs/btrfs/ordered-data.h9
-rw-r--r--fs/btrfs/relocation.c1971
-rw-r--r--fs/btrfs/root-tree.c23
-rw-r--r--fs/btrfs/super.c35
-rw-r--r--fs/btrfs/transaction.c232
-rw-r--r--fs/btrfs/transaction.h24
-rw-r--r--fs/btrfs/tree-defrag.c7
-rw-r--r--fs/btrfs/tree-log.c241
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/volumes.c17
-rw-r--r--fs/btrfs/xattr.c12
-rw-r--r--fs/buffer.c123
-rw-r--r--fs/ceph/addr.c11
-rw-r--r--fs/ceph/auth.c8
-rw-r--r--fs/ceph/auth.h8
-rw-r--r--fs/ceph/auth_none.c9
-rw-r--r--fs/ceph/auth_x.c31
-rw-r--r--fs/ceph/caps.c28
-rw-r--r--fs/ceph/ceph_fs.h83
-rw-r--r--fs/ceph/ceph_strings.c16
-rw-r--r--fs/ceph/debugfs.c13
-rw-r--r--fs/ceph/dir.c52
-rw-r--r--fs/ceph/export.c16
-rw-r--r--fs/ceph/file.c18
-rw-r--r--fs/ceph/inode.c99
-rw-r--r--fs/ceph/ioctl.c2
-rw-r--r--fs/ceph/mds_client.c402
-rw-r--r--fs/ceph/mds_client.h6
-rw-r--r--fs/ceph/messenger.c97
-rw-r--r--fs/ceph/messenger.h11
-rw-r--r--fs/ceph/mon_client.c262
-rw-r--r--fs/ceph/mon_client.h27
-rw-r--r--fs/ceph/msgpool.c180
-rw-r--r--fs/ceph/msgpool.h12
-rw-r--r--fs/ceph/msgr.h21
-rw-r--r--fs/ceph/osd_client.c105
-rw-r--r--fs/ceph/osdmap.c2
-rw-r--r--fs/ceph/pagelist.c2
-rw-r--r--fs/ceph/rados.h23
-rw-r--r--fs/ceph/snap.c2
-rw-r--r--fs/ceph/super.c137
-rw-r--r--fs/ceph/super.h33
-rw-r--r--fs/ceph/xattr.c35
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/file.c4
-rw-r--r--fs/coda/coda_int.h3
-rw-r--r--fs/coda/file.c4
-rw-r--r--fs/coda/pioctl.c76
-rw-r--r--fs/coda/psdev.c5
-rw-r--r--fs/compat.c132
-rw-r--r--fs/configfs/inode.c9
-rw-r--r--fs/debugfs/file.c21
-rw-r--r--fs/direct-io.c123
-rw-r--r--fs/ecryptfs/file.c2
-rw-r--r--fs/ecryptfs/inode.c4
-rw-r--r--fs/exec.c202
-rw-r--r--fs/exofs/dir.c2
-rw-r--r--fs/exofs/file.c7
-rw-r--r--fs/exofs/inode.c30
-rw-r--r--fs/ext2/ext2.h3
-rw-r--r--fs/ext2/file.c7
-rw-r--r--fs/ext2/inode.c153
-rw-r--r--fs/ext2/super.c20
-rw-r--r--fs/ext3/dir.c2
-rw-r--r--fs/ext3/fsync.c4
-rw-r--r--fs/ext3/super.c38
-rw-r--r--fs/ext4/balloc.c5
-rw-r--r--fs/ext4/block_validity.c4
-rw-r--r--fs/ext4/dir.c26
-rw-r--r--fs/ext4/ext4.h169
-rw-r--r--fs/ext4/ext4_jbd2.h8
-rw-r--r--fs/ext4/extents.c417
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/fsync.c41
-rw-r--r--fs/ext4/ialloc.c89
-rw-r--r--fs/ext4/inode.c723
-rw-r--r--fs/ext4/ioctl.c27
-rw-r--r--fs/ext4/mballoc.c120
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/move_extent.c13
-rw-r--r--fs/ext4/namei.c61
-rw-r--r--fs/ext4/resize.c3
-rw-r--r--fs/ext4/super.c117
-rw-r--r--fs/ext4/symlink.c2
-rw-r--r--fs/ext4/xattr.c39
-rw-r--r--fs/fat/cache.c13
-rw-r--r--fs/fat/dir.c28
-rw-r--r--fs/fat/fat.h22
-rw-r--r--fs/fat/file.c59
-rw-r--r--fs/fat/inode.c43
-rw-r--r--fs/fat/misc.c22
-rw-r--r--fs/file_table.c21
-rw-r--r--fs/freevxfs/vxfs_lookup.c2
-rw-r--r--fs/fs-writeback.c4
-rw-r--r--fs/fscache/object-list.c2
-rw-r--r--fs/fuse/dev.c528
-rw-r--r--fs/fuse/dir.c5
-rw-r--r--fs/fuse/file.c48
-rw-r--r--fs/fuse/fuse_i.h6
-rw-r--r--fs/gfs2/acl.c4
-rw-r--r--fs/gfs2/aops.c8
-rw-r--r--fs/gfs2/file.c11
-rw-r--r--fs/gfs2/inode.c54
-rw-r--r--fs/gfs2/inode.h3
-rw-r--r--fs/gfs2/log.c2
-rw-r--r--fs/gfs2/log.h29
-rw-r--r--fs/gfs2/ops_inode.c5
-rw-r--r--fs/gfs2/rgrp.c20
-rw-r--r--fs/hfsplus/dir.c2
-rw-r--r--fs/hfsplus/hfsplus_fs.h3
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/hfsplus/ioctl.c12
-rw-r--r--fs/hostfs/hostfs_kern.c4
-rw-r--r--fs/hpfs/file.c4
-rw-r--r--fs/hpfs/hpfs_fn.h2
-rw-r--r--fs/hppfs/hppfs.c2
-rw-r--r--fs/hugetlbfs/inode.c2
-rw-r--r--fs/isofs/dir.c1
-rw-r--r--fs/jbd2/transaction.c5
-rw-r--r--fs/jffs2/file.c4
-rw-r--r--fs/jffs2/fs.c4
-rw-r--r--fs/jffs2/os-linux.h2
-rw-r--r--fs/jfs/file.c4
-rw-r--r--fs/jfs/jfs_inode.h2
-rw-r--r--fs/jfs/super.c16
-rw-r--r--fs/libfs.c108
-rw-r--r--fs/logfs/file.c4
-rw-r--r--fs/logfs/logfs.h2
-rw-r--r--fs/minix/dir.c7
-rw-r--r--fs/minix/file.c2
-rw-r--r--fs/minix/itree_v2.c27
-rw-r--r--fs/namei.c2
-rw-r--r--fs/ncpfs/dir.c3
-rw-r--r--fs/ncpfs/file.c4
-rw-r--r--fs/ncpfs/ioctl.c27
-rw-r--r--fs/nfs/dir.c7
-rw-r--r--fs/nfs/file.c5
-rw-r--r--fs/nfs/super.c4
-rw-r--r--fs/nfs/write.c20
-rw-r--r--fs/nfsd/nfsctl.c4
-rw-r--r--fs/nilfs2/file.c4
-rw-r--r--fs/nilfs2/nilfs.h2
-rw-r--r--fs/ntfs/dir.c5
-rw-r--r--fs/ntfs/file.c37
-rw-r--r--fs/ocfs2/blockcheck.c4
-rw-r--r--fs/ocfs2/file.c15
-rw-r--r--fs/ocfs2/super.c50
-rw-r--r--fs/omfs/file.c2
-rw-r--r--fs/partitions/ldm.c18
-rw-r--r--fs/pipe.c21
-rw-r--r--fs/proc/array.c4
-rw-r--r--fs/proc/base.c16
-rw-r--r--fs/proc/generic.c15
-rw-r--r--fs/proc/kcore.c2
-rw-r--r--fs/proc/root.c1
-rw-r--r--fs/proc/task_mmu.c4
-rw-r--r--fs/qnx4/dir.c3
-rw-r--r--fs/quota/dquot.c201
-rw-r--r--fs/quota/quota.c4
-rw-r--r--fs/ramfs/file-mmu.c3
-rw-r--r--fs/ramfs/file-nommu.c9
-rw-r--r--fs/read_write.c17
-rw-r--r--fs/reiserfs/dir.c9
-rw-r--r--fs/reiserfs/file.c5
-rw-r--r--fs/reiserfs/super.c48
-rw-r--r--fs/smbfs/dir.c3
-rw-r--r--fs/smbfs/file.c5
-rw-r--r--fs/smbfs/inode.c2
-rw-r--r--fs/smbfs/ioctl.c10
-rw-r--r--fs/smbfs/proto.h2
-rw-r--r--fs/smbfs/symlink.c1
-rw-r--r--fs/squashfs/Kconfig11
-rw-r--r--fs/squashfs/Makefile2
-rw-r--r--fs/squashfs/inode.c92
-rw-r--r--fs/squashfs/namei.c6
-rw-r--r--fs/squashfs/squashfs.h12
-rw-r--r--fs/squashfs/squashfs_fs.h76
-rw-r--r--fs/squashfs/squashfs_fs_i.h3
-rw-r--r--fs/squashfs/squashfs_fs_sb.h3
-rw-r--r--fs/squashfs/super.c30
-rw-r--r--fs/squashfs/symlink.c11
-rw-r--r--fs/squashfs/xattr.c323
-rw-r--r--fs/squashfs/xattr.h46
-rw-r--r--fs/squashfs/xattr_id.c100
-rw-r--r--fs/super.c17
-rw-r--r--fs/sync.c8
-rw-r--r--fs/sysfs/inode.c8
-rw-r--r--fs/sysv/dir.c2
-rw-r--r--fs/sysv/file.c2
-rw-r--r--fs/sysv/inode.c1
-rw-r--r--fs/ubifs/file.c17
-rw-r--r--fs/ubifs/ubifs.h4
-rw-r--r--fs/udf/balloc.c43
-rw-r--r--fs/udf/dir.c3
-rw-r--r--fs/udf/file.c28
-rw-r--r--fs/udf/ialloc.c21
-rw-r--r--fs/udf/inode.c5
-rw-r--r--fs/udf/namei.c20
-rw-r--r--fs/udf/super.c13
-rw-r--r--fs/udf/udfdecl.h1
-rw-r--r--fs/ufs/balloc.c24
-rw-r--r--fs/ufs/dir.c2
-rw-r--r--fs/ufs/file.c5
-rw-r--r--fs/ufs/ialloc.c13
-rw-r--r--fs/ufs/inode.c4
-rw-r--r--fs/ufs/namei.c16
-rw-r--r--fs/ufs/super.c112
-rw-r--r--fs/ufs/truncate.c20
-rw-r--r--fs/ufs/ufs_fs.h1
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c10
-rw-r--r--fs/xfs/linux-2.6/xfs_quotaops.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c12
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h83
-rw-r--r--fs/xfs/quota/xfs_dquot.c6
-rw-r--r--fs/xfs/xfs_ag.h24
-rw-r--r--fs/xfs/xfs_alloc.c357
-rw-r--r--fs/xfs/xfs_alloc.h7
-rw-r--r--fs/xfs/xfs_alloc_btree.c2
-rw-r--r--fs/xfs/xfs_buf_item.c166
-rw-r--r--fs/xfs/xfs_buf_item.h18
-rw-r--r--fs/xfs/xfs_error.c2
-rw-r--r--fs/xfs/xfs_log.c120
-rw-r--r--fs/xfs/xfs_log.h14
-rw-r--r--fs/xfs/xfs_log_cil.c725
-rw-r--r--fs/xfs/xfs_log_priv.h118
-rw-r--r--fs/xfs/xfs_log_recover.c46
-rw-r--r--fs/xfs/xfs_log_recover.h2
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_trans.c144
-rw-r--r--fs/xfs/xfs_trans.h44
-rw-r--r--fs/xfs/xfs_trans_buf.c46
-rw-r--r--fs/xfs/xfs_trans_item.c114
-rw-r--r--fs/xfs/xfs_trans_priv.h15
-rw-r--r--fs/xfs/xfs_types.h2
275 files changed, 11458 insertions, 6190 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index ed835836e0dc..32ef4009d030 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -40,7 +40,9 @@
40extern struct file_system_type v9fs_fs_type; 40extern struct file_system_type v9fs_fs_type;
41extern const struct address_space_operations v9fs_addr_operations; 41extern const struct address_space_operations v9fs_addr_operations;
42extern const struct file_operations v9fs_file_operations; 42extern const struct file_operations v9fs_file_operations;
43extern const struct file_operations v9fs_file_operations_dotl;
43extern const struct file_operations v9fs_dir_operations; 44extern const struct file_operations v9fs_dir_operations;
45extern const struct file_operations v9fs_dir_operations_dotl;
44extern const struct dentry_operations v9fs_dentry_operations; 46extern const struct dentry_operations v9fs_dentry_operations;
45extern const struct dentry_operations v9fs_cached_dentry_operations; 47extern const struct dentry_operations v9fs_cached_dentry_operations;
46 48
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 0adfd64dfcee..d61e3b28ce37 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -203,3 +203,11 @@ const struct file_operations v9fs_dir_operations = {
203 .open = v9fs_file_open, 203 .open = v9fs_file_open,
204 .release = v9fs_dir_release, 204 .release = v9fs_dir_release,
205}; 205};
206
207const struct file_operations v9fs_dir_operations_dotl = {
208 .read = generic_read_dir,
209 .llseek = generic_file_llseek,
210 .readdir = v9fs_dir_readdir,
211 .open = v9fs_file_open,
212 .release = v9fs_dir_release,
213};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index df52d488d2a6..2bedc6c94fc2 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -257,15 +257,13 @@ v9fs_file_write(struct file *filp, const char __user * data,
257 return total; 257 return total;
258} 258}
259 259
260static int v9fs_file_fsync(struct file *filp, struct dentry *dentry, 260static int v9fs_file_fsync(struct file *filp, int datasync)
261 int datasync)
262{ 261{
263 struct p9_fid *fid; 262 struct p9_fid *fid;
264 struct p9_wstat wstat; 263 struct p9_wstat wstat;
265 int retval; 264 int retval;
266 265
267 P9_DPRINTK(P9_DEBUG_VFS, "filp %p dentry %p datasync %x\n", filp, 266 P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
268 dentry, datasync);
269 267
270 fid = filp->private_data; 268 fid = filp->private_data;
271 v9fs_blank_wstat(&wstat); 269 v9fs_blank_wstat(&wstat);
@@ -296,3 +294,14 @@ const struct file_operations v9fs_file_operations = {
296 .mmap = generic_file_readonly_mmap, 294 .mmap = generic_file_readonly_mmap,
297 .fsync = v9fs_file_fsync, 295 .fsync = v9fs_file_fsync,
298}; 296};
297
298const struct file_operations v9fs_file_operations_dotl = {
299 .llseek = generic_file_llseek,
300 .read = v9fs_file_read,
301 .write = v9fs_file_write,
302 .open = v9fs_file_open,
303 .release = v9fs_dir_release,
304 .lock = v9fs_file_lock,
305 .mmap = generic_file_readonly_mmap,
306 .fsync = v9fs_file_fsync,
307};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 6d4d86187c55..4331b3b5ee1c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -44,9 +44,12 @@
44#include "cache.h" 44#include "cache.h"
45 45
46static const struct inode_operations v9fs_dir_inode_operations; 46static const struct inode_operations v9fs_dir_inode_operations;
47static const struct inode_operations v9fs_dir_inode_operations_ext; 47static const struct inode_operations v9fs_dir_inode_operations_dotu;
48static const struct inode_operations v9fs_dir_inode_operations_dotl;
48static const struct inode_operations v9fs_file_inode_operations; 49static const struct inode_operations v9fs_file_inode_operations;
50static const struct inode_operations v9fs_file_inode_operations_dotl;
49static const struct inode_operations v9fs_symlink_inode_operations; 51static const struct inode_operations v9fs_symlink_inode_operations;
52static const struct inode_operations v9fs_symlink_inode_operations_dotl;
50 53
51/** 54/**
52 * unixmode2p9mode - convert unix mode bits to plan 9 55 * unixmode2p9mode - convert unix mode bits to plan 9
@@ -273,25 +276,44 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
273 init_special_inode(inode, inode->i_mode, inode->i_rdev); 276 init_special_inode(inode, inode->i_mode, inode->i_rdev);
274 break; 277 break;
275 case S_IFREG: 278 case S_IFREG:
276 inode->i_op = &v9fs_file_inode_operations; 279 if (v9fs_proto_dotl(v9ses)) {
277 inode->i_fop = &v9fs_file_operations; 280 inode->i_op = &v9fs_file_inode_operations_dotl;
281 inode->i_fop = &v9fs_file_operations_dotl;
282 } else {
283 inode->i_op = &v9fs_file_inode_operations;
284 inode->i_fop = &v9fs_file_operations;
285 }
286
278 break; 287 break;
288
279 case S_IFLNK: 289 case S_IFLNK:
280 if (!v9fs_proto_dotu(v9ses)) { 290 if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
281 P9_DPRINTK(P9_DEBUG_ERROR, 291 P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
282 "extended modes used w/o 9P2000.u\n"); 292 "legacy protocol.\n");
283 err = -EINVAL; 293 err = -EINVAL;
284 goto error; 294 goto error;
285 } 295 }
286 inode->i_op = &v9fs_symlink_inode_operations; 296
297 if (v9fs_proto_dotl(v9ses))
298 inode->i_op = &v9fs_symlink_inode_operations_dotl;
299 else
300 inode->i_op = &v9fs_symlink_inode_operations;
301
287 break; 302 break;
288 case S_IFDIR: 303 case S_IFDIR:
289 inc_nlink(inode); 304 inc_nlink(inode);
290 if (v9fs_proto_dotu(v9ses)) 305 if (v9fs_proto_dotl(v9ses))
291 inode->i_op = &v9fs_dir_inode_operations_ext; 306 inode->i_op = &v9fs_dir_inode_operations_dotl;
307 else if (v9fs_proto_dotu(v9ses))
308 inode->i_op = &v9fs_dir_inode_operations_dotu;
292 else 309 else
293 inode->i_op = &v9fs_dir_inode_operations; 310 inode->i_op = &v9fs_dir_inode_operations;
294 inode->i_fop = &v9fs_dir_operations; 311
312 if (v9fs_proto_dotl(v9ses))
313 inode->i_fop = &v9fs_dir_operations_dotl;
314 else
315 inode->i_fop = &v9fs_dir_operations;
316
295 break; 317 break;
296 default: 318 default:
297 P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n", 319 P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
@@ -432,14 +454,12 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
432{ 454{
433 int retval; 455 int retval;
434 struct inode *file_inode; 456 struct inode *file_inode;
435 struct v9fs_session_info *v9ses;
436 struct p9_fid *v9fid; 457 struct p9_fid *v9fid;
437 458
438 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file, 459 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
439 rmdir); 460 rmdir);
440 461
441 file_inode = file->d_inode; 462 file_inode = file->d_inode;
442 v9ses = v9fs_inode2v9ses(file_inode);
443 v9fid = v9fs_fid_clone(file); 463 v9fid = v9fs_fid_clone(file);
444 if (IS_ERR(v9fid)) 464 if (IS_ERR(v9fid))
445 return PTR_ERR(v9fid); 465 return PTR_ERR(v9fid);
@@ -482,12 +502,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
482 ofid = NULL; 502 ofid = NULL;
483 fid = NULL; 503 fid = NULL;
484 name = (char *) dentry->d_name.name; 504 name = (char *) dentry->d_name.name;
485 dfid = v9fs_fid_clone(dentry->d_parent); 505 dfid = v9fs_fid_lookup(dentry->d_parent);
486 if (IS_ERR(dfid)) { 506 if (IS_ERR(dfid)) {
487 err = PTR_ERR(dfid); 507 err = PTR_ERR(dfid);
488 P9_DPRINTK(P9_DEBUG_VFS, "fid clone failed %d\n", err); 508 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
489 dfid = NULL; 509 return ERR_PTR(err);
490 goto error;
491 } 510 }
492 511
493 /* clone a fid to use for creation */ 512 /* clone a fid to use for creation */
@@ -495,8 +514,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
495 if (IS_ERR(ofid)) { 514 if (IS_ERR(ofid)) {
496 err = PTR_ERR(ofid); 515 err = PTR_ERR(ofid);
497 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); 516 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
498 ofid = NULL; 517 return ERR_PTR(err);
499 goto error;
500 } 518 }
501 519
502 err = p9_client_fcreate(ofid, name, perm, mode, extension); 520 err = p9_client_fcreate(ofid, name, perm, mode, extension);
@@ -506,14 +524,13 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
506 } 524 }
507 525
508 /* now walk from the parent so we can get unopened fid */ 526 /* now walk from the parent so we can get unopened fid */
509 fid = p9_client_walk(dfid, 1, &name, 0); 527 fid = p9_client_walk(dfid, 1, &name, 1);
510 if (IS_ERR(fid)) { 528 if (IS_ERR(fid)) {
511 err = PTR_ERR(fid); 529 err = PTR_ERR(fid);
512 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); 530 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
513 fid = NULL; 531 fid = NULL;
514 goto error; 532 goto error;
515 } else 533 }
516 dfid = NULL;
517 534
518 /* instantiate inode and assign the unopened fid to the dentry */ 535 /* instantiate inode and assign the unopened fid to the dentry */
519 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 536 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
@@ -536,9 +553,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
536 return ofid; 553 return ofid;
537 554
538error: 555error:
539 if (dfid)
540 p9_client_clunk(dfid);
541
542 if (ofid) 556 if (ofid)
543 p9_client_clunk(ofid); 557 p9_client_clunk(ofid);
544 558
@@ -673,8 +687,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
673 if (IS_ERR(fid)) { 687 if (IS_ERR(fid)) {
674 result = PTR_ERR(fid); 688 result = PTR_ERR(fid);
675 if (result == -ENOENT) { 689 if (result == -ENOENT) {
676 d_add(dentry, NULL); 690 inode = NULL;
677 return NULL; 691 goto inst_out;
678 } 692 }
679 693
680 return ERR_PTR(result); 694 return ERR_PTR(result);
@@ -691,7 +705,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
691 if (result < 0) 705 if (result < 0)
692 goto error; 706 goto error;
693 707
694 if ((fid->qid.version) && (v9ses->cache)) 708inst_out:
709 if (v9ses->cache)
695 dentry->d_op = &v9fs_cached_dentry_operations; 710 dentry->d_op = &v9fs_cached_dentry_operations;
696 else 711 else
697 dentry->d_op = &v9fs_dentry_operations; 712 dentry->d_op = &v9fs_dentry_operations;
@@ -770,6 +785,13 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
770 goto clunk_olddir; 785 goto clunk_olddir;
771 } 786 }
772 787
788 if (v9fs_proto_dotl(v9ses)) {
789 retval = p9_client_rename(oldfid, newdirfid,
790 (char *) new_dentry->d_name.name);
791 if (retval != -ENOSYS)
792 goto clunk_newdir;
793 }
794
773 /* 9P can only handle file rename in the same directory */ 795 /* 9P can only handle file rename in the same directory */
774 if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) { 796 if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) {
775 P9_DPRINTK(P9_DEBUG_ERROR, 797 P9_DPRINTK(P9_DEBUG_ERROR,
@@ -1195,6 +1217,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1195 sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev)); 1217 sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev));
1196 else if (S_ISFIFO(mode)) 1218 else if (S_ISFIFO(mode))
1197 *name = 0; 1219 *name = 0;
1220 else if (S_ISSOCK(mode))
1221 *name = 0;
1198 else { 1222 else {
1199 __putname(name); 1223 __putname(name);
1200 return -EINVAL; 1224 return -EINVAL;
@@ -1206,7 +1230,21 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1206 return retval; 1230 return retval;
1207} 1231}
1208 1232
1209static const struct inode_operations v9fs_dir_inode_operations_ext = { 1233static const struct inode_operations v9fs_dir_inode_operations_dotu = {
1234 .create = v9fs_vfs_create,
1235 .lookup = v9fs_vfs_lookup,
1236 .symlink = v9fs_vfs_symlink,
1237 .link = v9fs_vfs_link,
1238 .unlink = v9fs_vfs_unlink,
1239 .mkdir = v9fs_vfs_mkdir,
1240 .rmdir = v9fs_vfs_rmdir,
1241 .mknod = v9fs_vfs_mknod,
1242 .rename = v9fs_vfs_rename,
1243 .getattr = v9fs_vfs_getattr,
1244 .setattr = v9fs_vfs_setattr,
1245};
1246
1247static const struct inode_operations v9fs_dir_inode_operations_dotl = {
1210 .create = v9fs_vfs_create, 1248 .create = v9fs_vfs_create,
1211 .lookup = v9fs_vfs_lookup, 1249 .lookup = v9fs_vfs_lookup,
1212 .symlink = v9fs_vfs_symlink, 1250 .symlink = v9fs_vfs_symlink,
@@ -1237,6 +1275,11 @@ static const struct inode_operations v9fs_file_inode_operations = {
1237 .setattr = v9fs_vfs_setattr, 1275 .setattr = v9fs_vfs_setattr,
1238}; 1276};
1239 1277
1278static const struct inode_operations v9fs_file_inode_operations_dotl = {
1279 .getattr = v9fs_vfs_getattr,
1280 .setattr = v9fs_vfs_setattr,
1281};
1282
1240static const struct inode_operations v9fs_symlink_inode_operations = { 1283static const struct inode_operations v9fs_symlink_inode_operations = {
1241 .readlink = generic_readlink, 1284 .readlink = generic_readlink,
1242 .follow_link = v9fs_vfs_follow_link, 1285 .follow_link = v9fs_vfs_follow_link,
@@ -1244,3 +1287,11 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
1244 .getattr = v9fs_vfs_getattr, 1287 .getattr = v9fs_vfs_getattr,
1245 .setattr = v9fs_vfs_setattr, 1288 .setattr = v9fs_vfs_setattr,
1246}; 1289};
1290
1291static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
1292 .readlink = generic_readlink,
1293 .follow_link = v9fs_vfs_follow_link,
1294 .put_link = v9fs_vfs_put_link,
1295 .getattr = v9fs_vfs_getattr,
1296 .setattr = v9fs_vfs_setattr,
1297};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 806da5d3b3a0..be74d020436e 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -38,6 +38,7 @@
38#include <linux/idr.h> 38#include <linux/idr.h>
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/statfs.h>
41#include <net/9p/9p.h> 42#include <net/9p/9p.h>
42#include <net/9p/client.h> 43#include <net/9p/client.h>
43 44
@@ -45,7 +46,7 @@
45#include "v9fs_vfs.h" 46#include "v9fs_vfs.h"
46#include "fid.h" 47#include "fid.h"
47 48
48static const struct super_operations v9fs_super_ops; 49static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
49 50
50/** 51/**
51 * v9fs_set_super - set the superblock 52 * v9fs_set_super - set the superblock
@@ -76,7 +77,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
76 sb->s_blocksize_bits = fls(v9ses->maxdata - 1); 77 sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
77 sb->s_blocksize = 1 << sb->s_blocksize_bits; 78 sb->s_blocksize = 1 << sb->s_blocksize_bits;
78 sb->s_magic = V9FS_MAGIC; 79 sb->s_magic = V9FS_MAGIC;
79 sb->s_op = &v9fs_super_ops; 80 if (v9fs_proto_dotl(v9ses))
81 sb->s_op = &v9fs_super_ops_dotl;
82 else
83 sb->s_op = &v9fs_super_ops;
80 sb->s_bdi = &v9ses->bdi; 84 sb->s_bdi = &v9ses->bdi;
81 85
82 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | 86 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
@@ -211,6 +215,42 @@ v9fs_umount_begin(struct super_block *sb)
211 v9fs_session_begin_cancel(v9ses); 215 v9fs_session_begin_cancel(v9ses);
212} 216}
213 217
218static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
219{
220 struct v9fs_session_info *v9ses;
221 struct p9_fid *fid;
222 struct p9_rstatfs rs;
223 int res;
224
225 fid = v9fs_fid_lookup(dentry);
226 if (IS_ERR(fid)) {
227 res = PTR_ERR(fid);
228 goto done;
229 }
230
231 v9ses = v9fs_inode2v9ses(dentry->d_inode);
232 if (v9fs_proto_dotl(v9ses)) {
233 res = p9_client_statfs(fid, &rs);
234 if (res == 0) {
235 buf->f_type = rs.type;
236 buf->f_bsize = rs.bsize;
237 buf->f_blocks = rs.blocks;
238 buf->f_bfree = rs.bfree;
239 buf->f_bavail = rs.bavail;
240 buf->f_files = rs.files;
241 buf->f_ffree = rs.ffree;
242 buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL;
243 buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL;
244 buf->f_namelen = rs.namelen;
245 }
246 if (res != -ENOSYS)
247 goto done;
248 }
249 res = simple_statfs(dentry, buf);
250done:
251 return res;
252}
253
214static const struct super_operations v9fs_super_ops = { 254static const struct super_operations v9fs_super_ops = {
215#ifdef CONFIG_9P_FSCACHE 255#ifdef CONFIG_9P_FSCACHE
216 .alloc_inode = v9fs_alloc_inode, 256 .alloc_inode = v9fs_alloc_inode,
@@ -222,6 +262,17 @@ static const struct super_operations v9fs_super_ops = {
222 .umount_begin = v9fs_umount_begin, 262 .umount_begin = v9fs_umount_begin,
223}; 263};
224 264
265static const struct super_operations v9fs_super_ops_dotl = {
266#ifdef CONFIG_9P_FSCACHE
267 .alloc_inode = v9fs_alloc_inode,
268 .destroy_inode = v9fs_destroy_inode,
269#endif
270 .statfs = v9fs_statfs,
271 .clear_inode = v9fs_clear_inode,
272 .show_options = generic_show_options,
273 .umount_begin = v9fs_umount_begin,
274};
275
225struct file_system_type v9fs_fs_type = { 276struct file_system_type v9fs_fs_type = {
226 .name = "9p", 277 .name = "9p",
227 .get_sb = v9fs_get_sb, 278 .get_sb = v9fs_get_sb,
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 23aa52f548a0..f4287e4de744 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -197,7 +197,7 @@ const struct file_operations adfs_dir_operations = {
197 .read = generic_read_dir, 197 .read = generic_read_dir,
198 .llseek = generic_file_llseek, 198 .llseek = generic_file_llseek,
199 .readdir = adfs_readdir, 199 .readdir = adfs_readdir,
200 .fsync = simple_fsync, 200 .fsync = generic_file_fsync,
201}; 201};
202 202
203static int 203static int
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 005ea34d1758..a36da5382b40 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -26,7 +26,7 @@ const struct file_operations adfs_file_operations = {
26 .read = do_sync_read, 26 .read = do_sync_read,
27 .aio_read = generic_file_aio_read, 27 .aio_read = generic_file_aio_read,
28 .mmap = generic_file_mmap, 28 .mmap = generic_file_mmap,
29 .fsync = simple_fsync, 29 .fsync = generic_file_fsync,
30 .write = do_sync_write, 30 .write = do_sync_write,
31 .aio_write = generic_file_aio_write, 31 .aio_write = generic_file_aio_write,
32 .splice_read = generic_file_splice_read, 32 .splice_read = generic_file_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 0f5e30978135..6f850b06ab62 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -322,8 +322,9 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
322 if (error) 322 if (error)
323 goto out; 323 goto out;
324 324
325 /* XXX: this is missing some actual on-disk truncation.. */
325 if (ia_valid & ATTR_SIZE) 326 if (ia_valid & ATTR_SIZE)
326 error = vmtruncate(inode, attr->ia_size); 327 error = simple_setsize(inode, attr->ia_size);
327 328
328 if (error) 329 if (error)
329 goto out; 330 goto out;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 861dae68ac12..f05b6155ccc8 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -183,7 +183,7 @@ extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dent
183 183
184void affs_free_prealloc(struct inode *inode); 184void affs_free_prealloc(struct inode *inode);
185extern void affs_truncate(struct inode *); 185extern void affs_truncate(struct inode *);
186int affs_file_fsync(struct file *, struct dentry *, int); 186int affs_file_fsync(struct file *, int);
187 187
188/* dir.c */ 188/* dir.c */
189 189
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 184e55c1c9ba..322710c3eedf 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -916,9 +916,9 @@ affs_truncate(struct inode *inode)
916 affs_free_prealloc(inode); 916 affs_free_prealloc(inode);
917} 917}
918 918
919int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync) 919int affs_file_fsync(struct file *filp, int datasync)
920{ 920{
921 struct inode * inode = dentry->d_inode; 921 struct inode *inode = filp->f_mapping->host;
922 int ret, err; 922 int ret, err;
923 923
924 ret = write_inode_now(inode, 0); 924 ret = write_inode_now(inode, 0);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d70bbbac6b7b..914d1c0bc07a 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -224,7 +224,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
224 affs_brelse(bh); 224 affs_brelse(bh);
225 inode = affs_iget(sb, ino); 225 inode = affs_iget(sb, ino);
226 if (IS_ERR(inode)) 226 if (IS_ERR(inode))
227 return ERR_PTR(PTR_ERR(inode)); 227 return ERR_CAST(inode);
228 } 228 }
229 dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations; 229 dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
230 d_add(dentry, inode); 230 d_add(dentry, inode);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 807f284cc75e..5f679b77ce24 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -740,7 +740,7 @@ extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
740extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, 740extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
741 unsigned long, loff_t); 741 unsigned long, loff_t);
742extern int afs_writeback_all(struct afs_vnode *); 742extern int afs_writeback_all(struct afs_vnode *);
743extern int afs_fsync(struct file *, struct dentry *, int); 743extern int afs_fsync(struct file *, int);
744 744
745 745
746/*****************************************************************************/ 746/*****************************************************************************/
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3bed54a294d4..3dab9e9948d0 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -701,8 +701,9 @@ int afs_writeback_all(struct afs_vnode *vnode)
701 * - the return status from this call provides a reliable indication of 701 * - the return status from this call provides a reliable indication of
702 * whether any write errors occurred for this process. 702 * whether any write errors occurred for this process.
703 */ 703 */
704int afs_fsync(struct file *file, struct dentry *dentry, int datasync) 704int afs_fsync(struct file *file, int datasync)
705{ 705{
706 struct dentry *dentry = file->f_path.dentry;
706 struct afs_writeback *wb, *xwb; 707 struct afs_writeback *wb, *xwb;
707 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); 708 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
708 int ret; 709 int ret;
diff --git a/fs/aio.c b/fs/aio.c
index 1cf12b3dd83a..1ccf25cef1f0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -36,6 +36,7 @@
36#include <linux/blkdev.h> 36#include <linux/blkdev.h>
37#include <linux/mempool.h> 37#include <linux/mempool.h>
38#include <linux/hash.h> 38#include <linux/hash.h>
39#include <linux/compat.h>
39 40
40#include <asm/kmap_types.h> 41#include <asm/kmap_types.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
@@ -526,7 +527,7 @@ static void aio_fput_routine(struct work_struct *data)
526 527
527 /* Complete the fput(s) */ 528 /* Complete the fput(s) */
528 if (req->ki_filp != NULL) 529 if (req->ki_filp != NULL)
529 __fput(req->ki_filp); 530 fput(req->ki_filp);
530 531
531 /* Link the iocb into the context's free list */ 532 /* Link the iocb into the context's free list */
532 spin_lock_irq(&ctx->ctx_lock); 533 spin_lock_irq(&ctx->ctx_lock);
@@ -559,11 +560,11 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
559 560
560 /* 561 /*
561 * Try to optimize the aio and eventfd file* puts, by avoiding to 562 * Try to optimize the aio and eventfd file* puts, by avoiding to
562 * schedule work in case it is not __fput() time. In normal cases, 563 * schedule work in case it is not final fput() time. In normal cases,
563 * we would not be holding the last reference to the file*, so 564 * we would not be holding the last reference to the file*, so
564 * this function will be executed w/out any aio kthread wakeup. 565 * this function will be executed w/out any aio kthread wakeup.
565 */ 566 */
566 if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) { 567 if (unlikely(!fput_atomic(req->ki_filp))) {
567 get_ioctx(ctx); 568 get_ioctx(ctx);
568 spin_lock(&fput_lock); 569 spin_lock(&fput_lock);
569 list_add(&req->ki_list, &fput_head); 570 list_add(&req->ki_list, &fput_head);
@@ -1384,13 +1385,22 @@ static ssize_t aio_fsync(struct kiocb *iocb)
1384 return ret; 1385 return ret;
1385} 1386}
1386 1387
1387static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb) 1388static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
1388{ 1389{
1389 ssize_t ret; 1390 ssize_t ret;
1390 1391
1391 ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf, 1392#ifdef CONFIG_COMPAT
1392 kiocb->ki_nbytes, 1, 1393 if (compat)
1393 &kiocb->ki_inline_vec, &kiocb->ki_iovec); 1394 ret = compat_rw_copy_check_uvector(type,
1395 (struct compat_iovec __user *)kiocb->ki_buf,
1396 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1397 &kiocb->ki_iovec);
1398 else
1399#endif
1400 ret = rw_copy_check_uvector(type,
1401 (struct iovec __user *)kiocb->ki_buf,
1402 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1403 &kiocb->ki_iovec);
1394 if (ret < 0) 1404 if (ret < 0)
1395 goto out; 1405 goto out;
1396 1406
@@ -1420,7 +1430,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
1420 * Performs the initial checks and aio retry method 1430 * Performs the initial checks and aio retry method
1421 * setup for the kiocb at the time of io submission. 1431 * setup for the kiocb at the time of io submission.
1422 */ 1432 */
1423static ssize_t aio_setup_iocb(struct kiocb *kiocb) 1433static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
1424{ 1434{
1425 struct file *file = kiocb->ki_filp; 1435 struct file *file = kiocb->ki_filp;
1426 ssize_t ret = 0; 1436 ssize_t ret = 0;
@@ -1469,7 +1479,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
1469 ret = security_file_permission(file, MAY_READ); 1479 ret = security_file_permission(file, MAY_READ);
1470 if (unlikely(ret)) 1480 if (unlikely(ret))
1471 break; 1481 break;
1472 ret = aio_setup_vectored_rw(READ, kiocb); 1482 ret = aio_setup_vectored_rw(READ, kiocb, compat);
1473 if (ret) 1483 if (ret)
1474 break; 1484 break;
1475 ret = -EINVAL; 1485 ret = -EINVAL;
@@ -1483,7 +1493,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
1483 ret = security_file_permission(file, MAY_WRITE); 1493 ret = security_file_permission(file, MAY_WRITE);
1484 if (unlikely(ret)) 1494 if (unlikely(ret))
1485 break; 1495 break;
1486 ret = aio_setup_vectored_rw(WRITE, kiocb); 1496 ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
1487 if (ret) 1497 if (ret)
1488 break; 1498 break;
1489 ret = -EINVAL; 1499 ret = -EINVAL;
@@ -1548,7 +1558,8 @@ static void aio_batch_free(struct hlist_head *batch_hash)
1548} 1558}
1549 1559
1550static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1560static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1551 struct iocb *iocb, struct hlist_head *batch_hash) 1561 struct iocb *iocb, struct hlist_head *batch_hash,
1562 bool compat)
1552{ 1563{
1553 struct kiocb *req; 1564 struct kiocb *req;
1554 struct file *file; 1565 struct file *file;
@@ -1609,7 +1620,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1609 req->ki_left = req->ki_nbytes = iocb->aio_nbytes; 1620 req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
1610 req->ki_opcode = iocb->aio_lio_opcode; 1621 req->ki_opcode = iocb->aio_lio_opcode;
1611 1622
1612 ret = aio_setup_iocb(req); 1623 ret = aio_setup_iocb(req, compat);
1613 1624
1614 if (ret) 1625 if (ret)
1615 goto out_put_req; 1626 goto out_put_req;
@@ -1637,20 +1648,8 @@ out_put_req:
1637 return ret; 1648 return ret;
1638} 1649}
1639 1650
1640/* sys_io_submit: 1651long do_io_submit(aio_context_t ctx_id, long nr,
1641 * Queue the nr iocbs pointed to by iocbpp for processing. Returns 1652 struct iocb __user *__user *iocbpp, bool compat)
1642 * the number of iocbs queued. May return -EINVAL if the aio_context
1643 * specified by ctx_id is invalid, if nr is < 0, if the iocb at
1644 * *iocbpp[0] is not properly initialized, if the operation specified
1645 * is invalid for the file descriptor in the iocb. May fail with
1646 * -EFAULT if any of the data structures point to invalid data. May
1647 * fail with -EBADF if the file descriptor specified in the first
1648 * iocb is invalid. May fail with -EAGAIN if insufficient resources
1649 * are available to queue any iocbs. Will return 0 if nr is 0. Will
1650 * fail with -ENOSYS if not implemented.
1651 */
1652SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1653 struct iocb __user * __user *, iocbpp)
1654{ 1653{
1655 struct kioctx *ctx; 1654 struct kioctx *ctx;
1656 long ret = 0; 1655 long ret = 0;
@@ -1687,7 +1686,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1687 break; 1686 break;
1688 } 1687 }
1689 1688
1690 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash); 1689 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat);
1691 if (ret) 1690 if (ret)
1692 break; 1691 break;
1693 } 1692 }
@@ -1697,6 +1696,24 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1697 return i ? i : ret; 1696 return i ? i : ret;
1698} 1697}
1699 1698
1699/* sys_io_submit:
1700 * Queue the nr iocbs pointed to by iocbpp for processing. Returns
1701 * the number of iocbs queued. May return -EINVAL if the aio_context
1702 * specified by ctx_id is invalid, if nr is < 0, if the iocb at
1703 * *iocbpp[0] is not properly initialized, if the operation specified
1704 * is invalid for the file descriptor in the iocb. May fail with
1705 * -EFAULT if any of the data structures point to invalid data. May
1706 * fail with -EBADF if the file descriptor specified in the first
1707 * iocb is invalid. May fail with -EAGAIN if insufficient resources
1708 * are available to queue any iocbs. Will return 0 if nr is 0. Will
1709 * fail with -ENOSYS if not implemented.
1710 */
1711SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1712 struct iocb __user * __user *, iocbpp)
1713{
1714 return do_io_submit(ctx_id, nr, iocbpp, 0);
1715}
1716
1700/* lookup_kiocb 1717/* lookup_kiocb
1701 * Finds a given iocb for cancellation. 1718 * Finds a given iocb for cancellation.
1702 */ 1719 */
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 9bd4b3876c99..e4b75d6eda83 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -205,7 +205,7 @@ static struct inode *anon_inode_mkinode(void)
205 * that it already _is_ on the dirty list. 205 * that it already _is_ on the dirty list.
206 */ 206 */
207 inode->i_state = I_DIRTY; 207 inode->i_state = I_DIRTY;
208 inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; 208 inode->i_mode = S_IRUSR | S_IWUSR;
209 inode->i_uid = current_fsuid(); 209 inode->i_uid = current_fsuid();
210 inode->i_gid = current_fsgid(); 210 inode->i_gid = current_fsgid();
211 inode->i_flags |= S_PRIVATE; 211 inode->i_flags |= S_PRIVATE;
diff --git a/fs/attr.c b/fs/attr.c
index 0815e93bb487..b4fa3b0aa596 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -67,14 +67,14 @@ EXPORT_SYMBOL(inode_change_ok);
67 * @offset: the new size to assign to the inode 67 * @offset: the new size to assign to the inode
68 * @Returns: 0 on success, -ve errno on failure 68 * @Returns: 0 on success, -ve errno on failure
69 * 69 *
70 * inode_newsize_ok must be called with i_mutex held.
71 *
70 * inode_newsize_ok will check filesystem limits and ulimits to check that the 72 * inode_newsize_ok will check filesystem limits and ulimits to check that the
71 * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ 73 * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
72 * when necessary. Caller must not proceed with inode size change if failure is 74 * when necessary. Caller must not proceed with inode size change if failure is
73 * returned. @inode must be a file (not directory), with appropriate 75 * returned. @inode must be a file (not directory), with appropriate
74 * permissions to allow truncate (inode_newsize_ok does NOT check these 76 * permissions to allow truncate (inode_newsize_ok does NOT check these
75 * conditions). 77 * conditions).
76 *
77 * inode_newsize_ok must be called with i_mutex held.
78 */ 78 */
79int inode_newsize_ok(const struct inode *inode, loff_t offset) 79int inode_newsize_ok(const struct inode *inode, loff_t offset)
80{ 80{
@@ -104,17 +104,25 @@ out_big:
104} 104}
105EXPORT_SYMBOL(inode_newsize_ok); 105EXPORT_SYMBOL(inode_newsize_ok);
106 106
107int inode_setattr(struct inode * inode, struct iattr * attr) 107/**
108 * generic_setattr - copy simple metadata updates into the generic inode
109 * @inode: the inode to be updated
110 * @attr: the new attributes
111 *
112 * generic_setattr must be called with i_mutex held.
113 *
114 * generic_setattr updates the inode's metadata with that specified
115 * in attr. Noticeably missing is inode size update, which is more complex
116 * as it requires pagecache updates. See simple_setsize.
117 *
118 * The inode is not marked as dirty after this operation. The rationale is
119 * that for "simple" filesystems, the struct inode is the inode storage.
120 * The caller is free to mark the inode dirty afterwards if needed.
121 */
122void generic_setattr(struct inode *inode, const struct iattr *attr)
108{ 123{
109 unsigned int ia_valid = attr->ia_valid; 124 unsigned int ia_valid = attr->ia_valid;
110 125
111 if (ia_valid & ATTR_SIZE &&
112 attr->ia_size != i_size_read(inode)) {
113 int error = vmtruncate(inode, attr->ia_size);
114 if (error)
115 return error;
116 }
117
118 if (ia_valid & ATTR_UID) 126 if (ia_valid & ATTR_UID)
119 inode->i_uid = attr->ia_uid; 127 inode->i_uid = attr->ia_uid;
120 if (ia_valid & ATTR_GID) 128 if (ia_valid & ATTR_GID)
@@ -135,6 +143,28 @@ int inode_setattr(struct inode * inode, struct iattr * attr)
135 mode &= ~S_ISGID; 143 mode &= ~S_ISGID;
136 inode->i_mode = mode; 144 inode->i_mode = mode;
137 } 145 }
146}
147EXPORT_SYMBOL(generic_setattr);
148
149/*
150 * note this function is deprecated, the new truncate sequence should be
151 * used instead -- see eg. simple_setsize, generic_setattr.
152 */
153int inode_setattr(struct inode *inode, const struct iattr *attr)
154{
155 unsigned int ia_valid = attr->ia_valid;
156
157 if (ia_valid & ATTR_SIZE &&
158 attr->ia_size != i_size_read(inode)) {
159 int error;
160
161 error = vmtruncate(inode, attr->ia_size);
162 if (error)
163 return error;
164 }
165
166 generic_setattr(inode, attr);
167
138 mark_inode_dirty(inode); 168 mark_inode_dirty(inode);
139 169
140 return 0; 170 return 0;
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 8713c7cfbc79..9a0520b50663 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -28,6 +28,7 @@ static int autofs_root_mkdir(struct inode *,struct dentry *,int);
28static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); 28static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
29 29
30const struct file_operations autofs_root_operations = { 30const struct file_operations autofs_root_operations = {
31 .llseek = generic_file_llseek,
31 .read = generic_read_dir, 32 .read = generic_read_dir,
32 .readdir = autofs_root_readdir, 33 .readdir = autofs_root_readdir,
33 .ioctl = autofs_root_ioctl, 34 .ioctl = autofs_root_ioctl,
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index d29b7f6df862..ba4a38b9c22f 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -95,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
95 */ 95 */
96static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in) 96static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
97{ 97{
98 struct autofs_dev_ioctl tmp, *ads; 98 struct autofs_dev_ioctl tmp;
99 99
100 if (copy_from_user(&tmp, in, sizeof(tmp))) 100 if (copy_from_user(&tmp, in, sizeof(tmp)))
101 return ERR_PTR(-EFAULT); 101 return ERR_PTR(-EFAULT);
@@ -103,16 +103,7 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
103 if (tmp.size < sizeof(tmp)) 103 if (tmp.size < sizeof(tmp))
104 return ERR_PTR(-EINVAL); 104 return ERR_PTR(-EINVAL);
105 105
106 ads = kmalloc(tmp.size, GFP_KERNEL); 106 return memdup_user(in, tmp.size);
107 if (!ads)
108 return ERR_PTR(-ENOMEM);
109
110 if (copy_from_user(ads, in, tmp.size)) {
111 kfree(ads);
112 return ERR_PTR(-EFAULT);
113 }
114
115 return ads;
116} 107}
117 108
118static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) 109static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
@@ -736,11 +727,14 @@ static const struct file_operations _dev_ioctl_fops = {
736}; 727};
737 728
738static struct miscdevice _autofs_dev_ioctl_misc = { 729static struct miscdevice _autofs_dev_ioctl_misc = {
739 .minor = MISC_DYNAMIC_MINOR, 730 .minor = AUTOFS_MINOR,
740 .name = AUTOFS_DEVICE_NAME, 731 .name = AUTOFS_DEVICE_NAME,
741 .fops = &_dev_ioctl_fops 732 .fops = &_dev_ioctl_fops
742}; 733};
743 734
735MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
736MODULE_ALIAS("devname:autofs");
737
744/* Register/deregister misc character device */ 738/* Register/deregister misc character device */
745int autofs_dev_ioctl_init(void) 739int autofs_dev_ioctl_init(void)
746{ 740{
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e8e5e63ac950..db4117ed7803 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -18,13 +18,14 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/param.h> 19#include <linux/param.h>
20#include <linux/time.h> 20#include <linux/time.h>
21#include <linux/smp_lock.h>
21#include "autofs_i.h" 22#include "autofs_i.h"
22 23
23static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); 24static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
24static int autofs4_dir_unlink(struct inode *,struct dentry *); 25static int autofs4_dir_unlink(struct inode *,struct dentry *);
25static int autofs4_dir_rmdir(struct inode *,struct dentry *); 26static int autofs4_dir_rmdir(struct inode *,struct dentry *);
26static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); 27static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
27static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); 28static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
28static int autofs4_dir_open(struct inode *inode, struct file *file); 29static int autofs4_dir_open(struct inode *inode, struct file *file);
29static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); 30static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
30static void *autofs4_follow_link(struct dentry *, struct nameidata *); 31static void *autofs4_follow_link(struct dentry *, struct nameidata *);
@@ -38,7 +39,7 @@ const struct file_operations autofs4_root_operations = {
38 .read = generic_read_dir, 39 .read = generic_read_dir,
39 .readdir = dcache_readdir, 40 .readdir = dcache_readdir,
40 .llseek = dcache_dir_lseek, 41 .llseek = dcache_dir_lseek,
41 .ioctl = autofs4_root_ioctl, 42 .unlocked_ioctl = autofs4_root_ioctl,
42}; 43};
43 44
44const struct file_operations autofs4_dir_operations = { 45const struct file_operations autofs4_dir_operations = {
@@ -902,8 +903,8 @@ int is_autofs4_dentry(struct dentry *dentry)
902 * ioctl()'s on the root directory is the chief method for the daemon to 903 * ioctl()'s on the root directory is the chief method for the daemon to
903 * generate kernel reactions 904 * generate kernel reactions
904 */ 905 */
905static int autofs4_root_ioctl(struct inode *inode, struct file *filp, 906static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
906 unsigned int cmd, unsigned long arg) 907 unsigned int cmd, unsigned long arg)
907{ 908{
908 struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); 909 struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
909 void __user *p = (void __user *)arg; 910 void __user *p = (void __user *)arg;
@@ -947,3 +948,16 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
947 return -ENOSYS; 948 return -ENOSYS;
948 } 949 }
949} 950}
951
952static long autofs4_root_ioctl(struct file *filp,
953 unsigned int cmd, unsigned long arg)
954{
955 long ret;
956 struct inode *inode = filp->f_dentry->d_inode;
957
958 lock_kernel();
959 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
960 unlock_kernel();
961
962 return ret;
963}
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index a05287a23f62..52e59bf4aa5f 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -93,8 +93,7 @@ static int bad_file_release(struct inode *inode, struct file *filp)
93 return -EIO; 93 return -EIO;
94} 94}
95 95
96static int bad_file_fsync(struct file *file, struct dentry *dentry, 96static int bad_file_fsync(struct file *file, int datasync)
97 int datasync)
98{ 97{
99 return -EIO; 98 return -EIO;
100} 99}
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 8f73841fc974..d967e052b779 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -78,7 +78,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
78const struct file_operations bfs_dir_operations = { 78const struct file_operations bfs_dir_operations = {
79 .read = generic_read_dir, 79 .read = generic_read_dir,
80 .readdir = bfs_readdir, 80 .readdir = bfs_readdir,
81 .fsync = simple_fsync, 81 .fsync = generic_file_fsync,
82 .llseek = generic_file_llseek, 82 .llseek = generic_file_llseek,
83}; 83};
84 84
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 26e5f5026620..7346c96308a5 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -172,8 +172,9 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
172 struct file *file = iocb->ki_filp; 172 struct file *file = iocb->ki_filp;
173 struct inode *inode = file->f_mapping->host; 173 struct inode *inode = file->f_mapping->host;
174 174
175 return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode), 175 return blockdev_direct_IO_no_locking_newtrunc(rw, iocb, inode,
176 iov, offset, nr_segs, blkdev_get_blocks, NULL); 176 I_BDEV(inode), iov, offset, nr_segs,
177 blkdev_get_blocks, NULL);
177} 178}
178 179
179int __sync_blockdev(struct block_device *bdev, int wait) 180int __sync_blockdev(struct block_device *bdev, int wait)
@@ -309,8 +310,8 @@ static int blkdev_write_begin(struct file *file, struct address_space *mapping,
309 struct page **pagep, void **fsdata) 310 struct page **pagep, void **fsdata)
310{ 311{
311 *pagep = NULL; 312 *pagep = NULL;
312 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 313 return block_write_begin_newtrunc(file, mapping, pos, len, flags,
313 blkdev_get_block); 314 pagep, fsdata, blkdev_get_block);
314} 315}
315 316
316static int blkdev_write_end(struct file *file, struct address_space *mapping, 317static int blkdev_write_end(struct file *file, struct address_space *mapping,
@@ -358,12 +359,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
358 return retval; 359 return retval;
359} 360}
360 361
361/* 362int blkdev_fsync(struct file *filp, int datasync)
362 * Filp is never NULL; the only case when ->fsync() is called with
363 * NULL first argument is nfsd_sync_dir() and that's not a directory.
364 */
365
366int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
367{ 363{
368 struct inode *bd_inode = filp->f_mapping->host; 364 struct inode *bd_inode = filp->f_mapping->host;
369 struct block_device *bdev = I_BDEV(bd_inode); 365 struct block_device *bdev = I_BDEV(bd_inode);
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 462859a30141..7ec14097fef1 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -377,6 +377,7 @@ again:
377 if (!list_empty(&worker->pending) || 377 if (!list_empty(&worker->pending) ||
378 !list_empty(&worker->prio_pending)) { 378 !list_empty(&worker->prio_pending)) {
379 spin_unlock_irq(&worker->lock); 379 spin_unlock_irq(&worker->lock);
380 set_current_state(TASK_RUNNING);
380 goto again; 381 goto again;
381 } 382 }
382 383
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7a4dee199832..6ad63f17eca0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -137,8 +137,8 @@ struct btrfs_inode {
137 * of extent items we've reserved metadata for. 137 * of extent items we've reserved metadata for.
138 */ 138 */
139 spinlock_t accounting_lock; 139 spinlock_t accounting_lock;
140 atomic_t outstanding_extents;
140 int reserved_extents; 141 int reserved_extents;
141 int outstanding_extents;
142 142
143 /* 143 /*
144 * ordered_data_close is set by truncate when a file that used 144 * ordered_data_close is set by truncate when a file that used
@@ -151,6 +151,7 @@ struct btrfs_inode {
151 * of these. 151 * of these.
152 */ 152 */
153 unsigned ordered_data_close:1; 153 unsigned ordered_data_close:1;
154 unsigned orphan_meta_reserved:1;
154 unsigned dummy_inode:1; 155 unsigned dummy_inode:1;
155 156
156 /* 157 /*
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6795a713b205..0d1d966b0fe4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -280,7 +280,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
280static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, 280static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
281 struct btrfs_root *root, 281 struct btrfs_root *root,
282 struct extent_buffer *buf, 282 struct extent_buffer *buf,
283 struct extent_buffer *cow) 283 struct extent_buffer *cow,
284 int *last_ref)
284{ 285{
285 u64 refs; 286 u64 refs;
286 u64 owner; 287 u64 owner;
@@ -366,6 +367,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
366 BUG_ON(ret); 367 BUG_ON(ret);
367 } 368 }
368 clean_tree_block(trans, root, buf); 369 clean_tree_block(trans, root, buf);
370 *last_ref = 1;
369 } 371 }
370 return 0; 372 return 0;
371} 373}
@@ -392,6 +394,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
392 struct btrfs_disk_key disk_key; 394 struct btrfs_disk_key disk_key;
393 struct extent_buffer *cow; 395 struct extent_buffer *cow;
394 int level; 396 int level;
397 int last_ref = 0;
395 int unlock_orig = 0; 398 int unlock_orig = 0;
396 u64 parent_start; 399 u64 parent_start;
397 400
@@ -442,7 +445,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
442 (unsigned long)btrfs_header_fsid(cow), 445 (unsigned long)btrfs_header_fsid(cow),
443 BTRFS_FSID_SIZE); 446 BTRFS_FSID_SIZE);
444 447
445 update_ref_for_cow(trans, root, buf, cow); 448 update_ref_for_cow(trans, root, buf, cow, &last_ref);
449
450 if (root->ref_cows)
451 btrfs_reloc_cow_block(trans, root, buf, cow);
446 452
447 if (buf == root->node) { 453 if (buf == root->node) {
448 WARN_ON(parent && parent != buf); 454 WARN_ON(parent && parent != buf);
@@ -457,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
457 extent_buffer_get(cow); 463 extent_buffer_get(cow);
458 spin_unlock(&root->node_lock); 464 spin_unlock(&root->node_lock);
459 465
460 btrfs_free_tree_block(trans, root, buf->start, buf->len, 466 btrfs_free_tree_block(trans, root, buf, parent_start,
461 parent_start, root->root_key.objectid, level); 467 last_ref);
462 free_extent_buffer(buf); 468 free_extent_buffer(buf);
463 add_root_to_dirty_list(root); 469 add_root_to_dirty_list(root);
464 } else { 470 } else {
@@ -473,8 +479,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
473 btrfs_set_node_ptr_generation(parent, parent_slot, 479 btrfs_set_node_ptr_generation(parent, parent_slot,
474 trans->transid); 480 trans->transid);
475 btrfs_mark_buffer_dirty(parent); 481 btrfs_mark_buffer_dirty(parent);
476 btrfs_free_tree_block(trans, root, buf->start, buf->len, 482 btrfs_free_tree_block(trans, root, buf, parent_start,
477 parent_start, root->root_key.objectid, level); 483 last_ref);
478 } 484 }
479 if (unlock_orig) 485 if (unlock_orig)
480 btrfs_tree_unlock(buf); 486 btrfs_tree_unlock(buf);
@@ -949,6 +955,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
949 return bin_search(eb, key, level, slot); 955 return bin_search(eb, key, level, slot);
950} 956}
951 957
958static void root_add_used(struct btrfs_root *root, u32 size)
959{
960 spin_lock(&root->accounting_lock);
961 btrfs_set_root_used(&root->root_item,
962 btrfs_root_used(&root->root_item) + size);
963 spin_unlock(&root->accounting_lock);
964}
965
966static void root_sub_used(struct btrfs_root *root, u32 size)
967{
968 spin_lock(&root->accounting_lock);
969 btrfs_set_root_used(&root->root_item,
970 btrfs_root_used(&root->root_item) - size);
971 spin_unlock(&root->accounting_lock);
972}
973
952/* given a node and slot number, this reads the blocks it points to. The 974/* given a node and slot number, this reads the blocks it points to. The
953 * extent buffer is returned with a reference taken (but unlocked). 975 * extent buffer is returned with a reference taken (but unlocked).
954 * NULL is returned on error. 976 * NULL is returned on error.
@@ -1019,7 +1041,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1019 btrfs_tree_lock(child); 1041 btrfs_tree_lock(child);
1020 btrfs_set_lock_blocking(child); 1042 btrfs_set_lock_blocking(child);
1021 ret = btrfs_cow_block(trans, root, child, mid, 0, &child); 1043 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1022 BUG_ON(ret); 1044 if (ret) {
1045 btrfs_tree_unlock(child);
1046 free_extent_buffer(child);
1047 goto enospc;
1048 }
1023 1049
1024 spin_lock(&root->node_lock); 1050 spin_lock(&root->node_lock);
1025 root->node = child; 1051 root->node = child;
@@ -1034,11 +1060,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1034 btrfs_tree_unlock(mid); 1060 btrfs_tree_unlock(mid);
1035 /* once for the path */ 1061 /* once for the path */
1036 free_extent_buffer(mid); 1062 free_extent_buffer(mid);
1037 ret = btrfs_free_tree_block(trans, root, mid->start, mid->len, 1063
1038 0, root->root_key.objectid, level); 1064 root_sub_used(root, mid->len);
1065 btrfs_free_tree_block(trans, root, mid, 0, 1);
1039 /* once for the root ptr */ 1066 /* once for the root ptr */
1040 free_extent_buffer(mid); 1067 free_extent_buffer(mid);
1041 return ret; 1068 return 0;
1042 } 1069 }
1043 if (btrfs_header_nritems(mid) > 1070 if (btrfs_header_nritems(mid) >
1044 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 1071 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
@@ -1088,23 +1115,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1088 if (wret < 0 && wret != -ENOSPC) 1115 if (wret < 0 && wret != -ENOSPC)
1089 ret = wret; 1116 ret = wret;
1090 if (btrfs_header_nritems(right) == 0) { 1117 if (btrfs_header_nritems(right) == 0) {
1091 u64 bytenr = right->start;
1092 u32 blocksize = right->len;
1093
1094 clean_tree_block(trans, root, right); 1118 clean_tree_block(trans, root, right);
1095 btrfs_tree_unlock(right); 1119 btrfs_tree_unlock(right);
1096 free_extent_buffer(right);
1097 right = NULL;
1098 wret = del_ptr(trans, root, path, level + 1, pslot + 1120 wret = del_ptr(trans, root, path, level + 1, pslot +
1099 1); 1121 1);
1100 if (wret) 1122 if (wret)
1101 ret = wret; 1123 ret = wret;
1102 wret = btrfs_free_tree_block(trans, root, 1124 root_sub_used(root, right->len);
1103 bytenr, blocksize, 0, 1125 btrfs_free_tree_block(trans, root, right, 0, 1);
1104 root->root_key.objectid, 1126 free_extent_buffer(right);
1105 level); 1127 right = NULL;
1106 if (wret)
1107 ret = wret;
1108 } else { 1128 } else {
1109 struct btrfs_disk_key right_key; 1129 struct btrfs_disk_key right_key;
1110 btrfs_node_key(right, &right_key, 0); 1130 btrfs_node_key(right, &right_key, 0);
@@ -1136,21 +1156,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1136 BUG_ON(wret == 1); 1156 BUG_ON(wret == 1);
1137 } 1157 }
1138 if (btrfs_header_nritems(mid) == 0) { 1158 if (btrfs_header_nritems(mid) == 0) {
1139 /* we've managed to empty the middle node, drop it */
1140 u64 bytenr = mid->start;
1141 u32 blocksize = mid->len;
1142
1143 clean_tree_block(trans, root, mid); 1159 clean_tree_block(trans, root, mid);
1144 btrfs_tree_unlock(mid); 1160 btrfs_tree_unlock(mid);
1145 free_extent_buffer(mid);
1146 mid = NULL;
1147 wret = del_ptr(trans, root, path, level + 1, pslot); 1161 wret = del_ptr(trans, root, path, level + 1, pslot);
1148 if (wret) 1162 if (wret)
1149 ret = wret; 1163 ret = wret;
1150 wret = btrfs_free_tree_block(trans, root, bytenr, blocksize, 1164 root_sub_used(root, mid->len);
1151 0, root->root_key.objectid, level); 1165 btrfs_free_tree_block(trans, root, mid, 0, 1);
1152 if (wret) 1166 free_extent_buffer(mid);
1153 ret = wret; 1167 mid = NULL;
1154 } else { 1168 } else {
1155 /* update the parent key to reflect our changes */ 1169 /* update the parent key to reflect our changes */
1156 struct btrfs_disk_key mid_key; 1170 struct btrfs_disk_key mid_key;
@@ -1590,7 +1604,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
1590 btrfs_release_path(NULL, p); 1604 btrfs_release_path(NULL, p);
1591 1605
1592 ret = -EAGAIN; 1606 ret = -EAGAIN;
1593 tmp = read_tree_block(root, blocknr, blocksize, gen); 1607 tmp = read_tree_block(root, blocknr, blocksize, 0);
1594 if (tmp) { 1608 if (tmp) {
1595 /* 1609 /*
1596 * If the read above didn't mark this buffer up to date, 1610 * If the read above didn't mark this buffer up to date,
@@ -1740,7 +1754,6 @@ again:
1740 p->nodes[level + 1], 1754 p->nodes[level + 1],
1741 p->slots[level + 1], &b); 1755 p->slots[level + 1], &b);
1742 if (err) { 1756 if (err) {
1743 free_extent_buffer(b);
1744 ret = err; 1757 ret = err;
1745 goto done; 1758 goto done;
1746 } 1759 }
@@ -2076,6 +2089,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2076 if (IS_ERR(c)) 2089 if (IS_ERR(c))
2077 return PTR_ERR(c); 2090 return PTR_ERR(c);
2078 2091
2092 root_add_used(root, root->nodesize);
2093
2079 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); 2094 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
2080 btrfs_set_header_nritems(c, 1); 2095 btrfs_set_header_nritems(c, 1);
2081 btrfs_set_header_level(c, level); 2096 btrfs_set_header_level(c, level);
@@ -2134,6 +2149,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2134 int nritems; 2149 int nritems;
2135 2150
2136 BUG_ON(!path->nodes[level]); 2151 BUG_ON(!path->nodes[level]);
2152 btrfs_assert_tree_locked(path->nodes[level]);
2137 lower = path->nodes[level]; 2153 lower = path->nodes[level];
2138 nritems = btrfs_header_nritems(lower); 2154 nritems = btrfs_header_nritems(lower);
2139 BUG_ON(slot > nritems); 2155 BUG_ON(slot > nritems);
@@ -2202,6 +2218,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2202 if (IS_ERR(split)) 2218 if (IS_ERR(split))
2203 return PTR_ERR(split); 2219 return PTR_ERR(split);
2204 2220
2221 root_add_used(root, root->nodesize);
2222
2205 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); 2223 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
2206 btrfs_set_header_level(split, btrfs_header_level(c)); 2224 btrfs_set_header_level(split, btrfs_header_level(c));
2207 btrfs_set_header_bytenr(split, split->start); 2225 btrfs_set_header_bytenr(split, split->start);
@@ -2415,6 +2433,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2415 2433
2416 if (left_nritems) 2434 if (left_nritems)
2417 btrfs_mark_buffer_dirty(left); 2435 btrfs_mark_buffer_dirty(left);
2436 else
2437 clean_tree_block(trans, root, left);
2438
2418 btrfs_mark_buffer_dirty(right); 2439 btrfs_mark_buffer_dirty(right);
2419 2440
2420 btrfs_item_key(right, &disk_key, 0); 2441 btrfs_item_key(right, &disk_key, 0);
@@ -2660,6 +2681,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2660 btrfs_mark_buffer_dirty(left); 2681 btrfs_mark_buffer_dirty(left);
2661 if (right_nritems) 2682 if (right_nritems)
2662 btrfs_mark_buffer_dirty(right); 2683 btrfs_mark_buffer_dirty(right);
2684 else
2685 clean_tree_block(trans, root, right);
2663 2686
2664 btrfs_item_key(right, &disk_key, 0); 2687 btrfs_item_key(right, &disk_key, 0);
2665 wret = fixup_low_keys(trans, root, path, &disk_key, 1); 2688 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
@@ -2669,8 +2692,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2669 /* then fixup the leaf pointer in the path */ 2692 /* then fixup the leaf pointer in the path */
2670 if (path->slots[0] < push_items) { 2693 if (path->slots[0] < push_items) {
2671 path->slots[0] += old_left_nritems; 2694 path->slots[0] += old_left_nritems;
2672 if (btrfs_header_nritems(path->nodes[0]) == 0)
2673 clean_tree_block(trans, root, path->nodes[0]);
2674 btrfs_tree_unlock(path->nodes[0]); 2695 btrfs_tree_unlock(path->nodes[0]);
2675 free_extent_buffer(path->nodes[0]); 2696 free_extent_buffer(path->nodes[0]);
2676 path->nodes[0] = left; 2697 path->nodes[0] = left;
@@ -2932,10 +2953,10 @@ again:
2932 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 2953 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
2933 root->root_key.objectid, 2954 root->root_key.objectid,
2934 &disk_key, 0, l->start, 0); 2955 &disk_key, 0, l->start, 0);
2935 if (IS_ERR(right)) { 2956 if (IS_ERR(right))
2936 BUG_ON(1);
2937 return PTR_ERR(right); 2957 return PTR_ERR(right);
2938 } 2958
2959 root_add_used(root, root->leafsize);
2939 2960
2940 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); 2961 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2941 btrfs_set_header_bytenr(right, right->start); 2962 btrfs_set_header_bytenr(right, right->start);
@@ -3054,7 +3075,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3054 3075
3055 btrfs_set_path_blocking(path); 3076 btrfs_set_path_blocking(path);
3056 ret = split_leaf(trans, root, &key, path, ins_len, 1); 3077 ret = split_leaf(trans, root, &key, path, ins_len, 1);
3057 BUG_ON(ret); 3078 if (ret)
3079 goto err;
3058 3080
3059 path->keep_locks = 0; 3081 path->keep_locks = 0;
3060 btrfs_unlock_up_safe(path, 1); 3082 btrfs_unlock_up_safe(path, 1);
@@ -3796,9 +3818,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3796 */ 3818 */
3797 btrfs_unlock_up_safe(path, 0); 3819 btrfs_unlock_up_safe(path, 0);
3798 3820
3799 ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len, 3821 root_sub_used(root, leaf->len);
3800 0, root->root_key.objectid, 0); 3822
3801 return ret; 3823 btrfs_free_tree_block(trans, root, leaf, 0, 1);
3824 return 0;
3802} 3825}
3803/* 3826/*
3804 * delete the item at the leaf level in path. If that empties 3827 * delete the item at the leaf level in path. If that empties
@@ -3865,6 +3888,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3865 if (leaf == root->node) { 3888 if (leaf == root->node) {
3866 btrfs_set_header_level(leaf, 0); 3889 btrfs_set_header_level(leaf, 0);
3867 } else { 3890 } else {
3891 btrfs_set_path_blocking(path);
3892 clean_tree_block(trans, root, leaf);
3868 ret = btrfs_del_leaf(trans, root, path, leaf); 3893 ret = btrfs_del_leaf(trans, root, path, leaf);
3869 BUG_ON(ret); 3894 BUG_ON(ret);
3870 } 3895 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 746a7248678e..29c20092847e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -34,6 +34,7 @@
34 34
35struct btrfs_trans_handle; 35struct btrfs_trans_handle;
36struct btrfs_transaction; 36struct btrfs_transaction;
37struct btrfs_pending_snapshot;
37extern struct kmem_cache *btrfs_trans_handle_cachep; 38extern struct kmem_cache *btrfs_trans_handle_cachep;
38extern struct kmem_cache *btrfs_transaction_cachep; 39extern struct kmem_cache *btrfs_transaction_cachep;
39extern struct kmem_cache *btrfs_bit_radix_cachep; 40extern struct kmem_cache *btrfs_bit_radix_cachep;
@@ -663,6 +664,7 @@ struct btrfs_csum_item {
663#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) 664#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
664#define BTRFS_BLOCK_GROUP_DUP (1 << 5) 665#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
665#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) 666#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
667#define BTRFS_NR_RAID_TYPES 5
666 668
667struct btrfs_block_group_item { 669struct btrfs_block_group_item {
668 __le64 used; 670 __le64 used;
@@ -674,42 +676,46 @@ struct btrfs_space_info {
674 u64 flags; 676 u64 flags;
675 677
676 u64 total_bytes; /* total bytes in the space */ 678 u64 total_bytes; /* total bytes in the space */
677 u64 bytes_used; /* total bytes used on disk */ 679 u64 bytes_used; /* total bytes used,
680 this doesn't take mirrors into account */
678 u64 bytes_pinned; /* total bytes pinned, will be freed when the 681 u64 bytes_pinned; /* total bytes pinned, will be freed when the
679 transaction finishes */ 682 transaction finishes */
680 u64 bytes_reserved; /* total bytes the allocator has reserved for 683 u64 bytes_reserved; /* total bytes the allocator has reserved for
681 current allocations */ 684 current allocations */
682 u64 bytes_readonly; /* total bytes that are read only */ 685 u64 bytes_readonly; /* total bytes that are read only */
683 u64 bytes_super; /* total bytes reserved for the super blocks */ 686
684 u64 bytes_root; /* the number of bytes needed to commit a
685 transaction */
686 u64 bytes_may_use; /* number of bytes that may be used for 687 u64 bytes_may_use; /* number of bytes that may be used for
687 delalloc/allocations */ 688 delalloc/allocations */
688 u64 bytes_delalloc; /* number of bytes currently reserved for 689 u64 disk_used; /* total bytes used on disk */
689 delayed allocation */
690 690
691 int full; /* indicates that we cannot allocate any more 691 int full; /* indicates that we cannot allocate any more
692 chunks for this space */ 692 chunks for this space */
693 int force_alloc; /* set if we need to force a chunk alloc for 693 int force_alloc; /* set if we need to force a chunk alloc for
694 this space */ 694 this space */
695 int force_delalloc; /* make people start doing filemap_flush until
696 we're under a threshold */
697 695
698 struct list_head list; 696 struct list_head list;
699 697
700 /* for controlling how we free up space for allocations */
701 wait_queue_head_t allocate_wait;
702 wait_queue_head_t flush_wait;
703 int allocating_chunk;
704 int flushing;
705
706 /* for block groups in our same type */ 698 /* for block groups in our same type */
707 struct list_head block_groups; 699 struct list_head block_groups[BTRFS_NR_RAID_TYPES];
708 spinlock_t lock; 700 spinlock_t lock;
709 struct rw_semaphore groups_sem; 701 struct rw_semaphore groups_sem;
710 atomic_t caching_threads; 702 atomic_t caching_threads;
711}; 703};
712 704
705struct btrfs_block_rsv {
706 u64 size;
707 u64 reserved;
708 u64 freed[2];
709 struct btrfs_space_info *space_info;
710 struct list_head list;
711 spinlock_t lock;
712 atomic_t usage;
713 unsigned int priority:8;
714 unsigned int durable:1;
715 unsigned int refill_used:1;
716 unsigned int full:1;
717};
718
713/* 719/*
714 * free clusters are used to claim free space in relatively large chunks, 720 * free clusters are used to claim free space in relatively large chunks,
715 * allowing us to do less seeky writes. They are used for all metadata 721 * allowing us to do less seeky writes. They are used for all metadata
@@ -760,6 +766,7 @@ struct btrfs_block_group_cache {
760 spinlock_t lock; 766 spinlock_t lock;
761 u64 pinned; 767 u64 pinned;
762 u64 reserved; 768 u64 reserved;
769 u64 reserved_pinned;
763 u64 bytes_super; 770 u64 bytes_super;
764 u64 flags; 771 u64 flags;
765 u64 sectorsize; 772 u64 sectorsize;
@@ -825,6 +832,22 @@ struct btrfs_fs_info {
825 /* logical->physical extent mapping */ 832 /* logical->physical extent mapping */
826 struct btrfs_mapping_tree mapping_tree; 833 struct btrfs_mapping_tree mapping_tree;
827 834
835 /* block reservation for extent, checksum and root tree */
836 struct btrfs_block_rsv global_block_rsv;
837 /* block reservation for delayed allocation */
838 struct btrfs_block_rsv delalloc_block_rsv;
839 /* block reservation for metadata operations */
840 struct btrfs_block_rsv trans_block_rsv;
841 /* block reservation for chunk tree */
842 struct btrfs_block_rsv chunk_block_rsv;
843
844 struct btrfs_block_rsv empty_block_rsv;
845
846 /* list of block reservations that cross multiple transactions */
847 struct list_head durable_block_rsv_list;
848
849 struct mutex durable_block_rsv_mutex;
850
828 u64 generation; 851 u64 generation;
829 u64 last_trans_committed; 852 u64 last_trans_committed;
830 853
@@ -927,7 +950,6 @@ struct btrfs_fs_info {
927 struct btrfs_workers endio_meta_write_workers; 950 struct btrfs_workers endio_meta_write_workers;
928 struct btrfs_workers endio_write_workers; 951 struct btrfs_workers endio_write_workers;
929 struct btrfs_workers submit_workers; 952 struct btrfs_workers submit_workers;
930 struct btrfs_workers enospc_workers;
931 /* 953 /*
932 * fixup workers take dirty pages that didn't properly go through 954 * fixup workers take dirty pages that didn't properly go through
933 * the cow mechanism and make them safe to write. It happens 955 * the cow mechanism and make them safe to write. It happens
@@ -943,6 +965,7 @@ struct btrfs_fs_info {
943 int do_barriers; 965 int do_barriers;
944 int closing; 966 int closing;
945 int log_root_recovering; 967 int log_root_recovering;
968 int enospc_unlink;
946 969
947 u64 total_pinned; 970 u64 total_pinned;
948 971
@@ -1012,6 +1035,9 @@ struct btrfs_root {
1012 struct completion kobj_unregister; 1035 struct completion kobj_unregister;
1013 struct mutex objectid_mutex; 1036 struct mutex objectid_mutex;
1014 1037
1038 spinlock_t accounting_lock;
1039 struct btrfs_block_rsv *block_rsv;
1040
1015 struct mutex log_mutex; 1041 struct mutex log_mutex;
1016 wait_queue_head_t log_writer_wait; 1042 wait_queue_head_t log_writer_wait;
1017 wait_queue_head_t log_commit_wait[2]; 1043 wait_queue_head_t log_commit_wait[2];
@@ -1043,7 +1069,6 @@ struct btrfs_root {
1043 int ref_cows; 1069 int ref_cows;
1044 int track_dirty; 1070 int track_dirty;
1045 int in_radix; 1071 int in_radix;
1046 int clean_orphans;
1047 1072
1048 u64 defrag_trans_start; 1073 u64 defrag_trans_start;
1049 struct btrfs_key defrag_progress; 1074 struct btrfs_key defrag_progress;
@@ -1057,8 +1082,11 @@ struct btrfs_root {
1057 1082
1058 struct list_head root_list; 1083 struct list_head root_list;
1059 1084
1060 spinlock_t list_lock; 1085 spinlock_t orphan_lock;
1061 struct list_head orphan_list; 1086 struct list_head orphan_list;
1087 struct btrfs_block_rsv *orphan_block_rsv;
1088 int orphan_item_inserted;
1089 int orphan_cleanup_state;
1062 1090
1063 spinlock_t inode_lock; 1091 spinlock_t inode_lock;
1064 /* red-black tree that keeps track of in-memory inodes */ 1092 /* red-black tree that keeps track of in-memory inodes */
@@ -1965,6 +1993,9 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1965int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 1993int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1966 struct btrfs_root *root, unsigned long count); 1994 struct btrfs_root *root, unsigned long count);
1967int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); 1995int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1996int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
1997 struct btrfs_root *root, u64 bytenr,
1998 u64 num_bytes, u64 *refs, u64 *flags);
1968int btrfs_pin_extent(struct btrfs_root *root, 1999int btrfs_pin_extent(struct btrfs_root *root,
1969 u64 bytenr, u64 num, int reserved); 2000 u64 bytenr, u64 num, int reserved);
1970int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 2001int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1984,10 +2015,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1984 u64 parent, u64 root_objectid, 2015 u64 parent, u64 root_objectid,
1985 struct btrfs_disk_key *key, int level, 2016 struct btrfs_disk_key *key, int level,
1986 u64 hint, u64 empty_size); 2017 u64 hint, u64 empty_size);
1987int btrfs_free_tree_block(struct btrfs_trans_handle *trans, 2018void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
1988 struct btrfs_root *root, 2019 struct btrfs_root *root,
1989 u64 bytenr, u32 blocksize, 2020 struct extent_buffer *buf,
1990 u64 parent, u64 root_objectid, int level); 2021 u64 parent, int last_ref);
1991struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 2022struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1992 struct btrfs_root *root, 2023 struct btrfs_root *root,
1993 u64 bytenr, u32 blocksize, 2024 u64 bytenr, u32 blocksize,
@@ -2041,27 +2072,49 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
2041 u64 size); 2072 u64 size);
2042int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 2073int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2043 struct btrfs_root *root, u64 group_start); 2074 struct btrfs_root *root, u64 group_start);
2044int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
2045 struct btrfs_block_group_cache *group);
2046
2047u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 2075u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2048void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); 2076void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
2049void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2077void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2050 2078int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2051int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); 2079void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
2052int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); 2080int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
2053int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, 2081 struct btrfs_root *root,
2054 struct inode *inode, int num_items); 2082 int num_items, int *retries);
2055int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, 2083void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
2056 struct inode *inode, int num_items); 2084 struct btrfs_root *root);
2057int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2085int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
2058 u64 bytes); 2086 struct inode *inode);
2059void btrfs_free_reserved_data_space(struct btrfs_root *root, 2087void btrfs_orphan_release_metadata(struct inode *inode);
2060 struct inode *inode, u64 bytes); 2088int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
2061void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, 2089 struct btrfs_pending_snapshot *pending);
2062 u64 bytes); 2090int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
2063void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, 2091void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
2064 u64 bytes); 2092int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
2093void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
2094void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
2095struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
2096void btrfs_free_block_rsv(struct btrfs_root *root,
2097 struct btrfs_block_rsv *rsv);
2098void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
2099 struct btrfs_block_rsv *rsv);
2100int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2101 struct btrfs_root *root,
2102 struct btrfs_block_rsv *block_rsv,
2103 u64 num_bytes, int *retries);
2104int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
2105 struct btrfs_root *root,
2106 struct btrfs_block_rsv *block_rsv,
2107 u64 min_reserved, int min_factor);
2108int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2109 struct btrfs_block_rsv *dst_rsv,
2110 u64 num_bytes);
2111void btrfs_block_rsv_release(struct btrfs_root *root,
2112 struct btrfs_block_rsv *block_rsv,
2113 u64 num_bytes);
2114int btrfs_set_block_group_ro(struct btrfs_root *root,
2115 struct btrfs_block_group_cache *cache);
2116int btrfs_set_block_group_rw(struct btrfs_root *root,
2117 struct btrfs_block_group_cache *cache);
2065/* ctree.c */ 2118/* ctree.c */
2066int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2119int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2067 int level, int *slot); 2120 int level, int *slot);
@@ -2152,7 +2205,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
2152int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 2205int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
2153int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); 2206int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
2154int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 2207int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
2155int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref); 2208int btrfs_drop_snapshot(struct btrfs_root *root,
2209 struct btrfs_block_rsv *block_rsv, int update_ref);
2156int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 2210int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2157 struct btrfs_root *root, 2211 struct btrfs_root *root,
2158 struct extent_buffer *node, 2212 struct extent_buffer *node,
@@ -2245,6 +2299,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
2245 struct btrfs_root *root, 2299 struct btrfs_root *root,
2246 const char *name, int name_len, 2300 const char *name, int name_len,
2247 u64 inode_objectid, u64 ref_objectid, u64 *index); 2301 u64 inode_objectid, u64 ref_objectid, u64 *index);
2302struct btrfs_inode_ref *
2303btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
2304 struct btrfs_root *root,
2305 struct btrfs_path *path,
2306 const char *name, int name_len,
2307 u64 inode_objectid, u64 ref_objectid, int mod);
2248int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, 2308int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
2249 struct btrfs_root *root, 2309 struct btrfs_root *root,
2250 struct btrfs_path *path, u64 objectid); 2310 struct btrfs_path *path, u64 objectid);
@@ -2257,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
2257 struct btrfs_root *root, u64 bytenr, u64 len); 2317 struct btrfs_root *root, u64 bytenr, u64 len);
2258int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 2318int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
2259 struct bio *bio, u32 *dst); 2319 struct bio *bio, u32 *dst);
2320int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
2321 struct bio *bio, u64 logical_offset, u32 *dst);
2260int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 2322int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
2261 struct btrfs_root *root, 2323 struct btrfs_root *root,
2262 u64 objectid, u64 pos, 2324 u64 objectid, u64 pos,
@@ -2311,6 +2373,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2311 u32 min_type); 2373 u32 min_type);
2312 2374
2313int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 2375int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
2376int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
2314int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 2377int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2315 struct extent_state **cached_state); 2378 struct extent_state **cached_state);
2316int btrfs_writepages(struct address_space *mapping, 2379int btrfs_writepages(struct address_space *mapping,
@@ -2349,10 +2412,20 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
2349int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2412int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2350int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2413int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2351void btrfs_orphan_cleanup(struct btrfs_root *root); 2414void btrfs_orphan_cleanup(struct btrfs_root *root);
2415void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2416 struct btrfs_pending_snapshot *pending,
2417 u64 *bytes_to_reserve);
2418void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2419 struct btrfs_pending_snapshot *pending);
2420void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2421 struct btrfs_root *root);
2352int btrfs_cont_expand(struct inode *inode, loff_t size); 2422int btrfs_cont_expand(struct inode *inode, loff_t size);
2353int btrfs_invalidate_inodes(struct btrfs_root *root); 2423int btrfs_invalidate_inodes(struct btrfs_root *root);
2354void btrfs_add_delayed_iput(struct inode *inode); 2424void btrfs_add_delayed_iput(struct inode *inode);
2355void btrfs_run_delayed_iputs(struct btrfs_root *root); 2425void btrfs_run_delayed_iputs(struct btrfs_root *root);
2426int btrfs_prealloc_file_range(struct inode *inode, int mode,
2427 u64 start, u64 num_bytes, u64 min_size,
2428 loff_t actual_len, u64 *alloc_hint);
2356extern const struct dentry_operations btrfs_dentry_operations; 2429extern const struct dentry_operations btrfs_dentry_operations;
2357 2430
2358/* ioctl.c */ 2431/* ioctl.c */
@@ -2361,7 +2434,7 @@ void btrfs_update_iflags(struct inode *inode);
2361void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); 2434void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
2362 2435
2363/* file.c */ 2436/* file.c */
2364int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); 2437int btrfs_sync_file(struct file *file, int datasync);
2365int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 2438int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
2366 int skip_pinned); 2439 int skip_pinned);
2367int btrfs_check_file(struct btrfs_root *root, struct inode *inode); 2440int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
@@ -2409,4 +2482,12 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
2409 struct btrfs_root *root); 2482 struct btrfs_root *root);
2410int btrfs_recover_relocation(struct btrfs_root *root); 2483int btrfs_recover_relocation(struct btrfs_root *root);
2411int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); 2484int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
2485void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
2486 struct btrfs_root *root, struct extent_buffer *buf,
2487 struct extent_buffer *cow);
2488void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
2489 struct btrfs_pending_snapshot *pending,
2490 u64 *bytes_to_reserve);
2491void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
2492 struct btrfs_pending_snapshot *pending);
2412#endif 2493#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 902ce507c4e3..e807b143b857 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -319,107 +319,6 @@ out:
319} 319}
320 320
321/* 321/*
322 * helper function to lookup reference count and flags of extent.
323 *
324 * the head node for delayed ref is used to store the sum of all the
325 * reference count modifications queued up in the rbtree. the head
326 * node may also store the extent flags to set. This way you can check
327 * to see what the reference count and extent flags would be if all of
328 * the delayed refs are not processed.
329 */
330int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
331 struct btrfs_root *root, u64 bytenr,
332 u64 num_bytes, u64 *refs, u64 *flags)
333{
334 struct btrfs_delayed_ref_node *ref;
335 struct btrfs_delayed_ref_head *head;
336 struct btrfs_delayed_ref_root *delayed_refs;
337 struct btrfs_path *path;
338 struct btrfs_extent_item *ei;
339 struct extent_buffer *leaf;
340 struct btrfs_key key;
341 u32 item_size;
342 u64 num_refs;
343 u64 extent_flags;
344 int ret;
345
346 path = btrfs_alloc_path();
347 if (!path)
348 return -ENOMEM;
349
350 key.objectid = bytenr;
351 key.type = BTRFS_EXTENT_ITEM_KEY;
352 key.offset = num_bytes;
353 delayed_refs = &trans->transaction->delayed_refs;
354again:
355 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
356 &key, path, 0, 0);
357 if (ret < 0)
358 goto out;
359
360 if (ret == 0) {
361 leaf = path->nodes[0];
362 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
363 if (item_size >= sizeof(*ei)) {
364 ei = btrfs_item_ptr(leaf, path->slots[0],
365 struct btrfs_extent_item);
366 num_refs = btrfs_extent_refs(leaf, ei);
367 extent_flags = btrfs_extent_flags(leaf, ei);
368 } else {
369#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
370 struct btrfs_extent_item_v0 *ei0;
371 BUG_ON(item_size != sizeof(*ei0));
372 ei0 = btrfs_item_ptr(leaf, path->slots[0],
373 struct btrfs_extent_item_v0);
374 num_refs = btrfs_extent_refs_v0(leaf, ei0);
375 /* FIXME: this isn't correct for data */
376 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
377#else
378 BUG();
379#endif
380 }
381 BUG_ON(num_refs == 0);
382 } else {
383 num_refs = 0;
384 extent_flags = 0;
385 ret = 0;
386 }
387
388 spin_lock(&delayed_refs->lock);
389 ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
390 if (ref) {
391 head = btrfs_delayed_node_to_head(ref);
392 if (!mutex_trylock(&head->mutex)) {
393 atomic_inc(&ref->refs);
394 spin_unlock(&delayed_refs->lock);
395
396 btrfs_release_path(root->fs_info->extent_root, path);
397
398 mutex_lock(&head->mutex);
399 mutex_unlock(&head->mutex);
400 btrfs_put_delayed_ref(ref);
401 goto again;
402 }
403 if (head->extent_op && head->extent_op->update_flags)
404 extent_flags |= head->extent_op->flags_to_set;
405 else
406 BUG_ON(num_refs == 0);
407
408 num_refs += ref->ref_mod;
409 mutex_unlock(&head->mutex);
410 }
411 WARN_ON(num_refs == 0);
412 if (refs)
413 *refs = num_refs;
414 if (flags)
415 *flags = extent_flags;
416out:
417 spin_unlock(&delayed_refs->lock);
418 btrfs_free_path(path);
419 return ret;
420}
421
422/*
423 * helper function to update an extent delayed ref in the 322 * helper function to update an extent delayed ref in the
424 * rbtree. existing and update must both have the same 323 * rbtree. existing and update must both have the same
425 * bytenr and parent 324 * bytenr and parent
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f6fc67ddad36..50e3cf92fbda 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -167,9 +167,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
167struct btrfs_delayed_ref_head * 167struct btrfs_delayed_ref_head *
168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); 168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
169int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); 169int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
170int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
171 struct btrfs_root *root, u64 bytenr,
172 u64 num_bytes, u64 *refs, u64 *flags);
173int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, 170int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
174 u64 bytenr, u64 num_bytes, u64 orig_parent, 171 u64 bytenr, u64 num_bytes, u64 orig_parent,
175 u64 parent, u64 orig_ref_root, u64 ref_root, 172 u64 parent, u64 orig_ref_root, u64 ref_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index feca04197d02..f3b287c22caf 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -74,6 +74,11 @@ struct async_submit_bio {
74 int rw; 74 int rw;
75 int mirror_num; 75 int mirror_num;
76 unsigned long bio_flags; 76 unsigned long bio_flags;
77 /*
78 * bio_offset is optional, can be used if the pages in the bio
79 * can't tell us where in the file the bio should go
80 */
81 u64 bio_offset;
77 struct btrfs_work work; 82 struct btrfs_work work;
78}; 83};
79 84
@@ -534,7 +539,8 @@ static void run_one_async_start(struct btrfs_work *work)
534 async = container_of(work, struct async_submit_bio, work); 539 async = container_of(work, struct async_submit_bio, work);
535 fs_info = BTRFS_I(async->inode)->root->fs_info; 540 fs_info = BTRFS_I(async->inode)->root->fs_info;
536 async->submit_bio_start(async->inode, async->rw, async->bio, 541 async->submit_bio_start(async->inode, async->rw, async->bio,
537 async->mirror_num, async->bio_flags); 542 async->mirror_num, async->bio_flags,
543 async->bio_offset);
538} 544}
539 545
540static void run_one_async_done(struct btrfs_work *work) 546static void run_one_async_done(struct btrfs_work *work)
@@ -556,7 +562,8 @@ static void run_one_async_done(struct btrfs_work *work)
556 wake_up(&fs_info->async_submit_wait); 562 wake_up(&fs_info->async_submit_wait);
557 563
558 async->submit_bio_done(async->inode, async->rw, async->bio, 564 async->submit_bio_done(async->inode, async->rw, async->bio,
559 async->mirror_num, async->bio_flags); 565 async->mirror_num, async->bio_flags,
566 async->bio_offset);
560} 567}
561 568
562static void run_one_async_free(struct btrfs_work *work) 569static void run_one_async_free(struct btrfs_work *work)
@@ -570,6 +577,7 @@ static void run_one_async_free(struct btrfs_work *work)
570int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, 577int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
571 int rw, struct bio *bio, int mirror_num, 578 int rw, struct bio *bio, int mirror_num,
572 unsigned long bio_flags, 579 unsigned long bio_flags,
580 u64 bio_offset,
573 extent_submit_bio_hook_t *submit_bio_start, 581 extent_submit_bio_hook_t *submit_bio_start,
574 extent_submit_bio_hook_t *submit_bio_done) 582 extent_submit_bio_hook_t *submit_bio_done)
575{ 583{
@@ -592,6 +600,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
592 600
593 async->work.flags = 0; 601 async->work.flags = 0;
594 async->bio_flags = bio_flags; 602 async->bio_flags = bio_flags;
603 async->bio_offset = bio_offset;
595 604
596 atomic_inc(&fs_info->nr_async_submits); 605 atomic_inc(&fs_info->nr_async_submits);
597 606
@@ -627,7 +636,8 @@ static int btree_csum_one_bio(struct bio *bio)
627 636
628static int __btree_submit_bio_start(struct inode *inode, int rw, 637static int __btree_submit_bio_start(struct inode *inode, int rw,
629 struct bio *bio, int mirror_num, 638 struct bio *bio, int mirror_num,
630 unsigned long bio_flags) 639 unsigned long bio_flags,
640 u64 bio_offset)
631{ 641{
632 /* 642 /*
633 * when we're called for a write, we're already in the async 643 * when we're called for a write, we're already in the async
@@ -638,7 +648,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
638} 648}
639 649
640static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, 650static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
641 int mirror_num, unsigned long bio_flags) 651 int mirror_num, unsigned long bio_flags,
652 u64 bio_offset)
642{ 653{
643 /* 654 /*
644 * when we're called for a write, we're already in the async 655 * when we're called for a write, we're already in the async
@@ -648,7 +659,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
648} 659}
649 660
650static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 661static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
651 int mirror_num, unsigned long bio_flags) 662 int mirror_num, unsigned long bio_flags,
663 u64 bio_offset)
652{ 664{
653 int ret; 665 int ret;
654 666
@@ -671,6 +683,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
671 */ 683 */
672 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 684 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
673 inode, rw, bio, mirror_num, 0, 685 inode, rw, bio, mirror_num, 0,
686 bio_offset,
674 __btree_submit_bio_start, 687 __btree_submit_bio_start,
675 __btree_submit_bio_done); 688 __btree_submit_bio_done);
676} 689}
@@ -894,7 +907,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
894 root->ref_cows = 0; 907 root->ref_cows = 0;
895 root->track_dirty = 0; 908 root->track_dirty = 0;
896 root->in_radix = 0; 909 root->in_radix = 0;
897 root->clean_orphans = 0; 910 root->orphan_item_inserted = 0;
911 root->orphan_cleanup_state = 0;
898 912
899 root->fs_info = fs_info; 913 root->fs_info = fs_info;
900 root->objectid = objectid; 914 root->objectid = objectid;
@@ -903,13 +917,16 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
903 root->name = NULL; 917 root->name = NULL;
904 root->in_sysfs = 0; 918 root->in_sysfs = 0;
905 root->inode_tree = RB_ROOT; 919 root->inode_tree = RB_ROOT;
920 root->block_rsv = NULL;
921 root->orphan_block_rsv = NULL;
906 922
907 INIT_LIST_HEAD(&root->dirty_list); 923 INIT_LIST_HEAD(&root->dirty_list);
908 INIT_LIST_HEAD(&root->orphan_list); 924 INIT_LIST_HEAD(&root->orphan_list);
909 INIT_LIST_HEAD(&root->root_list); 925 INIT_LIST_HEAD(&root->root_list);
910 spin_lock_init(&root->node_lock); 926 spin_lock_init(&root->node_lock);
911 spin_lock_init(&root->list_lock); 927 spin_lock_init(&root->orphan_lock);
912 spin_lock_init(&root->inode_lock); 928 spin_lock_init(&root->inode_lock);
929 spin_lock_init(&root->accounting_lock);
913 mutex_init(&root->objectid_mutex); 930 mutex_init(&root->objectid_mutex);
914 mutex_init(&root->log_mutex); 931 mutex_init(&root->log_mutex);
915 init_waitqueue_head(&root->log_writer_wait); 932 init_waitqueue_head(&root->log_writer_wait);
@@ -968,42 +985,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
968 return 0; 985 return 0;
969} 986}
970 987
971int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
972 struct btrfs_fs_info *fs_info)
973{
974 struct extent_buffer *eb;
975 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
976 u64 start = 0;
977 u64 end = 0;
978 int ret;
979
980 if (!log_root_tree)
981 return 0;
982
983 while (1) {
984 ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
985 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
986 if (ret)
987 break;
988
989 clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
990 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
991 }
992 eb = fs_info->log_root_tree->node;
993
994 WARN_ON(btrfs_header_level(eb) != 0);
995 WARN_ON(btrfs_header_nritems(eb) != 0);
996
997 ret = btrfs_free_reserved_extent(fs_info->tree_root,
998 eb->start, eb->len);
999 BUG_ON(ret);
1000
1001 free_extent_buffer(eb);
1002 kfree(fs_info->log_root_tree);
1003 fs_info->log_root_tree = NULL;
1004 return 0;
1005}
1006
1007static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, 988static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1008 struct btrfs_fs_info *fs_info) 989 struct btrfs_fs_info *fs_info)
1009{ 990{
@@ -1191,19 +1172,23 @@ again:
1191 if (root) 1172 if (root)
1192 return root; 1173 return root;
1193 1174
1194 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1195 if (ret == 0)
1196 ret = -ENOENT;
1197 if (ret < 0)
1198 return ERR_PTR(ret);
1199
1200 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); 1175 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
1201 if (IS_ERR(root)) 1176 if (IS_ERR(root))
1202 return root; 1177 return root;
1203 1178
1204 WARN_ON(btrfs_root_refs(&root->root_item) == 0);
1205 set_anon_super(&root->anon_super, NULL); 1179 set_anon_super(&root->anon_super, NULL);
1206 1180
1181 if (btrfs_root_refs(&root->root_item) == 0) {
1182 ret = -ENOENT;
1183 goto fail;
1184 }
1185
1186 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1187 if (ret < 0)
1188 goto fail;
1189 if (ret == 0)
1190 root->orphan_item_inserted = 1;
1191
1207 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 1192 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
1208 if (ret) 1193 if (ret)
1209 goto fail; 1194 goto fail;
@@ -1212,10 +1197,9 @@ again:
1212 ret = radix_tree_insert(&fs_info->fs_roots_radix, 1197 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1213 (unsigned long)root->root_key.objectid, 1198 (unsigned long)root->root_key.objectid,
1214 root); 1199 root);
1215 if (ret == 0) { 1200 if (ret == 0)
1216 root->in_radix = 1; 1201 root->in_radix = 1;
1217 root->clean_orphans = 1; 1202
1218 }
1219 spin_unlock(&fs_info->fs_roots_radix_lock); 1203 spin_unlock(&fs_info->fs_roots_radix_lock);
1220 radix_tree_preload_end(); 1204 radix_tree_preload_end();
1221 if (ret) { 1205 if (ret) {
@@ -1461,10 +1445,6 @@ static int cleaner_kthread(void *arg)
1461 struct btrfs_root *root = arg; 1445 struct btrfs_root *root = arg;
1462 1446
1463 do { 1447 do {
1464 smp_mb();
1465 if (root->fs_info->closing)
1466 break;
1467
1468 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1448 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1469 1449
1470 if (!(root->fs_info->sb->s_flags & MS_RDONLY) && 1450 if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
@@ -1477,11 +1457,9 @@ static int cleaner_kthread(void *arg)
1477 if (freezing(current)) { 1457 if (freezing(current)) {
1478 refrigerator(); 1458 refrigerator();
1479 } else { 1459 } else {
1480 smp_mb();
1481 if (root->fs_info->closing)
1482 break;
1483 set_current_state(TASK_INTERRUPTIBLE); 1460 set_current_state(TASK_INTERRUPTIBLE);
1484 schedule(); 1461 if (!kthread_should_stop())
1462 schedule();
1485 __set_current_state(TASK_RUNNING); 1463 __set_current_state(TASK_RUNNING);
1486 } 1464 }
1487 } while (!kthread_should_stop()); 1465 } while (!kthread_should_stop());
@@ -1493,36 +1471,40 @@ static int transaction_kthread(void *arg)
1493 struct btrfs_root *root = arg; 1471 struct btrfs_root *root = arg;
1494 struct btrfs_trans_handle *trans; 1472 struct btrfs_trans_handle *trans;
1495 struct btrfs_transaction *cur; 1473 struct btrfs_transaction *cur;
1474 u64 transid;
1496 unsigned long now; 1475 unsigned long now;
1497 unsigned long delay; 1476 unsigned long delay;
1498 int ret; 1477 int ret;
1499 1478
1500 do { 1479 do {
1501 smp_mb();
1502 if (root->fs_info->closing)
1503 break;
1504
1505 delay = HZ * 30; 1480 delay = HZ * 30;
1506 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1481 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1507 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1482 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1508 1483
1509 mutex_lock(&root->fs_info->trans_mutex); 1484 spin_lock(&root->fs_info->new_trans_lock);
1510 cur = root->fs_info->running_transaction; 1485 cur = root->fs_info->running_transaction;
1511 if (!cur) { 1486 if (!cur) {
1512 mutex_unlock(&root->fs_info->trans_mutex); 1487 spin_unlock(&root->fs_info->new_trans_lock);
1513 goto sleep; 1488 goto sleep;
1514 } 1489 }
1515 1490
1516 now = get_seconds(); 1491 now = get_seconds();
1517 if (now < cur->start_time || now - cur->start_time < 30) { 1492 if (!cur->blocked &&
1518 mutex_unlock(&root->fs_info->trans_mutex); 1493 (now < cur->start_time || now - cur->start_time < 30)) {
1494 spin_unlock(&root->fs_info->new_trans_lock);
1519 delay = HZ * 5; 1495 delay = HZ * 5;
1520 goto sleep; 1496 goto sleep;
1521 } 1497 }
1522 mutex_unlock(&root->fs_info->trans_mutex); 1498 transid = cur->transid;
1523 trans = btrfs_start_transaction(root, 1); 1499 spin_unlock(&root->fs_info->new_trans_lock);
1524 ret = btrfs_commit_transaction(trans, root);
1525 1500
1501 trans = btrfs_join_transaction(root, 1);
1502 if (transid == trans->transid) {
1503 ret = btrfs_commit_transaction(trans, root);
1504 BUG_ON(ret);
1505 } else {
1506 btrfs_end_transaction(trans, root);
1507 }
1526sleep: 1508sleep:
1527 wake_up_process(root->fs_info->cleaner_kthread); 1509 wake_up_process(root->fs_info->cleaner_kthread);
1528 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 1510 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1530,10 +1512,10 @@ sleep:
1530 if (freezing(current)) { 1512 if (freezing(current)) {
1531 refrigerator(); 1513 refrigerator();
1532 } else { 1514 } else {
1533 if (root->fs_info->closing)
1534 break;
1535 set_current_state(TASK_INTERRUPTIBLE); 1515 set_current_state(TASK_INTERRUPTIBLE);
1536 schedule_timeout(delay); 1516 if (!kthread_should_stop() &&
1517 !btrfs_transaction_blocked(root->fs_info))
1518 schedule_timeout(delay);
1537 __set_current_state(TASK_RUNNING); 1519 __set_current_state(TASK_RUNNING);
1538 } 1520 }
1539 } while (!kthread_should_stop()); 1521 } while (!kthread_should_stop());
@@ -1620,6 +1602,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1620 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 1602 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1621 INIT_LIST_HEAD(&fs_info->space_info); 1603 INIT_LIST_HEAD(&fs_info->space_info);
1622 btrfs_mapping_init(&fs_info->mapping_tree); 1604 btrfs_mapping_init(&fs_info->mapping_tree);
1605 btrfs_init_block_rsv(&fs_info->global_block_rsv);
1606 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
1607 btrfs_init_block_rsv(&fs_info->trans_block_rsv);
1608 btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
1609 btrfs_init_block_rsv(&fs_info->empty_block_rsv);
1610 INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
1611 mutex_init(&fs_info->durable_block_rsv_mutex);
1623 atomic_set(&fs_info->nr_async_submits, 0); 1612 atomic_set(&fs_info->nr_async_submits, 0);
1624 atomic_set(&fs_info->async_delalloc_pages, 0); 1613 atomic_set(&fs_info->async_delalloc_pages, 0);
1625 atomic_set(&fs_info->async_submit_draining, 0); 1614 atomic_set(&fs_info->async_submit_draining, 0);
@@ -1759,9 +1748,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1759 min_t(u64, fs_devices->num_devices, 1748 min_t(u64, fs_devices->num_devices,
1760 fs_info->thread_pool_size), 1749 fs_info->thread_pool_size),
1761 &fs_info->generic_worker); 1750 &fs_info->generic_worker);
1762 btrfs_init_workers(&fs_info->enospc_workers, "enospc",
1763 fs_info->thread_pool_size,
1764 &fs_info->generic_worker);
1765 1751
1766 /* a higher idle thresh on the submit workers makes it much more 1752 /* a higher idle thresh on the submit workers makes it much more
1767 * likely that bios will be send down in a sane order to the 1753 * likely that bios will be send down in a sane order to the
@@ -1809,7 +1795,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1809 btrfs_start_workers(&fs_info->endio_meta_workers, 1); 1795 btrfs_start_workers(&fs_info->endio_meta_workers, 1);
1810 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); 1796 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
1811 btrfs_start_workers(&fs_info->endio_write_workers, 1); 1797 btrfs_start_workers(&fs_info->endio_write_workers, 1);
1812 btrfs_start_workers(&fs_info->enospc_workers, 1);
1813 1798
1814 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 1799 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1815 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 1800 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1912,17 +1897,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1912 1897
1913 csum_root->track_dirty = 1; 1898 csum_root->track_dirty = 1;
1914 1899
1900 fs_info->generation = generation;
1901 fs_info->last_trans_committed = generation;
1902 fs_info->data_alloc_profile = (u64)-1;
1903 fs_info->metadata_alloc_profile = (u64)-1;
1904 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1905
1915 ret = btrfs_read_block_groups(extent_root); 1906 ret = btrfs_read_block_groups(extent_root);
1916 if (ret) { 1907 if (ret) {
1917 printk(KERN_ERR "Failed to read block groups: %d\n", ret); 1908 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
1918 goto fail_block_groups; 1909 goto fail_block_groups;
1919 } 1910 }
1920 1911
1921 fs_info->generation = generation;
1922 fs_info->last_trans_committed = generation;
1923 fs_info->data_alloc_profile = (u64)-1;
1924 fs_info->metadata_alloc_profile = (u64)-1;
1925 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1926 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 1912 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1927 "btrfs-cleaner"); 1913 "btrfs-cleaner");
1928 if (IS_ERR(fs_info->cleaner_kthread)) 1914 if (IS_ERR(fs_info->cleaner_kthread))
@@ -1977,6 +1963,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1977 BUG_ON(ret); 1963 BUG_ON(ret);
1978 1964
1979 if (!(sb->s_flags & MS_RDONLY)) { 1965 if (!(sb->s_flags & MS_RDONLY)) {
1966 ret = btrfs_cleanup_fs_roots(fs_info);
1967 BUG_ON(ret);
1968
1980 ret = btrfs_recover_relocation(tree_root); 1969 ret = btrfs_recover_relocation(tree_root);
1981 if (ret < 0) { 1970 if (ret < 0) {
1982 printk(KERN_WARNING 1971 printk(KERN_WARNING
@@ -2040,7 +2029,6 @@ fail_sb_buffer:
2040 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2029 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2041 btrfs_stop_workers(&fs_info->endio_write_workers); 2030 btrfs_stop_workers(&fs_info->endio_write_workers);
2042 btrfs_stop_workers(&fs_info->submit_workers); 2031 btrfs_stop_workers(&fs_info->submit_workers);
2043 btrfs_stop_workers(&fs_info->enospc_workers);
2044fail_iput: 2032fail_iput:
2045 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2033 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2046 iput(fs_info->btree_inode); 2034 iput(fs_info->btree_inode);
@@ -2405,11 +2393,11 @@ int btrfs_commit_super(struct btrfs_root *root)
2405 down_write(&root->fs_info->cleanup_work_sem); 2393 down_write(&root->fs_info->cleanup_work_sem);
2406 up_write(&root->fs_info->cleanup_work_sem); 2394 up_write(&root->fs_info->cleanup_work_sem);
2407 2395
2408 trans = btrfs_start_transaction(root, 1); 2396 trans = btrfs_join_transaction(root, 1);
2409 ret = btrfs_commit_transaction(trans, root); 2397 ret = btrfs_commit_transaction(trans, root);
2410 BUG_ON(ret); 2398 BUG_ON(ret);
2411 /* run commit again to drop the original snapshot */ 2399 /* run commit again to drop the original snapshot */
2412 trans = btrfs_start_transaction(root, 1); 2400 trans = btrfs_join_transaction(root, 1);
2413 btrfs_commit_transaction(trans, root); 2401 btrfs_commit_transaction(trans, root);
2414 ret = btrfs_write_and_wait_transaction(NULL, root); 2402 ret = btrfs_write_and_wait_transaction(NULL, root);
2415 BUG_ON(ret); 2403 BUG_ON(ret);
@@ -2426,15 +2414,15 @@ int close_ctree(struct btrfs_root *root)
2426 fs_info->closing = 1; 2414 fs_info->closing = 1;
2427 smp_mb(); 2415 smp_mb();
2428 2416
2429 kthread_stop(root->fs_info->transaction_kthread);
2430 kthread_stop(root->fs_info->cleaner_kthread);
2431
2432 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 2417 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
2433 ret = btrfs_commit_super(root); 2418 ret = btrfs_commit_super(root);
2434 if (ret) 2419 if (ret)
2435 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 2420 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2436 } 2421 }
2437 2422
2423 kthread_stop(root->fs_info->transaction_kthread);
2424 kthread_stop(root->fs_info->cleaner_kthread);
2425
2438 fs_info->closing = 2; 2426 fs_info->closing = 2;
2439 smp_mb(); 2427 smp_mb();
2440 2428
@@ -2473,7 +2461,6 @@ int close_ctree(struct btrfs_root *root)
2473 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2461 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2474 btrfs_stop_workers(&fs_info->endio_write_workers); 2462 btrfs_stop_workers(&fs_info->endio_write_workers);
2475 btrfs_stop_workers(&fs_info->submit_workers); 2463 btrfs_stop_workers(&fs_info->submit_workers);
2476 btrfs_stop_workers(&fs_info->enospc_workers);
2477 2464
2478 btrfs_close_devices(fs_info->fs_devices); 2465 btrfs_close_devices(fs_info->fs_devices);
2479 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2466 btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c958ecbc1916..88e825a0bf21 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,7 +87,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
87 int metadata); 87 int metadata);
88int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, 88int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
89 int rw, struct bio *bio, int mirror_num, 89 int rw, struct bio *bio, int mirror_num,
90 unsigned long bio_flags, 90 unsigned long bio_flags, u64 bio_offset,
91 extent_submit_bio_hook_t *submit_bio_start, 91 extent_submit_bio_hook_t *submit_bio_start,
92 extent_submit_bio_hook_t *submit_bio_done); 92 extent_submit_bio_hook_t *submit_bio_done);
93 93
@@ -95,8 +95,6 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
95unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); 95unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
96int btrfs_write_tree_block(struct extent_buffer *buf); 96int btrfs_write_tree_block(struct extent_buffer *buf);
97int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); 97int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
98int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
99 struct btrfs_fs_info *fs_info);
100int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, 98int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
101 struct btrfs_fs_info *fs_info); 99 struct btrfs_fs_info *fs_info);
102int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 100int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c6a4f459ad76..b9080d71991a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -35,10 +35,9 @@
35 35
36static int update_block_group(struct btrfs_trans_handle *trans, 36static int update_block_group(struct btrfs_trans_handle *trans,
37 struct btrfs_root *root, 37 struct btrfs_root *root,
38 u64 bytenr, u64 num_bytes, int alloc, 38 u64 bytenr, u64 num_bytes, int alloc);
39 int mark_free); 39static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
40static int update_reserved_extents(struct btrfs_block_group_cache *cache, 40 u64 num_bytes, int reserve, int sinfo);
41 u64 num_bytes, int reserve);
42static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 41static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
43 struct btrfs_root *root, 42 struct btrfs_root *root,
44 u64 bytenr, u64 num_bytes, u64 parent, 43 u64 bytenr, u64 num_bytes, u64 parent,
@@ -61,12 +60,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
61static int do_chunk_alloc(struct btrfs_trans_handle *trans, 60static int do_chunk_alloc(struct btrfs_trans_handle *trans,
62 struct btrfs_root *extent_root, u64 alloc_bytes, 61 struct btrfs_root *extent_root, u64 alloc_bytes,
63 u64 flags, int force); 62 u64 flags, int force);
64static int pin_down_bytes(struct btrfs_trans_handle *trans,
65 struct btrfs_root *root,
66 struct btrfs_path *path,
67 u64 bytenr, u64 num_bytes,
68 int is_data, int reserved,
69 struct extent_buffer **must_clean);
70static int find_next_key(struct btrfs_path *path, int level, 63static int find_next_key(struct btrfs_path *path, int level,
71 struct btrfs_key *key); 64 struct btrfs_key *key);
72static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 65static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -91,8 +84,12 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
91 84
92void btrfs_put_block_group(struct btrfs_block_group_cache *cache) 85void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
93{ 86{
94 if (atomic_dec_and_test(&cache->count)) 87 if (atomic_dec_and_test(&cache->count)) {
88 WARN_ON(cache->pinned > 0);
89 WARN_ON(cache->reserved > 0);
90 WARN_ON(cache->reserved_pinned > 0);
95 kfree(cache); 91 kfree(cache);
92 }
96} 93}
97 94
98/* 95/*
@@ -319,7 +316,7 @@ static int caching_kthread(void *data)
319 316
320 exclude_super_stripes(extent_root, block_group); 317 exclude_super_stripes(extent_root, block_group);
321 spin_lock(&block_group->space_info->lock); 318 spin_lock(&block_group->space_info->lock);
322 block_group->space_info->bytes_super += block_group->bytes_super; 319 block_group->space_info->bytes_readonly += block_group->bytes_super;
323 spin_unlock(&block_group->space_info->lock); 320 spin_unlock(&block_group->space_info->lock);
324 321
325 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 322 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -507,6 +504,9 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
507 struct list_head *head = &info->space_info; 504 struct list_head *head = &info->space_info;
508 struct btrfs_space_info *found; 505 struct btrfs_space_info *found;
509 506
507 flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
508 BTRFS_BLOCK_GROUP_METADATA;
509
510 rcu_read_lock(); 510 rcu_read_lock();
511 list_for_each_entry_rcu(found, head, list) { 511 list_for_each_entry_rcu(found, head, list) {
512 if (found->flags == flags) { 512 if (found->flags == flags) {
@@ -610,6 +610,113 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
610} 610}
611 611
612/* 612/*
613 * helper function to lookup reference count and flags of extent.
614 *
615 * the head node for delayed ref is used to store the sum of all the
616 * reference count modifications queued up in the rbtree. the head
617 * node may also store the extent flags to set. This way you can check
618 * to see what the reference count and extent flags would be if all of
619 * the delayed refs are not processed.
620 */
621int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
622 struct btrfs_root *root, u64 bytenr,
623 u64 num_bytes, u64 *refs, u64 *flags)
624{
625 struct btrfs_delayed_ref_head *head;
626 struct btrfs_delayed_ref_root *delayed_refs;
627 struct btrfs_path *path;
628 struct btrfs_extent_item *ei;
629 struct extent_buffer *leaf;
630 struct btrfs_key key;
631 u32 item_size;
632 u64 num_refs;
633 u64 extent_flags;
634 int ret;
635
636 path = btrfs_alloc_path();
637 if (!path)
638 return -ENOMEM;
639
640 key.objectid = bytenr;
641 key.type = BTRFS_EXTENT_ITEM_KEY;
642 key.offset = num_bytes;
643 if (!trans) {
644 path->skip_locking = 1;
645 path->search_commit_root = 1;
646 }
647again:
648 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
649 &key, path, 0, 0);
650 if (ret < 0)
651 goto out_free;
652
653 if (ret == 0) {
654 leaf = path->nodes[0];
655 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
656 if (item_size >= sizeof(*ei)) {
657 ei = btrfs_item_ptr(leaf, path->slots[0],
658 struct btrfs_extent_item);
659 num_refs = btrfs_extent_refs(leaf, ei);
660 extent_flags = btrfs_extent_flags(leaf, ei);
661 } else {
662#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
663 struct btrfs_extent_item_v0 *ei0;
664 BUG_ON(item_size != sizeof(*ei0));
665 ei0 = btrfs_item_ptr(leaf, path->slots[0],
666 struct btrfs_extent_item_v0);
667 num_refs = btrfs_extent_refs_v0(leaf, ei0);
668 /* FIXME: this isn't correct for data */
669 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
670#else
671 BUG();
672#endif
673 }
674 BUG_ON(num_refs == 0);
675 } else {
676 num_refs = 0;
677 extent_flags = 0;
678 ret = 0;
679 }
680
681 if (!trans)
682 goto out;
683
684 delayed_refs = &trans->transaction->delayed_refs;
685 spin_lock(&delayed_refs->lock);
686 head = btrfs_find_delayed_ref_head(trans, bytenr);
687 if (head) {
688 if (!mutex_trylock(&head->mutex)) {
689 atomic_inc(&head->node.refs);
690 spin_unlock(&delayed_refs->lock);
691
692 btrfs_release_path(root->fs_info->extent_root, path);
693
694 mutex_lock(&head->mutex);
695 mutex_unlock(&head->mutex);
696 btrfs_put_delayed_ref(&head->node);
697 goto again;
698 }
699 if (head->extent_op && head->extent_op->update_flags)
700 extent_flags |= head->extent_op->flags_to_set;
701 else
702 BUG_ON(num_refs == 0);
703
704 num_refs += head->node.ref_mod;
705 mutex_unlock(&head->mutex);
706 }
707 spin_unlock(&delayed_refs->lock);
708out:
709 WARN_ON(num_refs == 0);
710 if (refs)
711 *refs = num_refs;
712 if (flags)
713 *flags = extent_flags;
714out_free:
715 btrfs_free_path(path);
716 return ret;
717}
718
719/*
613 * Back reference rules. Back refs have three main goals: 720 * Back reference rules. Back refs have three main goals:
614 * 721 *
615 * 1) differentiate between all holders of references to an extent so that 722 * 1) differentiate between all holders of references to an extent so that
@@ -1871,7 +1978,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1871 return ret; 1978 return ret;
1872} 1979}
1873 1980
1874
1875/* helper function to actually process a single delayed ref entry */ 1981/* helper function to actually process a single delayed ref entry */
1876static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 1982static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1877 struct btrfs_root *root, 1983 struct btrfs_root *root,
@@ -1891,32 +1997,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1891 BUG_ON(extent_op); 1997 BUG_ON(extent_op);
1892 head = btrfs_delayed_node_to_head(node); 1998 head = btrfs_delayed_node_to_head(node);
1893 if (insert_reserved) { 1999 if (insert_reserved) {
1894 int mark_free = 0; 2000 btrfs_pin_extent(root, node->bytenr,
1895 struct extent_buffer *must_clean = NULL; 2001 node->num_bytes, 1);
1896
1897 ret = pin_down_bytes(trans, root, NULL,
1898 node->bytenr, node->num_bytes,
1899 head->is_data, 1, &must_clean);
1900 if (ret > 0)
1901 mark_free = 1;
1902
1903 if (must_clean) {
1904 clean_tree_block(NULL, root, must_clean);
1905 btrfs_tree_unlock(must_clean);
1906 free_extent_buffer(must_clean);
1907 }
1908 if (head->is_data) { 2002 if (head->is_data) {
1909 ret = btrfs_del_csums(trans, root, 2003 ret = btrfs_del_csums(trans, root,
1910 node->bytenr, 2004 node->bytenr,
1911 node->num_bytes); 2005 node->num_bytes);
1912 BUG_ON(ret); 2006 BUG_ON(ret);
1913 } 2007 }
1914 if (mark_free) {
1915 ret = btrfs_free_reserved_extent(root,
1916 node->bytenr,
1917 node->num_bytes);
1918 BUG_ON(ret);
1919 }
1920 } 2008 }
1921 mutex_unlock(&head->mutex); 2009 mutex_unlock(&head->mutex);
1922 return 0; 2010 return 0;
@@ -2347,6 +2435,8 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2347 ret = 0; 2435 ret = 0;
2348out: 2436out:
2349 btrfs_free_path(path); 2437 btrfs_free_path(path);
2438 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2439 WARN_ON(ret > 0);
2350 return ret; 2440 return ret;
2351} 2441}
2352 2442
@@ -2660,12 +2750,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2660 struct btrfs_space_info **space_info) 2750 struct btrfs_space_info **space_info)
2661{ 2751{
2662 struct btrfs_space_info *found; 2752 struct btrfs_space_info *found;
2753 int i;
2754 int factor;
2755
2756 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2757 BTRFS_BLOCK_GROUP_RAID10))
2758 factor = 2;
2759 else
2760 factor = 1;
2663 2761
2664 found = __find_space_info(info, flags); 2762 found = __find_space_info(info, flags);
2665 if (found) { 2763 if (found) {
2666 spin_lock(&found->lock); 2764 spin_lock(&found->lock);
2667 found->total_bytes += total_bytes; 2765 found->total_bytes += total_bytes;
2668 found->bytes_used += bytes_used; 2766 found->bytes_used += bytes_used;
2767 found->disk_used += bytes_used * factor;
2669 found->full = 0; 2768 found->full = 0;
2670 spin_unlock(&found->lock); 2769 spin_unlock(&found->lock);
2671 *space_info = found; 2770 *space_info = found;
@@ -2675,18 +2774,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2675 if (!found) 2774 if (!found)
2676 return -ENOMEM; 2775 return -ENOMEM;
2677 2776
2678 INIT_LIST_HEAD(&found->block_groups); 2777 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
2778 INIT_LIST_HEAD(&found->block_groups[i]);
2679 init_rwsem(&found->groups_sem); 2779 init_rwsem(&found->groups_sem);
2680 init_waitqueue_head(&found->flush_wait);
2681 init_waitqueue_head(&found->allocate_wait);
2682 spin_lock_init(&found->lock); 2780 spin_lock_init(&found->lock);
2683 found->flags = flags; 2781 found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
2782 BTRFS_BLOCK_GROUP_SYSTEM |
2783 BTRFS_BLOCK_GROUP_METADATA);
2684 found->total_bytes = total_bytes; 2784 found->total_bytes = total_bytes;
2685 found->bytes_used = bytes_used; 2785 found->bytes_used = bytes_used;
2786 found->disk_used = bytes_used * factor;
2686 found->bytes_pinned = 0; 2787 found->bytes_pinned = 0;
2687 found->bytes_reserved = 0; 2788 found->bytes_reserved = 0;
2688 found->bytes_readonly = 0; 2789 found->bytes_readonly = 0;
2689 found->bytes_delalloc = 0; 2790 found->bytes_may_use = 0;
2690 found->full = 0; 2791 found->full = 0;
2691 found->force_alloc = 0; 2792 found->force_alloc = 0;
2692 *space_info = found; 2793 *space_info = found;
@@ -2711,19 +2812,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2711 } 2812 }
2712} 2813}
2713 2814
2714static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
2715{
2716 spin_lock(&cache->space_info->lock);
2717 spin_lock(&cache->lock);
2718 if (!cache->ro) {
2719 cache->space_info->bytes_readonly += cache->key.offset -
2720 btrfs_block_group_used(&cache->item);
2721 cache->ro = 1;
2722 }
2723 spin_unlock(&cache->lock);
2724 spin_unlock(&cache->space_info->lock);
2725}
2726
2727u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 2815u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2728{ 2816{
2729 u64 num_devices = root->fs_info->fs_devices->rw_devices; 2817 u64 num_devices = root->fs_info->fs_devices->rw_devices;
@@ -2752,491 +2840,50 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2752 return flags; 2840 return flags;
2753} 2841}
2754 2842
2755static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) 2843static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
2756{
2757 struct btrfs_fs_info *info = root->fs_info;
2758 u64 alloc_profile;
2759
2760 if (data) {
2761 alloc_profile = info->avail_data_alloc_bits &
2762 info->data_alloc_profile;
2763 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
2764 } else if (root == root->fs_info->chunk_root) {
2765 alloc_profile = info->avail_system_alloc_bits &
2766 info->system_alloc_profile;
2767 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
2768 } else {
2769 alloc_profile = info->avail_metadata_alloc_bits &
2770 info->metadata_alloc_profile;
2771 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
2772 }
2773
2774 return btrfs_reduce_alloc_profile(root, data);
2775}
2776
2777void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2778{
2779 u64 alloc_target;
2780
2781 alloc_target = btrfs_get_alloc_profile(root, 1);
2782 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2783 alloc_target);
2784}
2785
2786static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2787{
2788 u64 num_bytes;
2789 int level;
2790
2791 level = BTRFS_MAX_LEVEL - 2;
2792 /*
2793 * NOTE: these calculations are absolutely the worst possible case.
2794 * This assumes that _every_ item we insert will require a new leaf, and
2795 * that the tree has grown to its maximum level size.
2796 */
2797
2798 /*
2799 * for every item we insert we could insert both an extent item and a
2800 * extent ref item. Then for ever item we insert, we will need to cow
2801 * both the original leaf, plus the leaf to the left and right of it.
2802 *
2803 * Unless we are talking about the extent root, then we just want the
2804 * number of items * 2, since we just need the extent item plus its ref.
2805 */
2806 if (root == root->fs_info->extent_root)
2807 num_bytes = num_items * 2;
2808 else
2809 num_bytes = (num_items + (2 * num_items)) * 3;
2810
2811 /*
2812 * num_bytes is total number of leaves we could need times the leaf
2813 * size, and then for every leaf we could end up cow'ing 2 nodes per
2814 * level, down to the leaf level.
2815 */
2816 num_bytes = (num_bytes * root->leafsize) +
2817 (num_bytes * (level * 2)) * root->nodesize;
2818
2819 return num_bytes;
2820}
2821
2822/*
2823 * Unreserve metadata space for delalloc. If we have less reserved credits than
2824 * we have extents, this function does nothing.
2825 */
2826int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2827 struct inode *inode, int num_items)
2828{
2829 struct btrfs_fs_info *info = root->fs_info;
2830 struct btrfs_space_info *meta_sinfo;
2831 u64 num_bytes;
2832 u64 alloc_target;
2833 bool bug = false;
2834
2835 /* get the space info for where the metadata will live */
2836 alloc_target = btrfs_get_alloc_profile(root, 0);
2837 meta_sinfo = __find_space_info(info, alloc_target);
2838
2839 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2840 num_items);
2841
2842 spin_lock(&meta_sinfo->lock);
2843 spin_lock(&BTRFS_I(inode)->accounting_lock);
2844 if (BTRFS_I(inode)->reserved_extents <=
2845 BTRFS_I(inode)->outstanding_extents) {
2846 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2847 spin_unlock(&meta_sinfo->lock);
2848 return 0;
2849 }
2850 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2851
2852 BTRFS_I(inode)->reserved_extents -= num_items;
2853 BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2854
2855 if (meta_sinfo->bytes_delalloc < num_bytes) {
2856 bug = true;
2857 meta_sinfo->bytes_delalloc = 0;
2858 } else {
2859 meta_sinfo->bytes_delalloc -= num_bytes;
2860 }
2861 spin_unlock(&meta_sinfo->lock);
2862
2863 BUG_ON(bug);
2864
2865 return 0;
2866}
2867
2868static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2869{ 2844{
2870 u64 thresh; 2845 if (flags & BTRFS_BLOCK_GROUP_DATA)
2871 2846 flags |= root->fs_info->avail_data_alloc_bits &
2872 thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 2847 root->fs_info->data_alloc_profile;
2873 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 2848 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2874 meta_sinfo->bytes_super + meta_sinfo->bytes_root + 2849 flags |= root->fs_info->avail_system_alloc_bits &
2875 meta_sinfo->bytes_may_use; 2850 root->fs_info->system_alloc_profile;
2876 2851 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
2877 thresh = meta_sinfo->total_bytes - thresh; 2852 flags |= root->fs_info->avail_metadata_alloc_bits &
2878 thresh *= 80; 2853 root->fs_info->metadata_alloc_profile;
2879 do_div(thresh, 100); 2854 return btrfs_reduce_alloc_profile(root, flags);
2880 if (thresh <= meta_sinfo->bytes_delalloc)
2881 meta_sinfo->force_delalloc = 1;
2882 else
2883 meta_sinfo->force_delalloc = 0;
2884} 2855}
2885 2856
2886struct async_flush { 2857static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
2887 struct btrfs_root *root;
2888 struct btrfs_space_info *info;
2889 struct btrfs_work work;
2890};
2891
2892static noinline void flush_delalloc_async(struct btrfs_work *work)
2893{ 2858{
2894 struct async_flush *async; 2859 u64 flags;
2895 struct btrfs_root *root;
2896 struct btrfs_space_info *info;
2897
2898 async = container_of(work, struct async_flush, work);
2899 root = async->root;
2900 info = async->info;
2901
2902 btrfs_start_delalloc_inodes(root, 0);
2903 wake_up(&info->flush_wait);
2904 btrfs_wait_ordered_extents(root, 0, 0);
2905
2906 spin_lock(&info->lock);
2907 info->flushing = 0;
2908 spin_unlock(&info->lock);
2909 wake_up(&info->flush_wait);
2910
2911 kfree(async);
2912}
2913
2914static void wait_on_flush(struct btrfs_space_info *info)
2915{
2916 DEFINE_WAIT(wait);
2917 u64 used;
2918
2919 while (1) {
2920 prepare_to_wait(&info->flush_wait, &wait,
2921 TASK_UNINTERRUPTIBLE);
2922 spin_lock(&info->lock);
2923 if (!info->flushing) {
2924 spin_unlock(&info->lock);
2925 break;
2926 }
2927
2928 used = info->bytes_used + info->bytes_reserved +
2929 info->bytes_pinned + info->bytes_readonly +
2930 info->bytes_super + info->bytes_root +
2931 info->bytes_may_use + info->bytes_delalloc;
2932 if (used < info->total_bytes) {
2933 spin_unlock(&info->lock);
2934 break;
2935 }
2936 spin_unlock(&info->lock);
2937 schedule();
2938 }
2939 finish_wait(&info->flush_wait, &wait);
2940}
2941
2942static void flush_delalloc(struct btrfs_root *root,
2943 struct btrfs_space_info *info)
2944{
2945 struct async_flush *async;
2946 bool wait = false;
2947
2948 spin_lock(&info->lock);
2949 2860
2950 if (!info->flushing) 2861 if (data)
2951 info->flushing = 1; 2862 flags = BTRFS_BLOCK_GROUP_DATA;
2863 else if (root == root->fs_info->chunk_root)
2864 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2952 else 2865 else
2953 wait = true; 2866 flags = BTRFS_BLOCK_GROUP_METADATA;
2954
2955 spin_unlock(&info->lock);
2956
2957 if (wait) {
2958 wait_on_flush(info);
2959 return;
2960 }
2961
2962 async = kzalloc(sizeof(*async), GFP_NOFS);
2963 if (!async)
2964 goto flush;
2965
2966 async->root = root;
2967 async->info = info;
2968 async->work.func = flush_delalloc_async;
2969 2867
2970 btrfs_queue_worker(&root->fs_info->enospc_workers, 2868 return get_alloc_profile(root, flags);
2971 &async->work);
2972 wait_on_flush(info);
2973 return;
2974
2975flush:
2976 btrfs_start_delalloc_inodes(root, 0);
2977 btrfs_wait_ordered_extents(root, 0, 0);
2978
2979 spin_lock(&info->lock);
2980 info->flushing = 0;
2981 spin_unlock(&info->lock);
2982 wake_up(&info->flush_wait);
2983} 2869}
2984 2870
2985static int maybe_allocate_chunk(struct btrfs_root *root, 2871void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2986 struct btrfs_space_info *info)
2987{
2988 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
2989 struct btrfs_trans_handle *trans;
2990 bool wait = false;
2991 int ret = 0;
2992 u64 min_metadata;
2993 u64 free_space;
2994
2995 free_space = btrfs_super_total_bytes(disk_super);
2996 /*
2997 * we allow the metadata to grow to a max of either 10gb or 5% of the
2998 * space in the volume.
2999 */
3000 min_metadata = min((u64)10 * 1024 * 1024 * 1024,
3001 div64_u64(free_space * 5, 100));
3002 if (info->total_bytes >= min_metadata) {
3003 spin_unlock(&info->lock);
3004 return 0;
3005 }
3006
3007 if (info->full) {
3008 spin_unlock(&info->lock);
3009 return 0;
3010 }
3011
3012 if (!info->allocating_chunk) {
3013 info->force_alloc = 1;
3014 info->allocating_chunk = 1;
3015 } else {
3016 wait = true;
3017 }
3018
3019 spin_unlock(&info->lock);
3020
3021 if (wait) {
3022 wait_event(info->allocate_wait,
3023 !info->allocating_chunk);
3024 return 1;
3025 }
3026
3027 trans = btrfs_start_transaction(root, 1);
3028 if (!trans) {
3029 ret = -ENOMEM;
3030 goto out;
3031 }
3032
3033 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3034 4096 + 2 * 1024 * 1024,
3035 info->flags, 0);
3036 btrfs_end_transaction(trans, root);
3037 if (ret)
3038 goto out;
3039out:
3040 spin_lock(&info->lock);
3041 info->allocating_chunk = 0;
3042 spin_unlock(&info->lock);
3043 wake_up(&info->allocate_wait);
3044
3045 if (ret)
3046 return 0;
3047 return 1;
3048}
3049
3050/*
3051 * Reserve metadata space for delalloc.
3052 */
3053int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
3054 struct inode *inode, int num_items)
3055{
3056 struct btrfs_fs_info *info = root->fs_info;
3057 struct btrfs_space_info *meta_sinfo;
3058 u64 num_bytes;
3059 u64 used;
3060 u64 alloc_target;
3061 int flushed = 0;
3062 int force_delalloc;
3063
3064 /* get the space info for where the metadata will live */
3065 alloc_target = btrfs_get_alloc_profile(root, 0);
3066 meta_sinfo = __find_space_info(info, alloc_target);
3067
3068 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
3069 num_items);
3070again:
3071 spin_lock(&meta_sinfo->lock);
3072
3073 force_delalloc = meta_sinfo->force_delalloc;
3074
3075 if (unlikely(!meta_sinfo->bytes_root))
3076 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3077
3078 if (!flushed)
3079 meta_sinfo->bytes_delalloc += num_bytes;
3080
3081 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3082 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3083 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3084 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3085
3086 if (used > meta_sinfo->total_bytes) {
3087 flushed++;
3088
3089 if (flushed == 1) {
3090 if (maybe_allocate_chunk(root, meta_sinfo))
3091 goto again;
3092 flushed++;
3093 } else {
3094 spin_unlock(&meta_sinfo->lock);
3095 }
3096
3097 if (flushed == 2) {
3098 filemap_flush(inode->i_mapping);
3099 goto again;
3100 } else if (flushed == 3) {
3101 flush_delalloc(root, meta_sinfo);
3102 goto again;
3103 }
3104 spin_lock(&meta_sinfo->lock);
3105 meta_sinfo->bytes_delalloc -= num_bytes;
3106 spin_unlock(&meta_sinfo->lock);
3107 printk(KERN_ERR "enospc, has %d, reserved %d\n",
3108 BTRFS_I(inode)->outstanding_extents,
3109 BTRFS_I(inode)->reserved_extents);
3110 dump_space_info(meta_sinfo, 0, 0);
3111 return -ENOSPC;
3112 }
3113
3114 BTRFS_I(inode)->reserved_extents += num_items;
3115 check_force_delalloc(meta_sinfo);
3116 spin_unlock(&meta_sinfo->lock);
3117
3118 if (!flushed && force_delalloc)
3119 filemap_flush(inode->i_mapping);
3120
3121 return 0;
3122}
3123
3124/*
3125 * unreserve num_items number of items worth of metadata space. This needs to
3126 * be paired with btrfs_reserve_metadata_space.
3127 *
3128 * NOTE: if you have the option, run this _AFTER_ you do a
3129 * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
3130 * oprations which will result in more used metadata, so we want to make sure we
3131 * can do that without issue.
3132 */
3133int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
3134{
3135 struct btrfs_fs_info *info = root->fs_info;
3136 struct btrfs_space_info *meta_sinfo;
3137 u64 num_bytes;
3138 u64 alloc_target;
3139 bool bug = false;
3140
3141 /* get the space info for where the metadata will live */
3142 alloc_target = btrfs_get_alloc_profile(root, 0);
3143 meta_sinfo = __find_space_info(info, alloc_target);
3144
3145 num_bytes = calculate_bytes_needed(root, num_items);
3146
3147 spin_lock(&meta_sinfo->lock);
3148 if (meta_sinfo->bytes_may_use < num_bytes) {
3149 bug = true;
3150 meta_sinfo->bytes_may_use = 0;
3151 } else {
3152 meta_sinfo->bytes_may_use -= num_bytes;
3153 }
3154 spin_unlock(&meta_sinfo->lock);
3155
3156 BUG_ON(bug);
3157
3158 return 0;
3159}
3160
3161/*
3162 * Reserve some metadata space for use. We'll calculate the worste case number
3163 * of bytes that would be needed to modify num_items number of items. If we
3164 * have space, fantastic, if not, you get -ENOSPC. Please call
3165 * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
3166 * items you reserved, since whatever metadata you needed should have already
3167 * been allocated.
3168 *
3169 * This will commit the transaction to make more space if we don't have enough
3170 * metadata space. THe only time we don't do this is if we're reserving space
3171 * inside of a transaction, then we will just return -ENOSPC and it is the
3172 * callers responsibility to handle it properly.
3173 */
3174int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
3175{ 2872{
3176 struct btrfs_fs_info *info = root->fs_info; 2873 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
3177 struct btrfs_space_info *meta_sinfo; 2874 BTRFS_BLOCK_GROUP_DATA);
3178 u64 num_bytes;
3179 u64 used;
3180 u64 alloc_target;
3181 int retries = 0;
3182
3183 /* get the space info for where the metadata will live */
3184 alloc_target = btrfs_get_alloc_profile(root, 0);
3185 meta_sinfo = __find_space_info(info, alloc_target);
3186
3187 num_bytes = calculate_bytes_needed(root, num_items);
3188again:
3189 spin_lock(&meta_sinfo->lock);
3190
3191 if (unlikely(!meta_sinfo->bytes_root))
3192 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3193
3194 if (!retries)
3195 meta_sinfo->bytes_may_use += num_bytes;
3196
3197 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3198 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3199 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3200 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3201
3202 if (used > meta_sinfo->total_bytes) {
3203 retries++;
3204 if (retries == 1) {
3205 if (maybe_allocate_chunk(root, meta_sinfo))
3206 goto again;
3207 retries++;
3208 } else {
3209 spin_unlock(&meta_sinfo->lock);
3210 }
3211
3212 if (retries == 2) {
3213 flush_delalloc(root, meta_sinfo);
3214 goto again;
3215 }
3216 spin_lock(&meta_sinfo->lock);
3217 meta_sinfo->bytes_may_use -= num_bytes;
3218 spin_unlock(&meta_sinfo->lock);
3219
3220 dump_space_info(meta_sinfo, 0, 0);
3221 return -ENOSPC;
3222 }
3223
3224 check_force_delalloc(meta_sinfo);
3225 spin_unlock(&meta_sinfo->lock);
3226
3227 return 0;
3228} 2875}
3229 2876
3230/* 2877/*
3231 * This will check the space that the inode allocates from to make sure we have 2878 * This will check the space that the inode allocates from to make sure we have
3232 * enough space for bytes. 2879 * enough space for bytes.
3233 */ 2880 */
3234int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2881int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3235 u64 bytes)
3236{ 2882{
3237 struct btrfs_space_info *data_sinfo; 2883 struct btrfs_space_info *data_sinfo;
2884 struct btrfs_root *root = BTRFS_I(inode)->root;
3238 u64 used; 2885 u64 used;
3239 int ret = 0, committed = 0, flushed = 0; 2886 int ret = 0, committed = 0;
3240 2887
3241 /* make sure bytes are sectorsize aligned */ 2888 /* make sure bytes are sectorsize aligned */
3242 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 2889 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
@@ -3248,21 +2895,13 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
3248again: 2895again:
3249 /* make sure we have enough space to handle the data first */ 2896 /* make sure we have enough space to handle the data first */
3250 spin_lock(&data_sinfo->lock); 2897 spin_lock(&data_sinfo->lock);
3251 used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc + 2898 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3252 data_sinfo->bytes_reserved + data_sinfo->bytes_pinned + 2899 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3253 data_sinfo->bytes_readonly + data_sinfo->bytes_may_use + 2900 data_sinfo->bytes_may_use;
3254 data_sinfo->bytes_super;
3255 2901
3256 if (used + bytes > data_sinfo->total_bytes) { 2902 if (used + bytes > data_sinfo->total_bytes) {
3257 struct btrfs_trans_handle *trans; 2903 struct btrfs_trans_handle *trans;
3258 2904
3259 if (!flushed) {
3260 spin_unlock(&data_sinfo->lock);
3261 flush_delalloc(root, data_sinfo);
3262 flushed = 1;
3263 goto again;
3264 }
3265
3266 /* 2905 /*
3267 * if we don't have enough free bytes in this space then we need 2906 * if we don't have enough free bytes in this space then we need
3268 * to alloc a new chunk. 2907 * to alloc a new chunk.
@@ -3274,15 +2913,15 @@ again:
3274 spin_unlock(&data_sinfo->lock); 2913 spin_unlock(&data_sinfo->lock);
3275alloc: 2914alloc:
3276 alloc_target = btrfs_get_alloc_profile(root, 1); 2915 alloc_target = btrfs_get_alloc_profile(root, 1);
3277 trans = btrfs_start_transaction(root, 1); 2916 trans = btrfs_join_transaction(root, 1);
3278 if (!trans) 2917 if (IS_ERR(trans))
3279 return -ENOMEM; 2918 return PTR_ERR(trans);
3280 2919
3281 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2920 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3282 bytes + 2 * 1024 * 1024, 2921 bytes + 2 * 1024 * 1024,
3283 alloc_target, 0); 2922 alloc_target, 0);
3284 btrfs_end_transaction(trans, root); 2923 btrfs_end_transaction(trans, root);
3285 if (ret) 2924 if (ret < 0)
3286 return ret; 2925 return ret;
3287 2926
3288 if (!data_sinfo) { 2927 if (!data_sinfo) {
@@ -3297,25 +2936,26 @@ alloc:
3297 if (!committed && !root->fs_info->open_ioctl_trans) { 2936 if (!committed && !root->fs_info->open_ioctl_trans) {
3298 committed = 1; 2937 committed = 1;
3299 trans = btrfs_join_transaction(root, 1); 2938 trans = btrfs_join_transaction(root, 1);
3300 if (!trans) 2939 if (IS_ERR(trans))
3301 return -ENOMEM; 2940 return PTR_ERR(trans);
3302 ret = btrfs_commit_transaction(trans, root); 2941 ret = btrfs_commit_transaction(trans, root);
3303 if (ret) 2942 if (ret)
3304 return ret; 2943 return ret;
3305 goto again; 2944 goto again;
3306 } 2945 }
3307 2946
3308 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" 2947#if 0 /* I hope we never need this code again, just in case */
3309 ", %llu bytes_used, %llu bytes_reserved, " 2948 printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
3310 "%llu bytes_pinned, %llu bytes_readonly, %llu may use " 2949 "%llu bytes_reserved, " "%llu bytes_pinned, "
3311 "%llu total\n", (unsigned long long)bytes, 2950 "%llu bytes_readonly, %llu may use %llu total\n",
3312 (unsigned long long)data_sinfo->bytes_delalloc, 2951 (unsigned long long)bytes,
3313 (unsigned long long)data_sinfo->bytes_used, 2952 (unsigned long long)data_sinfo->bytes_used,
3314 (unsigned long long)data_sinfo->bytes_reserved, 2953 (unsigned long long)data_sinfo->bytes_reserved,
3315 (unsigned long long)data_sinfo->bytes_pinned, 2954 (unsigned long long)data_sinfo->bytes_pinned,
3316 (unsigned long long)data_sinfo->bytes_readonly, 2955 (unsigned long long)data_sinfo->bytes_readonly,
3317 (unsigned long long)data_sinfo->bytes_may_use, 2956 (unsigned long long)data_sinfo->bytes_may_use,
3318 (unsigned long long)data_sinfo->total_bytes); 2957 (unsigned long long)data_sinfo->total_bytes);
2958#endif
3319 return -ENOSPC; 2959 return -ENOSPC;
3320 } 2960 }
3321 data_sinfo->bytes_may_use += bytes; 2961 data_sinfo->bytes_may_use += bytes;
@@ -3326,12 +2966,13 @@ alloc:
3326} 2966}
3327 2967
3328/* 2968/*
3329 * if there was an error for whatever reason after calling 2969 * called when we are clearing an delalloc extent from the
3330 * btrfs_check_data_free_space, call this so we can cleanup the counters. 2970 * inode's io_tree or there was an error for whatever reason
2971 * after calling btrfs_check_data_free_space
3331 */ 2972 */
3332void btrfs_free_reserved_data_space(struct btrfs_root *root, 2973void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3333 struct inode *inode, u64 bytes)
3334{ 2974{
2975 struct btrfs_root *root = BTRFS_I(inode)->root;
3335 struct btrfs_space_info *data_sinfo; 2976 struct btrfs_space_info *data_sinfo;
3336 2977
3337 /* make sure bytes are sectorsize aligned */ 2978 /* make sure bytes are sectorsize aligned */
@@ -3344,48 +2985,6 @@ void btrfs_free_reserved_data_space(struct btrfs_root *root,
3344 spin_unlock(&data_sinfo->lock); 2985 spin_unlock(&data_sinfo->lock);
3345} 2986}
3346 2987
3347/* called when we are adding a delalloc extent to the inode's io_tree */
3348void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
3349 u64 bytes)
3350{
3351 struct btrfs_space_info *data_sinfo;
3352
3353 /* get the space info for where this inode will be storing its data */
3354 data_sinfo = BTRFS_I(inode)->space_info;
3355
3356 /* make sure we have enough space to handle the data first */
3357 spin_lock(&data_sinfo->lock);
3358 data_sinfo->bytes_delalloc += bytes;
3359
3360 /*
3361 * we are adding a delalloc extent without calling
3362 * btrfs_check_data_free_space first. This happens on a weird
3363 * writepage condition, but shouldn't hurt our accounting
3364 */
3365 if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
3366 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
3367 BTRFS_I(inode)->reserved_bytes = 0;
3368 } else {
3369 data_sinfo->bytes_may_use -= bytes;
3370 BTRFS_I(inode)->reserved_bytes -= bytes;
3371 }
3372
3373 spin_unlock(&data_sinfo->lock);
3374}
3375
3376/* called when we are clearing an delalloc extent from the inode's io_tree */
3377void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
3378 u64 bytes)
3379{
3380 struct btrfs_space_info *info;
3381
3382 info = BTRFS_I(inode)->space_info;
3383
3384 spin_lock(&info->lock);
3385 info->bytes_delalloc -= bytes;
3386 spin_unlock(&info->lock);
3387}
3388
3389static void force_metadata_allocation(struct btrfs_fs_info *info) 2988static void force_metadata_allocation(struct btrfs_fs_info *info)
3390{ 2989{
3391 struct list_head *head = &info->space_info; 2990 struct list_head *head = &info->space_info;
@@ -3399,13 +2998,28 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
3399 rcu_read_unlock(); 2998 rcu_read_unlock();
3400} 2999}
3401 3000
3001static int should_alloc_chunk(struct btrfs_space_info *sinfo,
3002 u64 alloc_bytes)
3003{
3004 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3005
3006 if (sinfo->bytes_used + sinfo->bytes_reserved +
3007 alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3008 return 0;
3009
3010 if (sinfo->bytes_used + sinfo->bytes_reserved +
3011 alloc_bytes < div_factor(num_bytes, 8))
3012 return 0;
3013
3014 return 1;
3015}
3016
3402static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3017static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3403 struct btrfs_root *extent_root, u64 alloc_bytes, 3018 struct btrfs_root *extent_root, u64 alloc_bytes,
3404 u64 flags, int force) 3019 u64 flags, int force)
3405{ 3020{
3406 struct btrfs_space_info *space_info; 3021 struct btrfs_space_info *space_info;
3407 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3022 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3408 u64 thresh;
3409 int ret = 0; 3023 int ret = 0;
3410 3024
3411 mutex_lock(&fs_info->chunk_mutex); 3025 mutex_lock(&fs_info->chunk_mutex);
@@ -3428,11 +3042,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3428 goto out; 3042 goto out;
3429 } 3043 }
3430 3044
3431 thresh = space_info->total_bytes - space_info->bytes_readonly; 3045 if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
3432 thresh = div_factor(thresh, 8);
3433 if (!force &&
3434 (space_info->bytes_used + space_info->bytes_pinned +
3435 space_info->bytes_reserved + alloc_bytes) < thresh) {
3436 spin_unlock(&space_info->lock); 3046 spin_unlock(&space_info->lock);
3437 goto out; 3047 goto out;
3438 } 3048 }
@@ -3454,6 +3064,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3454 spin_lock(&space_info->lock); 3064 spin_lock(&space_info->lock);
3455 if (ret) 3065 if (ret)
3456 space_info->full = 1; 3066 space_info->full = 1;
3067 else
3068 ret = 1;
3457 space_info->force_alloc = 0; 3069 space_info->force_alloc = 0;
3458 spin_unlock(&space_info->lock); 3070 spin_unlock(&space_info->lock);
3459out: 3071out:
@@ -3461,13 +3073,713 @@ out:
3461 return ret; 3073 return ret;
3462} 3074}
3463 3075
3076static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
3077 struct btrfs_root *root,
3078 struct btrfs_space_info *sinfo, u64 num_bytes)
3079{
3080 int ret;
3081 int end_trans = 0;
3082
3083 if (sinfo->full)
3084 return 0;
3085
3086 spin_lock(&sinfo->lock);
3087 ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
3088 spin_unlock(&sinfo->lock);
3089 if (!ret)
3090 return 0;
3091
3092 if (!trans) {
3093 trans = btrfs_join_transaction(root, 1);
3094 BUG_ON(IS_ERR(trans));
3095 end_trans = 1;
3096 }
3097
3098 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3099 num_bytes + 2 * 1024 * 1024,
3100 get_alloc_profile(root, sinfo->flags), 0);
3101
3102 if (end_trans)
3103 btrfs_end_transaction(trans, root);
3104
3105 return ret == 1 ? 1 : 0;
3106}
3107
3108/*
3109 * shrink metadata reservation for delalloc
3110 */
3111static int shrink_delalloc(struct btrfs_trans_handle *trans,
3112 struct btrfs_root *root, u64 to_reclaim)
3113{
3114 struct btrfs_block_rsv *block_rsv;
3115 u64 reserved;
3116 u64 max_reclaim;
3117 u64 reclaimed = 0;
3118 int pause = 1;
3119 int ret;
3120
3121 block_rsv = &root->fs_info->delalloc_block_rsv;
3122 spin_lock(&block_rsv->lock);
3123 reserved = block_rsv->reserved;
3124 spin_unlock(&block_rsv->lock);
3125
3126 if (reserved == 0)
3127 return 0;
3128
3129 max_reclaim = min(reserved, to_reclaim);
3130
3131 while (1) {
3132 ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
3133 if (!ret) {
3134 __set_current_state(TASK_INTERRUPTIBLE);
3135 schedule_timeout(pause);
3136 pause <<= 1;
3137 if (pause > HZ / 10)
3138 pause = HZ / 10;
3139 } else {
3140 pause = 1;
3141 }
3142
3143 spin_lock(&block_rsv->lock);
3144 if (reserved > block_rsv->reserved)
3145 reclaimed = reserved - block_rsv->reserved;
3146 reserved = block_rsv->reserved;
3147 spin_unlock(&block_rsv->lock);
3148
3149 if (reserved == 0 || reclaimed >= max_reclaim)
3150 break;
3151
3152 if (trans && trans->transaction->blocked)
3153 return -EAGAIN;
3154 }
3155 return reclaimed >= to_reclaim;
3156}
3157
3158static int should_retry_reserve(struct btrfs_trans_handle *trans,
3159 struct btrfs_root *root,
3160 struct btrfs_block_rsv *block_rsv,
3161 u64 num_bytes, int *retries)
3162{
3163 struct btrfs_space_info *space_info = block_rsv->space_info;
3164 int ret;
3165
3166 if ((*retries) > 2)
3167 return -ENOSPC;
3168
3169 ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
3170 if (ret)
3171 return 1;
3172
3173 if (trans && trans->transaction->in_commit)
3174 return -ENOSPC;
3175
3176 ret = shrink_delalloc(trans, root, num_bytes);
3177 if (ret)
3178 return ret;
3179
3180 spin_lock(&space_info->lock);
3181 if (space_info->bytes_pinned < num_bytes)
3182 ret = 1;
3183 spin_unlock(&space_info->lock);
3184 if (ret)
3185 return -ENOSPC;
3186
3187 (*retries)++;
3188
3189 if (trans)
3190 return -EAGAIN;
3191
3192 trans = btrfs_join_transaction(root, 1);
3193 BUG_ON(IS_ERR(trans));
3194 ret = btrfs_commit_transaction(trans, root);
3195 BUG_ON(ret);
3196
3197 return 1;
3198}
3199
3200static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
3201 u64 num_bytes)
3202{
3203 struct btrfs_space_info *space_info = block_rsv->space_info;
3204 u64 unused;
3205 int ret = -ENOSPC;
3206
3207 spin_lock(&space_info->lock);
3208 unused = space_info->bytes_used + space_info->bytes_reserved +
3209 space_info->bytes_pinned + space_info->bytes_readonly;
3210
3211 if (unused < space_info->total_bytes)
3212 unused = space_info->total_bytes - unused;
3213 else
3214 unused = 0;
3215
3216 if (unused >= num_bytes) {
3217 if (block_rsv->priority >= 10) {
3218 space_info->bytes_reserved += num_bytes;
3219 ret = 0;
3220 } else {
3221 if ((unused + block_rsv->reserved) *
3222 block_rsv->priority >=
3223 (num_bytes + block_rsv->reserved) * 10) {
3224 space_info->bytes_reserved += num_bytes;
3225 ret = 0;
3226 }
3227 }
3228 }
3229 spin_unlock(&space_info->lock);
3230
3231 return ret;
3232}
3233
3234static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3235 struct btrfs_root *root)
3236{
3237 struct btrfs_block_rsv *block_rsv;
3238 if (root->ref_cows)
3239 block_rsv = trans->block_rsv;
3240 else
3241 block_rsv = root->block_rsv;
3242
3243 if (!block_rsv)
3244 block_rsv = &root->fs_info->empty_block_rsv;
3245
3246 return block_rsv;
3247}
3248
3249static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3250 u64 num_bytes)
3251{
3252 int ret = -ENOSPC;
3253 spin_lock(&block_rsv->lock);
3254 if (block_rsv->reserved >= num_bytes) {
3255 block_rsv->reserved -= num_bytes;
3256 if (block_rsv->reserved < block_rsv->size)
3257 block_rsv->full = 0;
3258 ret = 0;
3259 }
3260 spin_unlock(&block_rsv->lock);
3261 return ret;
3262}
3263
3264static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3265 u64 num_bytes, int update_size)
3266{
3267 spin_lock(&block_rsv->lock);
3268 block_rsv->reserved += num_bytes;
3269 if (update_size)
3270 block_rsv->size += num_bytes;
3271 else if (block_rsv->reserved >= block_rsv->size)
3272 block_rsv->full = 1;
3273 spin_unlock(&block_rsv->lock);
3274}
3275
3276void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3277 struct btrfs_block_rsv *dest, u64 num_bytes)
3278{
3279 struct btrfs_space_info *space_info = block_rsv->space_info;
3280
3281 spin_lock(&block_rsv->lock);
3282 if (num_bytes == (u64)-1)
3283 num_bytes = block_rsv->size;
3284 block_rsv->size -= num_bytes;
3285 if (block_rsv->reserved >= block_rsv->size) {
3286 num_bytes = block_rsv->reserved - block_rsv->size;
3287 block_rsv->reserved = block_rsv->size;
3288 block_rsv->full = 1;
3289 } else {
3290 num_bytes = 0;
3291 }
3292 spin_unlock(&block_rsv->lock);
3293
3294 if (num_bytes > 0) {
3295 if (dest) {
3296 block_rsv_add_bytes(dest, num_bytes, 0);
3297 } else {
3298 spin_lock(&space_info->lock);
3299 space_info->bytes_reserved -= num_bytes;
3300 spin_unlock(&space_info->lock);
3301 }
3302 }
3303}
3304
3305static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3306 struct btrfs_block_rsv *dst, u64 num_bytes)
3307{
3308 int ret;
3309
3310 ret = block_rsv_use_bytes(src, num_bytes);
3311 if (ret)
3312 return ret;
3313
3314 block_rsv_add_bytes(dst, num_bytes, 1);
3315 return 0;
3316}
3317
3318void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3319{
3320 memset(rsv, 0, sizeof(*rsv));
3321 spin_lock_init(&rsv->lock);
3322 atomic_set(&rsv->usage, 1);
3323 rsv->priority = 6;
3324 INIT_LIST_HEAD(&rsv->list);
3325}
3326
3327struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3328{
3329 struct btrfs_block_rsv *block_rsv;
3330 struct btrfs_fs_info *fs_info = root->fs_info;
3331 u64 alloc_target;
3332
3333 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3334 if (!block_rsv)
3335 return NULL;
3336
3337 btrfs_init_block_rsv(block_rsv);
3338
3339 alloc_target = btrfs_get_alloc_profile(root, 0);
3340 block_rsv->space_info = __find_space_info(fs_info,
3341 BTRFS_BLOCK_GROUP_METADATA);
3342
3343 return block_rsv;
3344}
3345
3346void btrfs_free_block_rsv(struct btrfs_root *root,
3347 struct btrfs_block_rsv *rsv)
3348{
3349 if (rsv && atomic_dec_and_test(&rsv->usage)) {
3350 btrfs_block_rsv_release(root, rsv, (u64)-1);
3351 if (!rsv->durable)
3352 kfree(rsv);
3353 }
3354}
3355
3356/*
3357 * make the block_rsv struct be able to capture freed space.
3358 * the captured space will re-add to the the block_rsv struct
3359 * after transaction commit
3360 */
3361void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3362 struct btrfs_block_rsv *block_rsv)
3363{
3364 block_rsv->durable = 1;
3365 mutex_lock(&fs_info->durable_block_rsv_mutex);
3366 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3367 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3368}
3369
3370int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3371 struct btrfs_root *root,
3372 struct btrfs_block_rsv *block_rsv,
3373 u64 num_bytes, int *retries)
3374{
3375 int ret;
3376
3377 if (num_bytes == 0)
3378 return 0;
3379again:
3380 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3381 if (!ret) {
3382 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3383 return 0;
3384 }
3385
3386 ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
3387 if (ret > 0)
3388 goto again;
3389
3390 return ret;
3391}
3392
3393int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3394 struct btrfs_root *root,
3395 struct btrfs_block_rsv *block_rsv,
3396 u64 min_reserved, int min_factor)
3397{
3398 u64 num_bytes = 0;
3399 int commit_trans = 0;
3400 int ret = -ENOSPC;
3401
3402 if (!block_rsv)
3403 return 0;
3404
3405 spin_lock(&block_rsv->lock);
3406 if (min_factor > 0)
3407 num_bytes = div_factor(block_rsv->size, min_factor);
3408 if (min_reserved > num_bytes)
3409 num_bytes = min_reserved;
3410
3411 if (block_rsv->reserved >= num_bytes) {
3412 ret = 0;
3413 } else {
3414 num_bytes -= block_rsv->reserved;
3415 if (block_rsv->durable &&
3416 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3417 commit_trans = 1;
3418 }
3419 spin_unlock(&block_rsv->lock);
3420 if (!ret)
3421 return 0;
3422
3423 if (block_rsv->refill_used) {
3424 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3425 if (!ret) {
3426 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3427 return 0;
3428 }
3429 }
3430
3431 if (commit_trans) {
3432 if (trans)
3433 return -EAGAIN;
3434
3435 trans = btrfs_join_transaction(root, 1);
3436 BUG_ON(IS_ERR(trans));
3437 ret = btrfs_commit_transaction(trans, root);
3438 return 0;
3439 }
3440
3441 WARN_ON(1);
3442 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
3443 block_rsv->size, block_rsv->reserved,
3444 block_rsv->freed[0], block_rsv->freed[1]);
3445
3446 return -ENOSPC;
3447}
3448
3449int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3450 struct btrfs_block_rsv *dst_rsv,
3451 u64 num_bytes)
3452{
3453 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3454}
3455
3456void btrfs_block_rsv_release(struct btrfs_root *root,
3457 struct btrfs_block_rsv *block_rsv,
3458 u64 num_bytes)
3459{
3460 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3461 if (global_rsv->full || global_rsv == block_rsv ||
3462 block_rsv->space_info != global_rsv->space_info)
3463 global_rsv = NULL;
3464 block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3465}
3466
3467/*
3468 * helper to calculate size of global block reservation.
3469 * the desired value is sum of space used by extent tree,
3470 * checksum tree and root tree
3471 */
3472static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3473{
3474 struct btrfs_space_info *sinfo;
3475 u64 num_bytes;
3476 u64 meta_used;
3477 u64 data_used;
3478 int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
3479#if 0
3480 /*
3481 * per tree used space accounting can be inaccuracy, so we
3482 * can't rely on it.
3483 */
3484 spin_lock(&fs_info->extent_root->accounting_lock);
3485 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
3486 spin_unlock(&fs_info->extent_root->accounting_lock);
3487
3488 spin_lock(&fs_info->csum_root->accounting_lock);
3489 num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
3490 spin_unlock(&fs_info->csum_root->accounting_lock);
3491
3492 spin_lock(&fs_info->tree_root->accounting_lock);
3493 num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
3494 spin_unlock(&fs_info->tree_root->accounting_lock);
3495#endif
3496 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3497 spin_lock(&sinfo->lock);
3498 data_used = sinfo->bytes_used;
3499 spin_unlock(&sinfo->lock);
3500
3501 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3502 spin_lock(&sinfo->lock);
3503 meta_used = sinfo->bytes_used;
3504 spin_unlock(&sinfo->lock);
3505
3506 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
3507 csum_size * 2;
3508 num_bytes += div64_u64(data_used + meta_used, 50);
3509
3510 if (num_bytes * 3 > meta_used)
3511 num_bytes = div64_u64(meta_used, 3);
3512
3513 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
3514}
3515
3516static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3517{
3518 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3519 struct btrfs_space_info *sinfo = block_rsv->space_info;
3520 u64 num_bytes;
3521
3522 num_bytes = calc_global_metadata_size(fs_info);
3523
3524 spin_lock(&block_rsv->lock);
3525 spin_lock(&sinfo->lock);
3526
3527 block_rsv->size = num_bytes;
3528
3529 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3530 sinfo->bytes_reserved + sinfo->bytes_readonly;
3531
3532 if (sinfo->total_bytes > num_bytes) {
3533 num_bytes = sinfo->total_bytes - num_bytes;
3534 block_rsv->reserved += num_bytes;
3535 sinfo->bytes_reserved += num_bytes;
3536 }
3537
3538 if (block_rsv->reserved >= block_rsv->size) {
3539 num_bytes = block_rsv->reserved - block_rsv->size;
3540 sinfo->bytes_reserved -= num_bytes;
3541 block_rsv->reserved = block_rsv->size;
3542 block_rsv->full = 1;
3543 }
3544#if 0
3545 printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
3546 block_rsv->size, block_rsv->reserved);
3547#endif
3548 spin_unlock(&sinfo->lock);
3549 spin_unlock(&block_rsv->lock);
3550}
3551
3552static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3553{
3554 struct btrfs_space_info *space_info;
3555
3556 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3557 fs_info->chunk_block_rsv.space_info = space_info;
3558 fs_info->chunk_block_rsv.priority = 10;
3559
3560 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3561 fs_info->global_block_rsv.space_info = space_info;
3562 fs_info->global_block_rsv.priority = 10;
3563 fs_info->global_block_rsv.refill_used = 1;
3564 fs_info->delalloc_block_rsv.space_info = space_info;
3565 fs_info->trans_block_rsv.space_info = space_info;
3566 fs_info->empty_block_rsv.space_info = space_info;
3567 fs_info->empty_block_rsv.priority = 10;
3568
3569 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3570 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
3571 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
3572 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3573 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3574
3575 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3576
3577 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3578
3579 update_global_block_rsv(fs_info);
3580}
3581
3582static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3583{
3584 block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
3585 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
3586 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
3587 WARN_ON(fs_info->trans_block_rsv.size > 0);
3588 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3589 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3590 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3591}
3592
3593static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
3594{
3595 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
3596 3 * num_items;
3597}
3598
3599int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3600 struct btrfs_root *root,
3601 int num_items, int *retries)
3602{
3603 u64 num_bytes;
3604 int ret;
3605
3606 if (num_items == 0 || root->fs_info->chunk_root == root)
3607 return 0;
3608
3609 num_bytes = calc_trans_metadata_size(root, num_items);
3610 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3611 num_bytes, retries);
3612 if (!ret) {
3613 trans->bytes_reserved += num_bytes;
3614 trans->block_rsv = &root->fs_info->trans_block_rsv;
3615 }
3616 return ret;
3617}
3618
3619void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3620 struct btrfs_root *root)
3621{
3622 if (!trans->bytes_reserved)
3623 return;
3624
3625 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
3626 btrfs_block_rsv_release(root, trans->block_rsv,
3627 trans->bytes_reserved);
3628 trans->bytes_reserved = 0;
3629}
3630
3631int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
3632 struct inode *inode)
3633{
3634 struct btrfs_root *root = BTRFS_I(inode)->root;
3635 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3636 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
3637
3638 /*
3639 * one for deleting orphan item, one for updating inode and
3640 * two for calling btrfs_truncate_inode_items.
3641 *
3642 * btrfs_truncate_inode_items is a delete operation, it frees
3643 * more space than it uses in most cases. So two units of
3644 * metadata space should be enough for calling it many times.
3645 * If all of the metadata space is used, we can commit
3646 * transaction and use space it freed.
3647 */
3648 u64 num_bytes = calc_trans_metadata_size(root, 4);
3649 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3650}
3651
3652void btrfs_orphan_release_metadata(struct inode *inode)
3653{
3654 struct btrfs_root *root = BTRFS_I(inode)->root;
3655 u64 num_bytes = calc_trans_metadata_size(root, 4);
3656 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
3657}
3658
3659int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3660 struct btrfs_pending_snapshot *pending)
3661{
3662 struct btrfs_root *root = pending->root;
3663 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3664 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
3665 /*
3666 * two for root back/forward refs, two for directory entries
3667 * and one for root of the snapshot.
3668 */
3669 u64 num_bytes = calc_trans_metadata_size(root, 5);
3670 dst_rsv->space_info = src_rsv->space_info;
3671 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3672}
3673
3674static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
3675{
3676 return num_bytes >>= 3;
3677}
3678
3679int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3680{
3681 struct btrfs_root *root = BTRFS_I(inode)->root;
3682 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3683 u64 to_reserve;
3684 int nr_extents;
3685 int retries = 0;
3686 int ret;
3687
3688 if (btrfs_transaction_in_commit(root->fs_info))
3689 schedule_timeout(1);
3690
3691 num_bytes = ALIGN(num_bytes, root->sectorsize);
3692again:
3693 spin_lock(&BTRFS_I(inode)->accounting_lock);
3694 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
3695 if (nr_extents > BTRFS_I(inode)->reserved_extents) {
3696 nr_extents -= BTRFS_I(inode)->reserved_extents;
3697 to_reserve = calc_trans_metadata_size(root, nr_extents);
3698 } else {
3699 nr_extents = 0;
3700 to_reserve = 0;
3701 }
3702
3703 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3704 ret = reserve_metadata_bytes(block_rsv, to_reserve);
3705 if (ret) {
3706 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3707 ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
3708 &retries);
3709 if (ret > 0)
3710 goto again;
3711 return ret;
3712 }
3713
3714 BTRFS_I(inode)->reserved_extents += nr_extents;
3715 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3716 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3717
3718 block_rsv_add_bytes(block_rsv, to_reserve, 1);
3719
3720 if (block_rsv->size > 512 * 1024 * 1024)
3721 shrink_delalloc(NULL, root, to_reserve);
3722
3723 return 0;
3724}
3725
3726void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
3727{
3728 struct btrfs_root *root = BTRFS_I(inode)->root;
3729 u64 to_free;
3730 int nr_extents;
3731
3732 num_bytes = ALIGN(num_bytes, root->sectorsize);
3733 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
3734
3735 spin_lock(&BTRFS_I(inode)->accounting_lock);
3736 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
3737 if (nr_extents < BTRFS_I(inode)->reserved_extents) {
3738 nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
3739 BTRFS_I(inode)->reserved_extents -= nr_extents;
3740 } else {
3741 nr_extents = 0;
3742 }
3743 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3744
3745 to_free = calc_csum_metadata_size(inode, num_bytes);
3746 if (nr_extents > 0)
3747 to_free += calc_trans_metadata_size(root, nr_extents);
3748
3749 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
3750 to_free);
3751}
3752
3753int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
3754{
3755 int ret;
3756
3757 ret = btrfs_check_data_free_space(inode, num_bytes);
3758 if (ret)
3759 return ret;
3760
3761 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
3762 if (ret) {
3763 btrfs_free_reserved_data_space(inode, num_bytes);
3764 return ret;
3765 }
3766
3767 return 0;
3768}
3769
3770void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
3771{
3772 btrfs_delalloc_release_metadata(inode, num_bytes);
3773 btrfs_free_reserved_data_space(inode, num_bytes);
3774}
3775
3464static int update_block_group(struct btrfs_trans_handle *trans, 3776static int update_block_group(struct btrfs_trans_handle *trans,
3465 struct btrfs_root *root, 3777 struct btrfs_root *root,
3466 u64 bytenr, u64 num_bytes, int alloc, 3778 u64 bytenr, u64 num_bytes, int alloc)
3467 int mark_free)
3468{ 3779{
3469 struct btrfs_block_group_cache *cache; 3780 struct btrfs_block_group_cache *cache;
3470 struct btrfs_fs_info *info = root->fs_info; 3781 struct btrfs_fs_info *info = root->fs_info;
3782 int factor;
3471 u64 total = num_bytes; 3783 u64 total = num_bytes;
3472 u64 old_val; 3784 u64 old_val;
3473 u64 byte_in_group; 3785 u64 byte_in_group;
@@ -3486,6 +3798,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3486 cache = btrfs_lookup_block_group(info, bytenr); 3798 cache = btrfs_lookup_block_group(info, bytenr);
3487 if (!cache) 3799 if (!cache)
3488 return -1; 3800 return -1;
3801 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
3802 BTRFS_BLOCK_GROUP_RAID1 |
3803 BTRFS_BLOCK_GROUP_RAID10))
3804 factor = 2;
3805 else
3806 factor = 1;
3489 byte_in_group = bytenr - cache->key.objectid; 3807 byte_in_group = bytenr - cache->key.objectid;
3490 WARN_ON(byte_in_group > cache->key.offset); 3808 WARN_ON(byte_in_group > cache->key.offset);
3491 3809
@@ -3498,31 +3816,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3498 old_val += num_bytes; 3816 old_val += num_bytes;
3499 btrfs_set_block_group_used(&cache->item, old_val); 3817 btrfs_set_block_group_used(&cache->item, old_val);
3500 cache->reserved -= num_bytes; 3818 cache->reserved -= num_bytes;
3501 cache->space_info->bytes_used += num_bytes;
3502 cache->space_info->bytes_reserved -= num_bytes; 3819 cache->space_info->bytes_reserved -= num_bytes;
3503 if (cache->ro) 3820 cache->space_info->bytes_used += num_bytes;
3504 cache->space_info->bytes_readonly -= num_bytes; 3821 cache->space_info->disk_used += num_bytes * factor;
3505 spin_unlock(&cache->lock); 3822 spin_unlock(&cache->lock);
3506 spin_unlock(&cache->space_info->lock); 3823 spin_unlock(&cache->space_info->lock);
3507 } else { 3824 } else {
3508 old_val -= num_bytes; 3825 old_val -= num_bytes;
3509 cache->space_info->bytes_used -= num_bytes;
3510 if (cache->ro)
3511 cache->space_info->bytes_readonly += num_bytes;
3512 btrfs_set_block_group_used(&cache->item, old_val); 3826 btrfs_set_block_group_used(&cache->item, old_val);
3827 cache->pinned += num_bytes;
3828 cache->space_info->bytes_pinned += num_bytes;
3829 cache->space_info->bytes_used -= num_bytes;
3830 cache->space_info->disk_used -= num_bytes * factor;
3513 spin_unlock(&cache->lock); 3831 spin_unlock(&cache->lock);
3514 spin_unlock(&cache->space_info->lock); 3832 spin_unlock(&cache->space_info->lock);
3515 if (mark_free) {
3516 int ret;
3517 3833
3518 ret = btrfs_discard_extent(root, bytenr, 3834 set_extent_dirty(info->pinned_extents,
3519 num_bytes); 3835 bytenr, bytenr + num_bytes - 1,
3520 WARN_ON(ret); 3836 GFP_NOFS | __GFP_NOFAIL);
3521
3522 ret = btrfs_add_free_space(cache, bytenr,
3523 num_bytes);
3524 WARN_ON(ret);
3525 }
3526 } 3837 }
3527 btrfs_put_block_group(cache); 3838 btrfs_put_block_group(cache);
3528 total -= num_bytes; 3839 total -= num_bytes;
@@ -3546,18 +3857,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3546 return bytenr; 3857 return bytenr;
3547} 3858}
3548 3859
3549/* 3860static int pin_down_extent(struct btrfs_root *root,
3550 * this function must be called within transaction 3861 struct btrfs_block_group_cache *cache,
3551 */ 3862 u64 bytenr, u64 num_bytes, int reserved)
3552int btrfs_pin_extent(struct btrfs_root *root,
3553 u64 bytenr, u64 num_bytes, int reserved)
3554{ 3863{
3555 struct btrfs_fs_info *fs_info = root->fs_info;
3556 struct btrfs_block_group_cache *cache;
3557
3558 cache = btrfs_lookup_block_group(fs_info, bytenr);
3559 BUG_ON(!cache);
3560
3561 spin_lock(&cache->space_info->lock); 3864 spin_lock(&cache->space_info->lock);
3562 spin_lock(&cache->lock); 3865 spin_lock(&cache->lock);
3563 cache->pinned += num_bytes; 3866 cache->pinned += num_bytes;
@@ -3569,28 +3872,68 @@ int btrfs_pin_extent(struct btrfs_root *root,
3569 spin_unlock(&cache->lock); 3872 spin_unlock(&cache->lock);
3570 spin_unlock(&cache->space_info->lock); 3873 spin_unlock(&cache->space_info->lock);
3571 3874
3572 btrfs_put_block_group(cache); 3875 set_extent_dirty(root->fs_info->pinned_extents, bytenr,
3876 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
3877 return 0;
3878}
3879
3880/*
3881 * this function must be called within transaction
3882 */
3883int btrfs_pin_extent(struct btrfs_root *root,
3884 u64 bytenr, u64 num_bytes, int reserved)
3885{
3886 struct btrfs_block_group_cache *cache;
3887
3888 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
3889 BUG_ON(!cache);
3890
3891 pin_down_extent(root, cache, bytenr, num_bytes, reserved);
3573 3892
3574 set_extent_dirty(fs_info->pinned_extents, 3893 btrfs_put_block_group(cache);
3575 bytenr, bytenr + num_bytes - 1, GFP_NOFS);
3576 return 0; 3894 return 0;
3577} 3895}
3578 3896
3579static int update_reserved_extents(struct btrfs_block_group_cache *cache, 3897/*
3580 u64 num_bytes, int reserve) 3898 * update size of reserved extents. this function may return -EAGAIN
3899 * if 'reserve' is true or 'sinfo' is false.
3900 */
3901static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
3902 u64 num_bytes, int reserve, int sinfo)
3581{ 3903{
3582 spin_lock(&cache->space_info->lock); 3904 int ret = 0;
3583 spin_lock(&cache->lock); 3905 if (sinfo) {
3584 if (reserve) { 3906 struct btrfs_space_info *space_info = cache->space_info;
3585 cache->reserved += num_bytes; 3907 spin_lock(&space_info->lock);
3586 cache->space_info->bytes_reserved += num_bytes; 3908 spin_lock(&cache->lock);
3909 if (reserve) {
3910 if (cache->ro) {
3911 ret = -EAGAIN;
3912 } else {
3913 cache->reserved += num_bytes;
3914 space_info->bytes_reserved += num_bytes;
3915 }
3916 } else {
3917 if (cache->ro)
3918 space_info->bytes_readonly += num_bytes;
3919 cache->reserved -= num_bytes;
3920 space_info->bytes_reserved -= num_bytes;
3921 }
3922 spin_unlock(&cache->lock);
3923 spin_unlock(&space_info->lock);
3587 } else { 3924 } else {
3588 cache->reserved -= num_bytes; 3925 spin_lock(&cache->lock);
3589 cache->space_info->bytes_reserved -= num_bytes; 3926 if (cache->ro) {
3927 ret = -EAGAIN;
3928 } else {
3929 if (reserve)
3930 cache->reserved += num_bytes;
3931 else
3932 cache->reserved -= num_bytes;
3933 }
3934 spin_unlock(&cache->lock);
3590 } 3935 }
3591 spin_unlock(&cache->lock); 3936 return ret;
3592 spin_unlock(&cache->space_info->lock);
3593 return 0;
3594} 3937}
3595 3938
3596int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 3939int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@ -3621,6 +3964,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3621 fs_info->pinned_extents = &fs_info->freed_extents[0]; 3964 fs_info->pinned_extents = &fs_info->freed_extents[0];
3622 3965
3623 up_write(&fs_info->extent_commit_sem); 3966 up_write(&fs_info->extent_commit_sem);
3967
3968 update_global_block_rsv(fs_info);
3624 return 0; 3969 return 0;
3625} 3970}
3626 3971
@@ -3647,14 +3992,21 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3647 btrfs_add_free_space(cache, start, len); 3992 btrfs_add_free_space(cache, start, len);
3648 } 3993 }
3649 3994
3995 start += len;
3996
3650 spin_lock(&cache->space_info->lock); 3997 spin_lock(&cache->space_info->lock);
3651 spin_lock(&cache->lock); 3998 spin_lock(&cache->lock);
3652 cache->pinned -= len; 3999 cache->pinned -= len;
3653 cache->space_info->bytes_pinned -= len; 4000 cache->space_info->bytes_pinned -= len;
4001 if (cache->ro) {
4002 cache->space_info->bytes_readonly += len;
4003 } else if (cache->reserved_pinned > 0) {
4004 len = min(len, cache->reserved_pinned);
4005 cache->reserved_pinned -= len;
4006 cache->space_info->bytes_reserved += len;
4007 }
3654 spin_unlock(&cache->lock); 4008 spin_unlock(&cache->lock);
3655 spin_unlock(&cache->space_info->lock); 4009 spin_unlock(&cache->space_info->lock);
3656
3657 start += len;
3658 } 4010 }
3659 4011
3660 if (cache) 4012 if (cache)
@@ -3667,8 +4019,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3667{ 4019{
3668 struct btrfs_fs_info *fs_info = root->fs_info; 4020 struct btrfs_fs_info *fs_info = root->fs_info;
3669 struct extent_io_tree *unpin; 4021 struct extent_io_tree *unpin;
4022 struct btrfs_block_rsv *block_rsv;
4023 struct btrfs_block_rsv *next_rsv;
3670 u64 start; 4024 u64 start;
3671 u64 end; 4025 u64 end;
4026 int idx;
3672 int ret; 4027 int ret;
3673 4028
3674 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4029 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -3689,59 +4044,30 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3689 cond_resched(); 4044 cond_resched();
3690 } 4045 }
3691 4046
3692 return ret; 4047 mutex_lock(&fs_info->durable_block_rsv_mutex);
3693} 4048 list_for_each_entry_safe(block_rsv, next_rsv,
4049 &fs_info->durable_block_rsv_list, list) {
3694 4050
3695static int pin_down_bytes(struct btrfs_trans_handle *trans, 4051 idx = trans->transid & 0x1;
3696 struct btrfs_root *root, 4052 if (block_rsv->freed[idx] > 0) {
3697 struct btrfs_path *path, 4053 block_rsv_add_bytes(block_rsv,
3698 u64 bytenr, u64 num_bytes, 4054 block_rsv->freed[idx], 0);
3699 int is_data, int reserved, 4055 block_rsv->freed[idx] = 0;
3700 struct extent_buffer **must_clean) 4056 }
3701{ 4057 if (atomic_read(&block_rsv->usage) == 0) {
3702 int err = 0; 4058 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
3703 struct extent_buffer *buf;
3704
3705 if (is_data)
3706 goto pinit;
3707
3708 /*
3709 * discard is sloooow, and so triggering discards on
3710 * individual btree blocks isn't a good plan. Just
3711 * pin everything in discard mode.
3712 */
3713 if (btrfs_test_opt(root, DISCARD))
3714 goto pinit;
3715
3716 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
3717 if (!buf)
3718 goto pinit;
3719 4059
3720 /* we can reuse a block if it hasn't been written 4060 if (block_rsv->freed[0] == 0 &&
3721 * and it is from this transaction. We can't 4061 block_rsv->freed[1] == 0) {
3722 * reuse anything from the tree log root because 4062 list_del_init(&block_rsv->list);
3723 * it has tiny sub-transactions. 4063 kfree(block_rsv);
3724 */ 4064 }
3725 if (btrfs_buffer_uptodate(buf, 0) && 4065 } else {
3726 btrfs_try_tree_lock(buf)) { 4066 btrfs_block_rsv_release(root, block_rsv, 0);
3727 u64 header_owner = btrfs_header_owner(buf);
3728 u64 header_transid = btrfs_header_generation(buf);
3729 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
3730 header_transid == trans->transid &&
3731 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3732 *must_clean = buf;
3733 return 1;
3734 } 4067 }
3735 btrfs_tree_unlock(buf);
3736 } 4068 }
3737 free_extent_buffer(buf); 4069 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3738pinit:
3739 if (path)
3740 btrfs_set_path_blocking(path);
3741 /* unlocks the pinned mutex */
3742 btrfs_pin_extent(root, bytenr, num_bytes, reserved);
3743 4070
3744 BUG_ON(err < 0);
3745 return 0; 4071 return 0;
3746} 4072}
3747 4073
@@ -3902,9 +4228,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3902 BUG_ON(ret); 4228 BUG_ON(ret);
3903 } 4229 }
3904 } else { 4230 } else {
3905 int mark_free = 0;
3906 struct extent_buffer *must_clean = NULL;
3907
3908 if (found_extent) { 4231 if (found_extent) {
3909 BUG_ON(is_data && refs_to_drop != 4232 BUG_ON(is_data && refs_to_drop !=
3910 extent_data_ref_count(root, path, iref)); 4233 extent_data_ref_count(root, path, iref));
@@ -3917,31 +4240,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3917 } 4240 }
3918 } 4241 }
3919 4242
3920 ret = pin_down_bytes(trans, root, path, bytenr,
3921 num_bytes, is_data, 0, &must_clean);
3922 if (ret > 0)
3923 mark_free = 1;
3924 BUG_ON(ret < 0);
3925 /*
3926 * it is going to be very rare for someone to be waiting
3927 * on the block we're freeing. del_items might need to
3928 * schedule, so rather than get fancy, just force it
3929 * to blocking here
3930 */
3931 if (must_clean)
3932 btrfs_set_lock_blocking(must_clean);
3933
3934 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 4243 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3935 num_to_del); 4244 num_to_del);
3936 BUG_ON(ret); 4245 BUG_ON(ret);
3937 btrfs_release_path(extent_root, path); 4246 btrfs_release_path(extent_root, path);
3938 4247
3939 if (must_clean) {
3940 clean_tree_block(NULL, root, must_clean);
3941 btrfs_tree_unlock(must_clean);
3942 free_extent_buffer(must_clean);
3943 }
3944
3945 if (is_data) { 4248 if (is_data) {
3946 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 4249 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
3947 BUG_ON(ret); 4250 BUG_ON(ret);
@@ -3951,8 +4254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3951 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); 4254 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
3952 } 4255 }
3953 4256
3954 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 4257 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
3955 mark_free);
3956 BUG_ON(ret); 4258 BUG_ON(ret);
3957 } 4259 }
3958 btrfs_free_path(path); 4260 btrfs_free_path(path);
@@ -3960,7 +4262,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3960} 4262}
3961 4263
3962/* 4264/*
3963 * when we free an extent, it is possible (and likely) that we free the last 4265 * when we free a block, it is possible (and likely) that we free the last
3964 * delayed ref for that extent as well. This searches the delayed ref tree for 4266 * delayed ref for that extent as well. This searches the delayed ref tree for
3965 * a given extent, and if there are no other delayed refs to be processed, it 4267 * a given extent, and if there are no other delayed refs to be processed, it
3966 * removes it from the tree. 4268 * removes it from the tree.
@@ -3972,7 +4274,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3972 struct btrfs_delayed_ref_root *delayed_refs; 4274 struct btrfs_delayed_ref_root *delayed_refs;
3973 struct btrfs_delayed_ref_node *ref; 4275 struct btrfs_delayed_ref_node *ref;
3974 struct rb_node *node; 4276 struct rb_node *node;
3975 int ret; 4277 int ret = 0;
3976 4278
3977 delayed_refs = &trans->transaction->delayed_refs; 4279 delayed_refs = &trans->transaction->delayed_refs;
3978 spin_lock(&delayed_refs->lock); 4280 spin_lock(&delayed_refs->lock);
@@ -4024,17 +4326,99 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4024 list_del_init(&head->cluster); 4326 list_del_init(&head->cluster);
4025 spin_unlock(&delayed_refs->lock); 4327 spin_unlock(&delayed_refs->lock);
4026 4328
4027 ret = run_one_delayed_ref(trans, root->fs_info->tree_root, 4329 BUG_ON(head->extent_op);
4028 &head->node, head->extent_op, 4330 if (head->must_insert_reserved)
4029 head->must_insert_reserved); 4331 ret = 1;
4030 BUG_ON(ret); 4332
4333 mutex_unlock(&head->mutex);
4031 btrfs_put_delayed_ref(&head->node); 4334 btrfs_put_delayed_ref(&head->node);
4032 return 0; 4335 return ret;
4033out: 4336out:
4034 spin_unlock(&delayed_refs->lock); 4337 spin_unlock(&delayed_refs->lock);
4035 return 0; 4338 return 0;
4036} 4339}
4037 4340
/*
 * Drop a reference on tree block @buf and, when it is the last one, either
 * make the block immediately reusable or account it as pinned.
 *
 * @trans:    running transaction
 * @root:     root the block belongs to
 * @buf:      the tree block being freed
 * @parent:   passed through to the delayed tree ref
 * @last_ref: non-zero when this drops the final reference to the block
 *
 * NOTE(review): the result of btrfs_lookup_block_group() is dereferenced
 * without a NULL check; btrfs_pin_extent() guards the same lookup with
 * BUG_ON(!cache).
 */
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct extent_buffer *buf,
			   u64 parent, int last_ref)
{
	struct btrfs_block_rsv *block_rsv;
	struct btrfs_block_group_cache *cache = NULL;
	int ret;

	/* every root except the tree log queues a delayed "drop" ref */
	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
		ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
						parent, root->root_key.objectid,
						btrfs_header_level(buf),
						BTRFS_DROP_DELAYED_REF, NULL);
		BUG_ON(ret);
	}

	/* other references remain: nothing more to account */
	if (!last_ref)
		return;

	block_rsv = get_block_rsv(trans, root);
	cache = btrfs_lookup_block_group(root->fs_info, buf->start);
	BUG_ON(block_rsv->space_info != cache->space_info);

	/* the block header carries the running transaction's id */
	if (btrfs_header_generation(buf) == trans->transid) {
		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
			/*
			 * a zero return means the block cannot take the
			 * fast path below; only do the pinned accounting
			 */
			ret = check_ref_cleanup(trans, root, buf->start);
			if (!ret)
				goto pin;
		}

		/* already written out: pin it instead of reusing it now */
		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
			pin_down_extent(root, cache, buf->start, buf->len, 1);
			goto pin;
		}

		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));

		/* hand the range straight back to the free space cache */
		btrfs_add_free_space(cache, buf->start, buf->len);
		ret = update_reserved_bytes(cache, buf->len, 0, 0);
		if (ret == -EAGAIN) {
			/* block group became read-only */
			update_reserved_bytes(cache, buf->len, 0, 1);
			goto out;
		}

		/*
		 * ret == 1 means "the reserve did not take the bytes";
		 * it is cleared only if the reserve is below its size
		 */
		ret = 1;
		spin_lock(&block_rsv->lock);
		if (block_rsv->reserved < block_rsv->size) {
			block_rsv->reserved += buf->len;
			ret = 0;
		}
		spin_unlock(&block_rsv->lock);

		/* reserve already at its size: return the bytes to space_info */
		if (ret) {
			spin_lock(&cache->space_info->lock);
			cache->space_info->bytes_reserved -= buf->len;
			spin_unlock(&cache->space_info->lock);
		}
		goto out;
	}
pin:
	/*
	 * For a durable reserve on a writable block group, record the
	 * bytes in reserved_pinned and in the reserve's freed[] slot for
	 * this transaction.
	 */
	if (block_rsv->durable && !cache->ro) {
		ret = 0;
		spin_lock(&cache->lock);
		/* re-check ->ro now that cache->lock is held */
		if (!cache->ro) {
			cache->reserved_pinned += buf->len;
			ret = 1;
		}
		spin_unlock(&cache->lock);

		if (ret) {
			spin_lock(&block_rsv->lock);
			block_rsv->freed[trans->transid & 0x1] += buf->len;
			spin_unlock(&block_rsv->lock);
		}
	}
out:
	btrfs_put_block_group(cache);
}
4421
4038int btrfs_free_extent(struct btrfs_trans_handle *trans, 4422int btrfs_free_extent(struct btrfs_trans_handle *trans,
4039 struct btrfs_root *root, 4423 struct btrfs_root *root,
4040 u64 bytenr, u64 num_bytes, u64 parent, 4424 u64 bytenr, u64 num_bytes, u64 parent,
@@ -4056,8 +4440,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4056 parent, root_objectid, (int)owner, 4440 parent, root_objectid, (int)owner,
4057 BTRFS_DROP_DELAYED_REF, NULL); 4441 BTRFS_DROP_DELAYED_REF, NULL);
4058 BUG_ON(ret); 4442 BUG_ON(ret);
4059 ret = check_ref_cleanup(trans, root, bytenr);
4060 BUG_ON(ret);
4061 } else { 4443 } else {
4062 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 4444 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4063 parent, root_objectid, owner, 4445 parent, root_objectid, owner,
@@ -4067,21 +4449,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4067 return ret; 4449 return ret;
4068} 4450}
4069 4451
/*
 * Free one tree block of 'blocksize' bytes at 'bytenr'.
 *
 * First subtracts 'blocksize' from the value read with btrfs_root_used()
 * and stores it back with btrfs_set_root_used(), all under root->node_lock.
 * Then hands the extent to btrfs_free_extent(), forwarding 'parent',
 * 'root_objectid' and 'level' and passing 0 as the final argument.
 *
 * Returns whatever btrfs_free_extent() returns.
 *
 * NOTE(review): these lines carry only old-side diff line numbers
 * (4070-4083), so this looks like the version removed by the merge --
 * confirm against the new btrfs_free_tree_block() earlier in the file.
 */
4070int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4071 struct btrfs_root *root,
4072 u64 bytenr, u32 blocksize,
4073 u64 parent, u64 root_objectid, int level)
4074{
4075 u64 used;
     /* the read-modify-write of the root's used counter is done under node_lock */
4076 spin_lock(&root->node_lock);
4077 used = btrfs_root_used(&root->root_item) - blocksize;
4078 btrfs_set_root_used(&root->root_item, used);
4079 spin_unlock(&root->node_lock);
4080
4081 return btrfs_free_extent(trans, root, bytenr, blocksize,
4082 parent, root_objectid, level, 0);
4083}
4084
4085static u64 stripe_align(struct btrfs_root *root, u64 val) 4452static u64 stripe_align(struct btrfs_root *root, u64 val)
4086{ 4453{
4087 u64 mask = ((u64)root->stripesize - 1); 4454 u64 mask = ((u64)root->stripesize - 1);
@@ -4134,6 +4501,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4134 return 0; 4501 return 0;
4135} 4502}
4136 4503
/*
 * Translate the profile bits in cache->flags into an index in 0..4.
 *
 * The bits are tested in the order RAID10, RAID1, DUP, RAID0; the first
 * one that is set decides the result, and a block group with none of them
 * set gets index 4.  Other code in this file uses the value to select one
 * of the lists in space_info->block_groups[].
 */
4504static int get_block_group_index(struct btrfs_block_group_cache *cache)
4505{
4506 int index;
4507 if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
4508 index = 0;
4509 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
4510 index = 1;
4511 else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
4512 index = 2;
4513 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
4514 index = 3;
4515 else
     /* none of the four profile bits above is set */
4516 index = 4;
4517 return index;
4518}
4519
4137enum btrfs_loop_type { 4520enum btrfs_loop_type {
4138 LOOP_FIND_IDEAL = 0, 4521 LOOP_FIND_IDEAL = 0,
4139 LOOP_CACHING_NOWAIT = 1, 4522 LOOP_CACHING_NOWAIT = 1,
@@ -4155,7 +4538,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4155 u64 num_bytes, u64 empty_size, 4538 u64 num_bytes, u64 empty_size,
4156 u64 search_start, u64 search_end, 4539 u64 search_start, u64 search_end,
4157 u64 hint_byte, struct btrfs_key *ins, 4540 u64 hint_byte, struct btrfs_key *ins,
4158 u64 exclude_start, u64 exclude_nr,
4159 int data) 4541 int data)
4160{ 4542{
4161 int ret = 0; 4543 int ret = 0;
@@ -4168,6 +4550,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4168 struct btrfs_space_info *space_info; 4550 struct btrfs_space_info *space_info;
4169 int last_ptr_loop = 0; 4551 int last_ptr_loop = 0;
4170 int loop = 0; 4552 int loop = 0;
4553 int index = 0;
4171 bool found_uncached_bg = false; 4554 bool found_uncached_bg = false;
4172 bool failed_cluster_refill = false; 4555 bool failed_cluster_refill = false;
4173 bool failed_alloc = false; 4556 bool failed_alloc = false;
@@ -4237,6 +4620,7 @@ ideal_cache:
4237 btrfs_put_block_group(block_group); 4620 btrfs_put_block_group(block_group);
4238 up_read(&space_info->groups_sem); 4621 up_read(&space_info->groups_sem);
4239 } else { 4622 } else {
4623 index = get_block_group_index(block_group);
4240 goto have_block_group; 4624 goto have_block_group;
4241 } 4625 }
4242 } else if (block_group) { 4626 } else if (block_group) {
@@ -4245,7 +4629,8 @@ ideal_cache:
4245 } 4629 }
4246search: 4630search:
4247 down_read(&space_info->groups_sem); 4631 down_read(&space_info->groups_sem);
4248 list_for_each_entry(block_group, &space_info->block_groups, list) { 4632 list_for_each_entry(block_group, &space_info->block_groups[index],
4633 list) {
4249 u64 offset; 4634 u64 offset;
4250 int cached; 4635 int cached;
4251 4636
@@ -4436,23 +4821,22 @@ checks:
4436 goto loop; 4821 goto loop;
4437 } 4822 }
4438 4823
4439 if (exclude_nr > 0 && 4824 ins->objectid = search_start;
4440 (search_start + num_bytes > exclude_start && 4825 ins->offset = num_bytes;
4441 search_start < exclude_start + exclude_nr)) { 4826
4442 search_start = exclude_start + exclude_nr; 4827 if (offset < search_start)
4828 btrfs_add_free_space(block_group, offset,
4829 search_start - offset);
4830 BUG_ON(offset > search_start);
4443 4831
4832 ret = update_reserved_bytes(block_group, num_bytes, 1,
4833 (data & BTRFS_BLOCK_GROUP_DATA));
4834 if (ret == -EAGAIN) {
4444 btrfs_add_free_space(block_group, offset, num_bytes); 4835 btrfs_add_free_space(block_group, offset, num_bytes);
4445 /*
4446 * if search_start is still in this block group
4447 * then we just re-search this block group
4448 */
4449 if (search_start >= block_group->key.objectid &&
4450 search_start < (block_group->key.objectid +
4451 block_group->key.offset))
4452 goto have_block_group;
4453 goto loop; 4836 goto loop;
4454 } 4837 }
4455 4838
4839 /* we are all good, lets return */
4456 ins->objectid = search_start; 4840 ins->objectid = search_start;
4457 ins->offset = num_bytes; 4841 ins->offset = num_bytes;
4458 4842
@@ -4460,18 +4844,18 @@ checks:
4460 btrfs_add_free_space(block_group, offset, 4844 btrfs_add_free_space(block_group, offset,
4461 search_start - offset); 4845 search_start - offset);
4462 BUG_ON(offset > search_start); 4846 BUG_ON(offset > search_start);
4463
4464 update_reserved_extents(block_group, num_bytes, 1);
4465
4466 /* we are all good, lets return */
4467 break; 4847 break;
4468loop: 4848loop:
4469 failed_cluster_refill = false; 4849 failed_cluster_refill = false;
4470 failed_alloc = false; 4850 failed_alloc = false;
4851 BUG_ON(index != get_block_group_index(block_group));
4471 btrfs_put_block_group(block_group); 4852 btrfs_put_block_group(block_group);
4472 } 4853 }
4473 up_read(&space_info->groups_sem); 4854 up_read(&space_info->groups_sem);
4474 4855
4856 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
4857 goto search;
4858
4475 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for 4859 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
4476 * for them to make caching progress. Also 4860 * for them to make caching progress. Also
4477 * determine the best possible bg to cache 4861 * determine the best possible bg to cache
@@ -4485,6 +4869,7 @@ loop:
4485 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 4869 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
4486 (found_uncached_bg || empty_size || empty_cluster || 4870 (found_uncached_bg || empty_size || empty_cluster ||
4487 allowed_chunk_alloc)) { 4871 allowed_chunk_alloc)) {
4872 index = 0;
4488 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 4873 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4489 found_uncached_bg = false; 4874 found_uncached_bg = false;
4490 loop++; 4875 loop++;
@@ -4567,31 +4952,30 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4567 int dump_block_groups) 4952 int dump_block_groups)
4568{ 4953{
4569 struct btrfs_block_group_cache *cache; 4954 struct btrfs_block_group_cache *cache;
4955 int index = 0;
4570 4956
4571 spin_lock(&info->lock); 4957 spin_lock(&info->lock);
4572 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4958 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4573 (unsigned long long)(info->total_bytes - info->bytes_used - 4959 (unsigned long long)(info->total_bytes - info->bytes_used -
4574 info->bytes_pinned - info->bytes_reserved - 4960 info->bytes_pinned - info->bytes_reserved -
4575 info->bytes_super), 4961 info->bytes_readonly),
4576 (info->full) ? "" : "not "); 4962 (info->full) ? "" : "not ");
4577 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4963 printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
4578 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" 4964 "reserved=%llu, may_use=%llu, readonly=%llu\n",
4579 "\n",
4580 (unsigned long long)info->total_bytes, 4965 (unsigned long long)info->total_bytes,
4966 (unsigned long long)info->bytes_used,
4581 (unsigned long long)info->bytes_pinned, 4967 (unsigned long long)info->bytes_pinned,
4582 (unsigned long long)info->bytes_delalloc, 4968 (unsigned long long)info->bytes_reserved,
4583 (unsigned long long)info->bytes_may_use, 4969 (unsigned long long)info->bytes_may_use,
4584 (unsigned long long)info->bytes_used, 4970 (unsigned long long)info->bytes_readonly);
4585 (unsigned long long)info->bytes_root,
4586 (unsigned long long)info->bytes_super,
4587 (unsigned long long)info->bytes_reserved);
4588 spin_unlock(&info->lock); 4971 spin_unlock(&info->lock);
4589 4972
4590 if (!dump_block_groups) 4973 if (!dump_block_groups)
4591 return; 4974 return;
4592 4975
4593 down_read(&info->groups_sem); 4976 down_read(&info->groups_sem);
4594 list_for_each_entry(cache, &info->block_groups, list) { 4977again:
4978 list_for_each_entry(cache, &info->block_groups[index], list) {
4595 spin_lock(&cache->lock); 4979 spin_lock(&cache->lock);
4596 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 4980 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
4597 "%llu pinned %llu reserved\n", 4981 "%llu pinned %llu reserved\n",
@@ -4603,6 +4987,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4603 btrfs_dump_free_space(cache, bytes); 4987 btrfs_dump_free_space(cache, bytes);
4604 spin_unlock(&cache->lock); 4988 spin_unlock(&cache->lock);
4605 } 4989 }
4990 if (++index < BTRFS_NR_RAID_TYPES)
4991 goto again;
4606 up_read(&info->groups_sem); 4992 up_read(&info->groups_sem);
4607} 4993}
4608 4994
@@ -4628,9 +5014,8 @@ again:
4628 5014
4629 WARN_ON(num_bytes < root->sectorsize); 5015 WARN_ON(num_bytes < root->sectorsize);
4630 ret = find_free_extent(trans, root, num_bytes, empty_size, 5016 ret = find_free_extent(trans, root, num_bytes, empty_size,
4631 search_start, search_end, hint_byte, ins, 5017 search_start, search_end, hint_byte,
4632 trans->alloc_exclude_start, 5018 ins, data);
4633 trans->alloc_exclude_nr, data);
4634 5019
4635 if (ret == -ENOSPC && num_bytes > min_alloc_size) { 5020 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
4636 num_bytes = num_bytes >> 1; 5021 num_bytes = num_bytes >> 1;
@@ -4668,7 +5053,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
4668 ret = btrfs_discard_extent(root, start, len); 5053 ret = btrfs_discard_extent(root, start, len);
4669 5054
4670 btrfs_add_free_space(cache, start, len); 5055 btrfs_add_free_space(cache, start, len);
4671 update_reserved_extents(cache, len, 0); 5056 update_reserved_bytes(cache, len, 0, 1);
4672 btrfs_put_block_group(cache); 5057 btrfs_put_block_group(cache);
4673 5058
4674 return ret; 5059 return ret;
@@ -4731,8 +5116,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4731 btrfs_mark_buffer_dirty(path->nodes[0]); 5116 btrfs_mark_buffer_dirty(path->nodes[0]);
4732 btrfs_free_path(path); 5117 btrfs_free_path(path);
4733 5118
4734 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5119 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4735 1, 0);
4736 if (ret) { 5120 if (ret) {
4737 printk(KERN_ERR "btrfs update block group failed for %llu " 5121 printk(KERN_ERR "btrfs update block group failed for %llu "
4738 "%llu\n", (unsigned long long)ins->objectid, 5122 "%llu\n", (unsigned long long)ins->objectid,
@@ -4792,8 +5176,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4792 btrfs_mark_buffer_dirty(leaf); 5176 btrfs_mark_buffer_dirty(leaf);
4793 btrfs_free_path(path); 5177 btrfs_free_path(path);
4794 5178
4795 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5179 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4796 1, 0);
4797 if (ret) { 5180 if (ret) {
4798 printk(KERN_ERR "btrfs update block group failed for %llu " 5181 printk(KERN_ERR "btrfs update block group failed for %llu "
4799 "%llu\n", (unsigned long long)ins->objectid, 5182 "%llu\n", (unsigned long long)ins->objectid,
@@ -4869,73 +5252,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4869 put_caching_control(caching_ctl); 5252 put_caching_control(caching_ctl);
4870 } 5253 }
4871 5254
4872 update_reserved_extents(block_group, ins->offset, 1); 5255 ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
5256 BUG_ON(ret);
4873 btrfs_put_block_group(block_group); 5257 btrfs_put_block_group(block_group);
4874 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5258 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
4875 0, owner, offset, ins, 1); 5259 0, owner, offset, ins, 1);
4876 return ret; 5260 return ret;
4877} 5261}
4878 5262
4879/*
4880 * finds a free extent and does all the dirty work required for allocation
4881 * returns the key for the extent through ins, and a tree buffer for
4882 * the first block of the extent through buf.
4883 *
4884 * returns 0 if everything worked, non-zero otherwise.
4885 */
/*
 * Step summary, as visible in the body below:
 *  1. reserve an extent of num_bytes with btrfs_reserve_extent(); the
 *     result is written to *ins, and a failure is returned unchanged
 *  2. for BTRFS_TREE_RELOC_OBJECTID, a zero 'parent' defaults to the start
 *     of the new extent and BTRFS_BLOCK_FLAG_FULL_BACKREF is requested;
 *     for every other root_objectid a non-zero 'parent' hits BUG_ON()
 *  3. unless root_objectid is BTRFS_TREE_LOG_OBJECTID, queue a delayed
 *     tree ref (BTRFS_ADD_DELAYED_EXTENT) that carries the key and flags
 *  4. if root_objectid equals this root's own objectid, add num_bytes to
 *     the root item's used counter under root->node_lock
 *
 * NOTE(review): this signature has no 'buf' parameter, so the "tree buffer
 * ... through buf" sentence in the comment above does not match it.
 * NOTE(review): these lines carry only old-side diff line numbers
 * (4879-4937), so this looks like code removed by the merge -- confirm.
 */
4886static int alloc_tree_block(struct btrfs_trans_handle *trans,
4887 struct btrfs_root *root,
4888 u64 num_bytes, u64 parent, u64 root_objectid,
4889 struct btrfs_disk_key *key, int level,
4890 u64 empty_size, u64 hint_byte, u64 search_end,
4891 struct btrfs_key *ins)
4892{
4893 int ret;
4894 u64 flags = 0;
4895
4896 ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4897 empty_size, hint_byte, search_end,
4898 ins, 0);
4899 if (ret)
4900 return ret;
4901
4902 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4903 if (parent == 0)
4904 parent = ins->objectid;
4905 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4906 } else
4907 BUG_ON(parent > 0);
4908
4909 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4910 struct btrfs_delayed_extent_op *extent_op;
4911 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
     /* allocation failure is treated as fatal here, not returned */
4912 BUG_ON(!extent_op);
     /* a NULL key is recorded as an all-zero key */
4913 if (key)
4914 memcpy(&extent_op->key, key, sizeof(extent_op->key));
4915 else
4916 memset(&extent_op->key, 0, sizeof(extent_op->key));
4917 extent_op->flags_to_set = flags;
4918 extent_op->update_key = 1;
4919 extent_op->update_flags = 1;
4920 extent_op->is_data = 0;
4921
4922 ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
4923 ins->offset, parent, root_objectid,
4924 level, BTRFS_ADD_DELAYED_EXTENT,
4925 extent_op);
4926 BUG_ON(ret);
4927 }
4928
4929 if (root_objectid == root->root_key.objectid) {
4930 u64 used;
4931 spin_lock(&root->node_lock);
4932 used = btrfs_root_used(&root->root_item) + num_bytes;
4933 btrfs_set_root_used(&root->root_item, used);
4934 spin_unlock(&root->node_lock);
4935 }
4936 return ret;
4937}
4938
4939struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 5263struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4940 struct btrfs_root *root, 5264 struct btrfs_root *root,
4941 u64 bytenr, u32 blocksize, 5265 u64 bytenr, u32 blocksize,
@@ -4974,8 +5298,45 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4974 return buf; 5298 return buf;
4975} 5299}
4976 5300
/*
 * Charge 'blocksize' bytes to the block reservation that get_block_rsv()
 * selects for this transaction handle and root.
 *
 * Returns the reservation on success, so the caller can pass it to
 * unuse_block_rsv() if a later step fails.  On failure returns an
 * ERR_PTR(): the error from reserve_metadata_bytes() when the reservation
 * has size 0, or -ENOSPC when block_rsv_use_bytes() returns non-zero.
 */
5301static struct btrfs_block_rsv *
5302use_block_rsv(struct btrfs_trans_handle *trans,
5303 struct btrfs_root *root, u32 blocksize)
5304{
5305 struct btrfs_block_rsv *block_rsv;
5306 int ret;
5307
5308 block_rsv = get_block_rsv(trans, root);
5309
     /*
      * A reservation whose size is 0 goes through reserve_metadata_bytes()
      * instead of block_rsv_use_bytes().
      */
5310 if (block_rsv->size == 0) {
5311 ret = reserve_metadata_bytes(block_rsv, blocksize);
5312 if (ret)
5313 return ERR_PTR(ret);
5314 return block_rsv;
5315 }
5316
5317 ret = block_rsv_use_bytes(block_rsv, blocksize);
5318 if (!ret)
5319 return block_rsv;
5320
     /*
      * block_rsv_use_bytes() failed: warn and print the reservation's
      * counters before reporting -ENOSPC.
      */
5321 WARN_ON(1);
5322 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
5323 block_rsv->size, block_rsv->reserved,
5324 block_rsv->freed[0], block_rsv->freed[1]);
5325
5326 return ERR_PTR(-ENOSPC);
5327}
5328
/*
 * Undo a successful use_block_rsv() for 'blocksize' bytes.
 *
 * btrfs_alloc_free_block() calls this when btrfs_reserve_extent() fails
 * after the reservation has already been charged.
 */
5329static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
5330{
     /*
      * Presumably puts the bytes back and then lets the reservation release
      * anything it no longer needs; the meaning of the trailing 0 / NULL
      * arguments is defined by the helpers, which are not visible here --
      * TODO confirm.
      */
5331 block_rsv_add_bytes(block_rsv, blocksize, 0);
5332 block_rsv_release_bytes(block_rsv, NULL, 0);
5333}
5334
4977/* 5335/*
4978 * helper function to allocate a block for a given tree 5336 * finds a free extent and does all the dirty work required for allocation
5337 * returns the key for the extent through ins, and a tree buffer for
5338 * the first block of the extent through buf.
5339 *
4979 * returns the tree buffer or NULL. 5340 * returns the tree buffer or NULL.
4980 */ 5341 */
4981struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 5342struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -4985,18 +5346,53 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4985 u64 hint, u64 empty_size) 5346 u64 hint, u64 empty_size)
4986{ 5347{
4987 struct btrfs_key ins; 5348 struct btrfs_key ins;
4988 int ret; 5349 struct btrfs_block_rsv *block_rsv;
4989 struct extent_buffer *buf; 5350 struct extent_buffer *buf;
5351 u64 flags = 0;
5352 int ret;
5353
4990 5354
4991 ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, 5355 block_rsv = use_block_rsv(trans, root, blocksize);
4992 key, level, empty_size, hint, (u64)-1, &ins); 5356 if (IS_ERR(block_rsv))
5357 return ERR_CAST(block_rsv);
5358
5359 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
5360 empty_size, hint, (u64)-1, &ins, 0);
4993 if (ret) { 5361 if (ret) {
4994 BUG_ON(ret > 0); 5362 unuse_block_rsv(block_rsv, blocksize);
4995 return ERR_PTR(ret); 5363 return ERR_PTR(ret);
4996 } 5364 }
4997 5365
4998 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 5366 buf = btrfs_init_new_buffer(trans, root, ins.objectid,
4999 blocksize, level); 5367 blocksize, level);
5368 BUG_ON(IS_ERR(buf));
5369
5370 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5371 if (parent == 0)
5372 parent = ins.objectid;
5373 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5374 } else
5375 BUG_ON(parent > 0);
5376
5377 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5378 struct btrfs_delayed_extent_op *extent_op;
5379 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
5380 BUG_ON(!extent_op);
5381 if (key)
5382 memcpy(&extent_op->key, key, sizeof(extent_op->key));
5383 else
5384 memset(&extent_op->key, 0, sizeof(extent_op->key));
5385 extent_op->flags_to_set = flags;
5386 extent_op->update_key = 1;
5387 extent_op->update_flags = 1;
5388 extent_op->is_data = 0;
5389
5390 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
5391 ins.offset, parent, root_objectid,
5392 level, BTRFS_ADD_DELAYED_EXTENT,
5393 extent_op);
5394 BUG_ON(ret);
5395 }
5000 return buf; 5396 return buf;
5001} 5397}
5002 5398
@@ -5321,7 +5717,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5321 struct btrfs_path *path, 5717 struct btrfs_path *path,
5322 struct walk_control *wc) 5718 struct walk_control *wc)
5323{ 5719{
5324 int ret = 0; 5720 int ret;
5325 int level = wc->level; 5721 int level = wc->level;
5326 struct extent_buffer *eb = path->nodes[level]; 5722 struct extent_buffer *eb = path->nodes[level];
5327 u64 parent = 0; 5723 u64 parent = 0;
@@ -5399,13 +5795,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5399 btrfs_header_owner(path->nodes[level + 1])); 5795 btrfs_header_owner(path->nodes[level + 1]));
5400 } 5796 }
5401 5797
5402 ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, 5798 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
5403 root->root_key.objectid, level, 0);
5404 BUG_ON(ret);
5405out: 5799out:
5406 wc->refs[level] = 0; 5800 wc->refs[level] = 0;
5407 wc->flags[level] = 0; 5801 wc->flags[level] = 0;
5408 return ret; 5802 return 0;
5409} 5803}
5410 5804
5411static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 5805static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -5483,7 +5877,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5483 * also make sure backrefs for the shared block and all lower level 5877 * also make sure backrefs for the shared block and all lower level
5484 * blocks are properly updated. 5878 * blocks are properly updated.
5485 */ 5879 */
5486int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) 5880int btrfs_drop_snapshot(struct btrfs_root *root,
5881 struct btrfs_block_rsv *block_rsv, int update_ref)
5487{ 5882{
5488 struct btrfs_path *path; 5883 struct btrfs_path *path;
5489 struct btrfs_trans_handle *trans; 5884 struct btrfs_trans_handle *trans;
@@ -5501,7 +5896,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5501 wc = kzalloc(sizeof(*wc), GFP_NOFS); 5896 wc = kzalloc(sizeof(*wc), GFP_NOFS);
5502 BUG_ON(!wc); 5897 BUG_ON(!wc);
5503 5898
5504 trans = btrfs_start_transaction(tree_root, 1); 5899 trans = btrfs_start_transaction(tree_root, 0);
5900 if (block_rsv)
5901 trans->block_rsv = block_rsv;
5505 5902
5506 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 5903 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5507 level = btrfs_header_level(root->node); 5904 level = btrfs_header_level(root->node);
@@ -5589,22 +5986,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5589 } 5986 }
5590 5987
5591 BUG_ON(wc->level == 0); 5988 BUG_ON(wc->level == 0);
5592 if (trans->transaction->in_commit || 5989 if (btrfs_should_end_transaction(trans, tree_root)) {
5593 trans->transaction->delayed_refs.flushing) {
5594 ret = btrfs_update_root(trans, tree_root, 5990 ret = btrfs_update_root(trans, tree_root,
5595 &root->root_key, 5991 &root->root_key,
5596 root_item); 5992 root_item);
5597 BUG_ON(ret); 5993 BUG_ON(ret);
5598 5994
5599 btrfs_end_transaction(trans, tree_root); 5995 btrfs_end_transaction_throttle(trans, tree_root);
5600 trans = btrfs_start_transaction(tree_root, 1); 5996 trans = btrfs_start_transaction(tree_root, 0);
5601 } else { 5997 if (block_rsv)
5602 unsigned long update; 5998 trans->block_rsv = block_rsv;
5603 update = trans->delayed_ref_updates;
5604 trans->delayed_ref_updates = 0;
5605 if (update)
5606 btrfs_run_delayed_refs(trans, tree_root,
5607 update);
5608 } 5999 }
5609 } 6000 }
5610 btrfs_release_path(root, path); 6001 btrfs_release_path(root, path);
@@ -5632,7 +6023,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5632 kfree(root); 6023 kfree(root);
5633 } 6024 }
5634out: 6025out:
5635 btrfs_end_transaction(trans, tree_root); 6026 btrfs_end_transaction_throttle(trans, tree_root);
5636 kfree(wc); 6027 kfree(wc);
5637 btrfs_free_path(path); 6028 btrfs_free_path(path);
5638 return err; 6029 return err;
@@ -7228,48 +7619,80 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7228 return flags; 7619 return flags;
7229} 7620}
7230 7621
7231static int __alloc_chunk_for_shrink(struct btrfs_root *root, 7622static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7232 struct btrfs_block_group_cache *shrink_block_group,
7233 int force)
7234{ 7623{
7235 struct btrfs_trans_handle *trans; 7624 struct btrfs_space_info *sinfo = cache->space_info;
7236 u64 new_alloc_flags; 7625 u64 num_bytes;
7237 u64 calc; 7626 int ret = -ENOSPC;
7238 7627
7239 spin_lock(&shrink_block_group->lock); 7628 if (cache->ro)
7240 if (btrfs_block_group_used(&shrink_block_group->item) + 7629 return 0;
7241 shrink_block_group->reserved > 0) {
7242 spin_unlock(&shrink_block_group->lock);
7243 7630
7244 trans = btrfs_start_transaction(root, 1); 7631 spin_lock(&sinfo->lock);
7245 spin_lock(&shrink_block_group->lock); 7632 spin_lock(&cache->lock);
7633 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7634 cache->bytes_super - btrfs_block_group_used(&cache->item);
7635
7636 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7637 sinfo->bytes_may_use + sinfo->bytes_readonly +
7638 cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
7639 sinfo->bytes_readonly += num_bytes;
7640 sinfo->bytes_reserved += cache->reserved_pinned;
7641 cache->reserved_pinned = 0;
7642 cache->ro = 1;
7643 ret = 0;
7644 }
7645 spin_unlock(&cache->lock);
7646 spin_unlock(&sinfo->lock);
7647 return ret;
7648}
7246 7649
7247 new_alloc_flags = update_block_group_flags(root, 7650int btrfs_set_block_group_ro(struct btrfs_root *root,
7248 shrink_block_group->flags); 7651 struct btrfs_block_group_cache *cache)
7249 if (new_alloc_flags != shrink_block_group->flags) {
7250 calc =
7251 btrfs_block_group_used(&shrink_block_group->item);
7252 } else {
7253 calc = shrink_block_group->key.offset;
7254 }
7255 spin_unlock(&shrink_block_group->lock);
7256 7652
7257 do_chunk_alloc(trans, root->fs_info->extent_root, 7653{
7258 calc + 2 * 1024 * 1024, new_alloc_flags, force); 7654 struct btrfs_trans_handle *trans;
7655 u64 alloc_flags;
7656 int ret;
7259 7657
7260 btrfs_end_transaction(trans, root); 7658 BUG_ON(cache->ro);
7261 } else 7659
7262 spin_unlock(&shrink_block_group->lock); 7660 trans = btrfs_join_transaction(root, 1);
7263 return 0; 7661 BUG_ON(IS_ERR(trans));
7264}
7265 7662
7663 alloc_flags = update_block_group_flags(root, cache->flags);
7664 if (alloc_flags != cache->flags)
7665 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7266 7666
7267int btrfs_prepare_block_group_relocation(struct btrfs_root *root, 7667 ret = set_block_group_ro(cache);
7268 struct btrfs_block_group_cache *group) 7668 if (!ret)
7669 goto out;
7670 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7671 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7672 if (ret < 0)
7673 goto out;
7674 ret = set_block_group_ro(cache);
7675out:
7676 btrfs_end_transaction(trans, root);
7677 return ret;
7678}
7269 7679
7680int btrfs_set_block_group_rw(struct btrfs_root *root,
7681 struct btrfs_block_group_cache *cache)
7270{ 7682{
7271 __alloc_chunk_for_shrink(root, group, 1); 7683 struct btrfs_space_info *sinfo = cache->space_info;
7272 set_block_group_readonly(group); 7684 u64 num_bytes;
7685
7686 BUG_ON(!cache->ro);
7687
7688 spin_lock(&sinfo->lock);
7689 spin_lock(&cache->lock);
7690 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7691 cache->bytes_super - btrfs_block_group_used(&cache->item);
7692 sinfo->bytes_readonly -= num_bytes;
7693 cache->ro = 0;
7694 spin_unlock(&cache->lock);
7695 spin_unlock(&sinfo->lock);
7273 return 0; 7696 return 0;
7274} 7697}
7275 7698
@@ -7436,17 +7859,33 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7436 */ 7859 */
7437 synchronize_rcu(); 7860 synchronize_rcu();
7438 7861
7862 release_global_block_rsv(info);
7863
7439 while(!list_empty(&info->space_info)) { 7864 while(!list_empty(&info->space_info)) {
7440 space_info = list_entry(info->space_info.next, 7865 space_info = list_entry(info->space_info.next,
7441 struct btrfs_space_info, 7866 struct btrfs_space_info,
7442 list); 7867 list);
7443 7868 if (space_info->bytes_pinned > 0 ||
7869 space_info->bytes_reserved > 0) {
7870 WARN_ON(1);
7871 dump_space_info(space_info, 0, 0);
7872 }
7444 list_del(&space_info->list); 7873 list_del(&space_info->list);
7445 kfree(space_info); 7874 kfree(space_info);
7446 } 7875 }
7447 return 0; 7876 return 0;
7448} 7877}
7449 7878
/*
 * Append 'cache' to the list in space_info->block_groups[] that matches
 * its profile.
 *
 * The list is chosen with get_block_group_index(cache); the insertion is
 * done with space_info->groups_sem held for writing.
 */
7879static void __link_block_group(struct btrfs_space_info *space_info,
7880 struct btrfs_block_group_cache *cache)
7881{
7882 int index = get_block_group_index(cache);
7883
7884 down_write(&space_info->groups_sem);
7885 list_add_tail(&cache->list, &space_info->block_groups[index]);
7886 up_write(&space_info->groups_sem);
7887}
7888
7450int btrfs_read_block_groups(struct btrfs_root *root) 7889int btrfs_read_block_groups(struct btrfs_root *root)
7451{ 7890{
7452 struct btrfs_path *path; 7891 struct btrfs_path *path;
@@ -7468,10 +7907,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7468 7907
7469 while (1) { 7908 while (1) {
7470 ret = find_first_block_group(root, path, &key); 7909 ret = find_first_block_group(root, path, &key);
7471 if (ret > 0) { 7910 if (ret > 0)
7472 ret = 0; 7911 break;
7473 goto error;
7474 }
7475 if (ret != 0) 7912 if (ret != 0)
7476 goto error; 7913 goto error;
7477 7914
@@ -7480,7 +7917,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7480 cache = kzalloc(sizeof(*cache), GFP_NOFS); 7917 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7481 if (!cache) { 7918 if (!cache) {
7482 ret = -ENOMEM; 7919 ret = -ENOMEM;
7483 break; 7920 goto error;
7484 } 7921 }
7485 7922
7486 atomic_set(&cache->count, 1); 7923 atomic_set(&cache->count, 1);
@@ -7537,20 +7974,36 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7537 BUG_ON(ret); 7974 BUG_ON(ret);
7538 cache->space_info = space_info; 7975 cache->space_info = space_info;
7539 spin_lock(&cache->space_info->lock); 7976 spin_lock(&cache->space_info->lock);
7540 cache->space_info->bytes_super += cache->bytes_super; 7977 cache->space_info->bytes_readonly += cache->bytes_super;
7541 spin_unlock(&cache->space_info->lock); 7978 spin_unlock(&cache->space_info->lock);
7542 7979
7543 down_write(&space_info->groups_sem); 7980 __link_block_group(space_info, cache);
7544 list_add_tail(&cache->list, &space_info->block_groups);
7545 up_write(&space_info->groups_sem);
7546 7981
7547 ret = btrfs_add_block_group_cache(root->fs_info, cache); 7982 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7548 BUG_ON(ret); 7983 BUG_ON(ret);
7549 7984
7550 set_avail_alloc_bits(root->fs_info, cache->flags); 7985 set_avail_alloc_bits(root->fs_info, cache->flags);
7551 if (btrfs_chunk_readonly(root, cache->key.objectid)) 7986 if (btrfs_chunk_readonly(root, cache->key.objectid))
7552 set_block_group_readonly(cache); 7987 set_block_group_ro(cache);
7553 } 7988 }
7989
7990 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7991 if (!(get_alloc_profile(root, space_info->flags) &
7992 (BTRFS_BLOCK_GROUP_RAID10 |
7993 BTRFS_BLOCK_GROUP_RAID1 |
7994 BTRFS_BLOCK_GROUP_DUP)))
7995 continue;
7996 /*
7997 * avoid allocating from un-mirrored block group if there are
7998 * mirrored block groups.
7999 */
8000 list_for_each_entry(cache, &space_info->block_groups[3], list)
8001 set_block_group_ro(cache);
8002 list_for_each_entry(cache, &space_info->block_groups[4], list)
8003 set_block_group_ro(cache);
8004 }
8005
8006 init_global_block_rsv(info);
7554 ret = 0; 8007 ret = 0;
7555error: 8008error:
7556 btrfs_free_path(path); 8009 btrfs_free_path(path);
@@ -7611,12 +8064,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7611 BUG_ON(ret); 8064 BUG_ON(ret);
7612 8065
7613 spin_lock(&cache->space_info->lock); 8066 spin_lock(&cache->space_info->lock);
7614 cache->space_info->bytes_super += cache->bytes_super; 8067 cache->space_info->bytes_readonly += cache->bytes_super;
7615 spin_unlock(&cache->space_info->lock); 8068 spin_unlock(&cache->space_info->lock);
7616 8069
7617 down_write(&cache->space_info->groups_sem); 8070 __link_block_group(cache->space_info, cache);
7618 list_add_tail(&cache->list, &cache->space_info->block_groups);
7619 up_write(&cache->space_info->groups_sem);
7620 8071
7621 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8072 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7622 BUG_ON(ret); 8073 BUG_ON(ret);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d2d03684fab2..a4080c21ec55 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -135,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
135 return state; 135 return state;
136} 136}
137 137
138static void free_extent_state(struct extent_state *state) 138void free_extent_state(struct extent_state *state)
139{ 139{
140 if (!state) 140 if (!state)
141 return; 141 return;
@@ -335,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree,
335} 335}
336 336
337static int set_state_cb(struct extent_io_tree *tree, 337static int set_state_cb(struct extent_io_tree *tree,
338 struct extent_state *state, 338 struct extent_state *state, int *bits)
339 unsigned long bits)
340{ 339{
341 if (tree->ops && tree->ops->set_bit_hook) { 340 if (tree->ops && tree->ops->set_bit_hook) {
342 return tree->ops->set_bit_hook(tree->mapping->host, 341 return tree->ops->set_bit_hook(tree->mapping->host,
343 state->start, state->end, 342 state, bits);
344 state->state, bits);
345 } 343 }
346 344
347 return 0; 345 return 0;
348} 346}
349 347
350static void clear_state_cb(struct extent_io_tree *tree, 348static void clear_state_cb(struct extent_io_tree *tree,
351 struct extent_state *state, 349 struct extent_state *state, int *bits)
352 unsigned long bits)
353{ 350{
354 if (tree->ops && tree->ops->clear_bit_hook) 351 if (tree->ops && tree->ops->clear_bit_hook)
355 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 352 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
@@ -367,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree,
367 */ 364 */
368static int insert_state(struct extent_io_tree *tree, 365static int insert_state(struct extent_io_tree *tree,
369 struct extent_state *state, u64 start, u64 end, 366 struct extent_state *state, u64 start, u64 end,
370 int bits) 367 int *bits)
371{ 368{
372 struct rb_node *node; 369 struct rb_node *node;
370 int bits_to_set = *bits & ~EXTENT_CTLBITS;
373 int ret; 371 int ret;
374 372
375 if (end < start) { 373 if (end < start) {
@@ -384,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree,
384 if (ret) 382 if (ret)
385 return ret; 383 return ret;
386 384
387 if (bits & EXTENT_DIRTY) 385 if (bits_to_set & EXTENT_DIRTY)
388 tree->dirty_bytes += end - start + 1; 386 tree->dirty_bytes += end - start + 1;
389 state->state |= bits; 387 state->state |= bits_to_set;
390 node = tree_insert(&tree->state, end, &state->rb_node); 388 node = tree_insert(&tree->state, end, &state->rb_node);
391 if (node) { 389 if (node) {
392 struct extent_state *found; 390 struct extent_state *found;
@@ -456,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
456 * struct is freed and removed from the tree 454 * struct is freed and removed from the tree
457 */ 455 */
458static int clear_state_bit(struct extent_io_tree *tree, 456static int clear_state_bit(struct extent_io_tree *tree,
459 struct extent_state *state, int bits, int wake, 457 struct extent_state *state,
460 int delete) 458 int *bits, int wake)
461{ 459{
462 int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; 460 int bits_to_clear = *bits & ~EXTENT_CTLBITS;
463 int ret = state->state & bits_to_clear; 461 int ret = state->state & bits_to_clear;
464 462
465 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 463 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
466 u64 range = state->end - state->start + 1; 464 u64 range = state->end - state->start + 1;
467 WARN_ON(range > tree->dirty_bytes); 465 WARN_ON(range > tree->dirty_bytes);
468 tree->dirty_bytes -= range; 466 tree->dirty_bytes -= range;
@@ -471,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
471 state->state &= ~bits_to_clear; 469 state->state &= ~bits_to_clear;
472 if (wake) 470 if (wake)
473 wake_up(&state->wq); 471 wake_up(&state->wq);
474 if (delete || state->state == 0) { 472 if (state->state == 0) {
475 if (state->tree) { 473 if (state->tree) {
476 clear_state_cb(tree, state, state->state);
477 rb_erase(&state->rb_node, &tree->state); 474 rb_erase(&state->rb_node, &tree->state);
478 state->tree = NULL; 475 state->tree = NULL;
479 free_extent_state(state); 476 free_extent_state(state);
@@ -514,6 +511,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
514 int set = 0; 511 int set = 0;
515 int clear = 0; 512 int clear = 0;
516 513
514 if (delete)
515 bits |= ~EXTENT_CTLBITS;
516 bits |= EXTENT_FIRST_DELALLOC;
517
517 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 518 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
518 clear = 1; 519 clear = 1;
519again: 520again:
@@ -580,8 +581,7 @@ hit_next:
580 if (err) 581 if (err)
581 goto out; 582 goto out;
582 if (state->end <= end) { 583 if (state->end <= end) {
583 set |= clear_state_bit(tree, state, bits, wake, 584 set |= clear_state_bit(tree, state, &bits, wake);
584 delete);
585 if (last_end == (u64)-1) 585 if (last_end == (u64)-1)
586 goto out; 586 goto out;
587 start = last_end + 1; 587 start = last_end + 1;
@@ -602,7 +602,7 @@ hit_next:
602 if (wake) 602 if (wake)
603 wake_up(&state->wq); 603 wake_up(&state->wq);
604 604
605 set |= clear_state_bit(tree, prealloc, bits, wake, delete); 605 set |= clear_state_bit(tree, prealloc, &bits, wake);
606 606
607 prealloc = NULL; 607 prealloc = NULL;
608 goto out; 608 goto out;
@@ -613,7 +613,7 @@ hit_next:
613 else 613 else
614 next_node = NULL; 614 next_node = NULL;
615 615
616 set |= clear_state_bit(tree, state, bits, wake, delete); 616 set |= clear_state_bit(tree, state, &bits, wake);
617 if (last_end == (u64)-1) 617 if (last_end == (u64)-1)
618 goto out; 618 goto out;
619 start = last_end + 1; 619 start = last_end + 1;
@@ -706,19 +706,19 @@ out:
706 706
707static int set_state_bits(struct extent_io_tree *tree, 707static int set_state_bits(struct extent_io_tree *tree,
708 struct extent_state *state, 708 struct extent_state *state,
709 int bits) 709 int *bits)
710{ 710{
711 int ret; 711 int ret;
712 int bits_to_set = *bits & ~EXTENT_CTLBITS;
712 713
713 ret = set_state_cb(tree, state, bits); 714 ret = set_state_cb(tree, state, bits);
714 if (ret) 715 if (ret)
715 return ret; 716 return ret;
716 717 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
717 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
718 u64 range = state->end - state->start + 1; 718 u64 range = state->end - state->start + 1;
719 tree->dirty_bytes += range; 719 tree->dirty_bytes += range;
720 } 720 }
721 state->state |= bits; 721 state->state |= bits_to_set;
722 722
723 return 0; 723 return 0;
724} 724}
@@ -745,10 +745,9 @@ static void cache_state(struct extent_state *state,
745 * [start, end] is inclusive This takes the tree lock. 745 * [start, end] is inclusive This takes the tree lock.
746 */ 746 */
747 747
748static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 748int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
749 int bits, int exclusive_bits, u64 *failed_start, 749 int bits, int exclusive_bits, u64 *failed_start,
750 struct extent_state **cached_state, 750 struct extent_state **cached_state, gfp_t mask)
751 gfp_t mask)
752{ 751{
753 struct extent_state *state; 752 struct extent_state *state;
754 struct extent_state *prealloc = NULL; 753 struct extent_state *prealloc = NULL;
@@ -757,6 +756,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
757 u64 last_start; 756 u64 last_start;
758 u64 last_end; 757 u64 last_end;
759 758
759 bits |= EXTENT_FIRST_DELALLOC;
760again: 760again:
761 if (!prealloc && (mask & __GFP_WAIT)) { 761 if (!prealloc && (mask & __GFP_WAIT)) {
762 prealloc = alloc_extent_state(mask); 762 prealloc = alloc_extent_state(mask);
@@ -778,7 +778,7 @@ again:
778 */ 778 */
779 node = tree_search(tree, start); 779 node = tree_search(tree, start);
780 if (!node) { 780 if (!node) {
781 err = insert_state(tree, prealloc, start, end, bits); 781 err = insert_state(tree, prealloc, start, end, &bits);
782 prealloc = NULL; 782 prealloc = NULL;
783 BUG_ON(err == -EEXIST); 783 BUG_ON(err == -EEXIST);
784 goto out; 784 goto out;
@@ -802,7 +802,7 @@ hit_next:
802 goto out; 802 goto out;
803 } 803 }
804 804
805 err = set_state_bits(tree, state, bits); 805 err = set_state_bits(tree, state, &bits);
806 if (err) 806 if (err)
807 goto out; 807 goto out;
808 808
@@ -852,7 +852,7 @@ hit_next:
852 if (err) 852 if (err)
853 goto out; 853 goto out;
854 if (state->end <= end) { 854 if (state->end <= end) {
855 err = set_state_bits(tree, state, bits); 855 err = set_state_bits(tree, state, &bits);
856 if (err) 856 if (err)
857 goto out; 857 goto out;
858 cache_state(state, cached_state); 858 cache_state(state, cached_state);
@@ -877,7 +877,7 @@ hit_next:
877 else 877 else
878 this_end = last_start - 1; 878 this_end = last_start - 1;
879 err = insert_state(tree, prealloc, start, this_end, 879 err = insert_state(tree, prealloc, start, this_end,
880 bits); 880 &bits);
881 BUG_ON(err == -EEXIST); 881 BUG_ON(err == -EEXIST);
882 if (err) { 882 if (err) {
883 prealloc = NULL; 883 prealloc = NULL;
@@ -903,7 +903,7 @@ hit_next:
903 err = split_state(tree, state, prealloc, end + 1); 903 err = split_state(tree, state, prealloc, end + 1);
904 BUG_ON(err == -EEXIST); 904 BUG_ON(err == -EEXIST);
905 905
906 err = set_state_bits(tree, prealloc, bits); 906 err = set_state_bits(tree, prealloc, &bits);
907 if (err) { 907 if (err) {
908 prealloc = NULL; 908 prealloc = NULL;
909 goto out; 909 goto out;
@@ -966,8 +966,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
966{ 966{
967 return clear_extent_bit(tree, start, end, 967 return clear_extent_bit(tree, start, end,
968 EXTENT_DIRTY | EXTENT_DELALLOC | 968 EXTENT_DIRTY | EXTENT_DELALLOC |
969 EXTENT_DO_ACCOUNTING, 0, 0, 969 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
970 NULL, mask);
971} 970}
972 971
973int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 972int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1435,9 +1434,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1435 if (op & EXTENT_CLEAR_DELALLOC) 1434 if (op & EXTENT_CLEAR_DELALLOC)
1436 clear_bits |= EXTENT_DELALLOC; 1435 clear_bits |= EXTENT_DELALLOC;
1437 1436
1438 if (op & EXTENT_CLEAR_ACCOUNTING)
1439 clear_bits |= EXTENT_DO_ACCOUNTING;
1440
1441 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1437 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1442 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1438 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1443 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1439 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
@@ -1916,7 +1912,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1916 1912
1917 if (tree->ops && tree->ops->submit_bio_hook) 1913 if (tree->ops && tree->ops->submit_bio_hook)
1918 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1914 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1919 mirror_num, bio_flags); 1915 mirror_num, bio_flags, start);
1920 else 1916 else
1921 submit_bio(rw, bio); 1917 submit_bio(rw, bio);
1922 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1918 if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -2020,6 +2016,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2020 sector_t sector; 2016 sector_t sector;
2021 struct extent_map *em; 2017 struct extent_map *em;
2022 struct block_device *bdev; 2018 struct block_device *bdev;
2019 struct btrfs_ordered_extent *ordered;
2023 int ret; 2020 int ret;
2024 int nr = 0; 2021 int nr = 0;
2025 size_t page_offset = 0; 2022 size_t page_offset = 0;
@@ -2031,7 +2028,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2031 set_page_extent_mapped(page); 2028 set_page_extent_mapped(page);
2032 2029
2033 end = page_end; 2030 end = page_end;
2034 lock_extent(tree, start, end, GFP_NOFS); 2031 while (1) {
2032 lock_extent(tree, start, end, GFP_NOFS);
2033 ordered = btrfs_lookup_ordered_extent(inode, start);
2034 if (!ordered)
2035 break;
2036 unlock_extent(tree, start, end, GFP_NOFS);
2037 btrfs_start_ordered_extent(inode, ordered, 1);
2038 btrfs_put_ordered_extent(ordered);
2039 }
2035 2040
2036 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 2041 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2037 char *userpage; 2042 char *userpage;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bbab4813646f..5691c7b590da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -16,7 +16,9 @@
16#define EXTENT_BOUNDARY (1 << 9) 16#define EXTENT_BOUNDARY (1 << 9)
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12)
19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
20 22
21/* flags for bio submission */ 23/* flags for bio submission */
22#define EXTENT_BIO_COMPRESSED 1 24#define EXTENT_BIO_COMPRESSED 1
@@ -47,7 +49,7 @@ struct extent_state;
47 49
48typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, 50typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
49 struct bio *bio, int mirror_num, 51 struct bio *bio, int mirror_num,
50 unsigned long bio_flags); 52 unsigned long bio_flags, u64 bio_offset);
51struct extent_io_ops { 53struct extent_io_ops {
52 int (*fill_delalloc)(struct inode *inode, struct page *locked_page, 54 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
53 u64 start, u64 end, int *page_started, 55 u64 start, u64 end, int *page_started,
@@ -69,10 +71,10 @@ struct extent_io_ops {
69 struct extent_state *state); 71 struct extent_state *state);
70 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, 72 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
71 struct extent_state *state, int uptodate); 73 struct extent_state *state, int uptodate);
72 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, 74 int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
73 unsigned long old, unsigned long bits); 75 int *bits);
74 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, 76 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
75 unsigned long bits); 77 int *bits);
76 int (*merge_extent_hook)(struct inode *inode, 78 int (*merge_extent_hook)(struct inode *inode,
77 struct extent_state *new, 79 struct extent_state *new,
78 struct extent_state *other); 80 struct extent_state *other);
@@ -176,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
176 u64 *start, u64 search_end, 178 u64 *start, u64 search_end,
177 u64 max_bytes, unsigned long bits); 179 u64 max_bytes, unsigned long bits);
178 180
181void free_extent_state(struct extent_state *state);
179int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 182int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
180 int bits, int filled, struct extent_state *cached_state); 183 int bits, int filled, struct extent_state *cached_state);
181int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 184int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -185,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
185 gfp_t mask); 188 gfp_t mask);
186int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 189int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
187 int bits, gfp_t mask); 190 int bits, gfp_t mask);
191int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
192 int bits, int exclusive_bits, u64 *failed_start,
193 struct extent_state **cached_state, gfp_t mask);
188int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 194int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
189 gfp_t mask); 195 gfp_t mask);
190int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 196int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54a255065aa3..a562a250ae77 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
149} 149}
150 150
151 151
152int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 152static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
153 struct bio *bio, u32 *dst) 153 struct inode *inode, struct bio *bio,
154 u64 logical_offset, u32 *dst, int dio)
154{ 155{
155 u32 sum; 156 u32 sum;
156 struct bio_vec *bvec = bio->bi_io_vec; 157 struct bio_vec *bvec = bio->bi_io_vec;
157 int bio_index = 0; 158 int bio_index = 0;
158 u64 offset; 159 u64 offset = 0;
159 u64 item_start_offset = 0; 160 u64 item_start_offset = 0;
160 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
161 u64 disk_bytenr; 162 u64 disk_bytenr;
@@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
174 WARN_ON(bio->bi_vcnt <= 0); 175 WARN_ON(bio->bi_vcnt <= 0);
175 176
176 disk_bytenr = (u64)bio->bi_sector << 9; 177 disk_bytenr = (u64)bio->bi_sector << 9;
178 if (dio)
179 offset = logical_offset;
177 while (bio_index < bio->bi_vcnt) { 180 while (bio_index < bio->bi_vcnt) {
178 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 181 if (!dio)
182 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
179 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); 183 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
180 if (ret == 0) 184 if (ret == 0)
181 goto found; 185 goto found;
@@ -238,6 +242,7 @@ found:
238 else 242 else
239 set_state_private(io_tree, offset, sum); 243 set_state_private(io_tree, offset, sum);
240 disk_bytenr += bvec->bv_len; 244 disk_bytenr += bvec->bv_len;
245 offset += bvec->bv_len;
241 bio_index++; 246 bio_index++;
242 bvec++; 247 bvec++;
243 } 248 }
@@ -245,6 +250,18 @@ found:
245 return 0; 250 return 0;
246} 251}
247 252
253int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
254 struct bio *bio, u32 *dst)
255{
256 return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
257}
258
259int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
260 struct bio *bio, u64 offset, u32 *dst)
261{
262 return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
263}
264
248int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 265int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
249 struct list_head *list) 266 struct list_head *list)
250{ 267{
@@ -657,6 +674,9 @@ again:
657 goto found; 674 goto found;
658 } 675 }
659 ret = PTR_ERR(item); 676 ret = PTR_ERR(item);
677 if (ret != -EFBIG && ret != -ENOENT)
678 goto fail_unlock;
679
660 if (ret == -EFBIG) { 680 if (ret == -EFBIG) {
661 u32 item_size; 681 u32 item_size;
662 /* we found one, but it isn't big enough yet */ 682 /* we found one, but it isn't big enough yet */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 29ff749ff4ca..787b50a16a14 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -46,32 +46,42 @@
46static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 46static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
47 int write_bytes, 47 int write_bytes,
48 struct page **prepared_pages, 48 struct page **prepared_pages,
49 const char __user *buf) 49 struct iov_iter *i)
50{ 50{
51 long page_fault = 0; 51 size_t copied;
52 int i; 52 int pg = 0;
53 int offset = pos & (PAGE_CACHE_SIZE - 1); 53 int offset = pos & (PAGE_CACHE_SIZE - 1);
54 54
55 for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { 55 while (write_bytes > 0) {
56 size_t count = min_t(size_t, 56 size_t count = min_t(size_t,
57 PAGE_CACHE_SIZE - offset, write_bytes); 57 PAGE_CACHE_SIZE - offset, write_bytes);
58 struct page *page = prepared_pages[i]; 58 struct page *page = prepared_pages[pg];
59 fault_in_pages_readable(buf, count); 59again:
60 if (unlikely(iov_iter_fault_in_readable(i, count)))
61 return -EFAULT;
60 62
61 /* Copy data from userspace to the current page */ 63 /* Copy data from userspace to the current page */
62 kmap(page); 64 copied = iov_iter_copy_from_user(page, i, offset, count);
63 page_fault = __copy_from_user(page_address(page) + offset, 65
64 buf, count);
65 /* Flush processor's dcache for this page */ 66 /* Flush processor's dcache for this page */
66 flush_dcache_page(page); 67 flush_dcache_page(page);
67 kunmap(page); 68 iov_iter_advance(i, copied);
68 buf += count; 69 write_bytes -= copied;
69 write_bytes -= count;
70 70
71 if (page_fault) 71 if (unlikely(copied == 0)) {
72 break; 72 count = min_t(size_t, PAGE_CACHE_SIZE - offset,
73 iov_iter_single_seg_count(i));
74 goto again;
75 }
76
77 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
78 offset += copied;
79 } else {
80 pg++;
81 offset = 0;
82 }
73 } 83 }
74 return page_fault ? -EFAULT : 0; 84 return 0;
75} 85}
76 86
77/* 87/*
@@ -126,8 +136,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
126 end_of_last_block = start_pos + num_bytes - 1; 136 end_of_last_block = start_pos + num_bytes - 1;
127 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 137 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
128 NULL); 138 NULL);
129 if (err) 139 BUG_ON(err);
130 return err;
131 140
132 for (i = 0; i < num_pages; i++) { 141 for (i = 0; i < num_pages; i++) {
133 struct page *p = pages[i]; 142 struct page *p = pages[i];
@@ -142,7 +151,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
142 * at this time. 151 * at this time.
143 */ 152 */
144 } 153 }
145 return err; 154 return 0;
146} 155}
147 156
148/* 157/*
@@ -823,45 +832,46 @@ again:
823 return 0; 832 return 0;
824} 833}
825 834
826static ssize_t btrfs_file_write(struct file *file, const char __user *buf, 835static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
827 size_t count, loff_t *ppos) 836 const struct iovec *iov,
837 unsigned long nr_segs, loff_t pos)
828{ 838{
829 loff_t pos; 839 struct file *file = iocb->ki_filp;
840 struct inode *inode = fdentry(file)->d_inode;
841 struct btrfs_root *root = BTRFS_I(inode)->root;
842 struct page *pinned[2];
843 struct page **pages = NULL;
844 struct iov_iter i;
845 loff_t *ppos = &iocb->ki_pos;
830 loff_t start_pos; 846 loff_t start_pos;
831 ssize_t num_written = 0; 847 ssize_t num_written = 0;
832 ssize_t err = 0; 848 ssize_t err = 0;
849 size_t count;
850 size_t ocount;
833 int ret = 0; 851 int ret = 0;
834 struct inode *inode = fdentry(file)->d_inode;
835 struct btrfs_root *root = BTRFS_I(inode)->root;
836 struct page **pages = NULL;
837 int nrptrs; 852 int nrptrs;
838 struct page *pinned[2];
839 unsigned long first_index; 853 unsigned long first_index;
840 unsigned long last_index; 854 unsigned long last_index;
841 int will_write; 855 int will_write;
856 int buffered = 0;
842 857
843 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 858 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
844 (file->f_flags & O_DIRECT)); 859 (file->f_flags & O_DIRECT));
845 860
846 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
847 PAGE_CACHE_SIZE / (sizeof(struct page *)));
848 pinned[0] = NULL; 861 pinned[0] = NULL;
849 pinned[1] = NULL; 862 pinned[1] = NULL;
850 863
851 pos = *ppos;
852 start_pos = pos; 864 start_pos = pos;
853 865
854 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 866 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
855 867
856 /* do the reserve before the mutex lock in case we have to do some
857 * flushing. We wouldn't deadlock, but this is more polite.
858 */
859 err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
860 if (err)
861 goto out_nolock;
862
863 mutex_lock(&inode->i_mutex); 868 mutex_lock(&inode->i_mutex);
864 869
870 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
871 if (err)
872 goto out;
873 count = ocount;
874
865 current->backing_dev_info = inode->i_mapping->backing_dev_info; 875 current->backing_dev_info = inode->i_mapping->backing_dev_info;
866 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 876 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
867 if (err) 877 if (err)
@@ -875,15 +885,53 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
875 goto out; 885 goto out;
876 886
877 file_update_time(file); 887 file_update_time(file);
888 BTRFS_I(inode)->sequence++;
889
890 if (unlikely(file->f_flags & O_DIRECT)) {
891 num_written = generic_file_direct_write(iocb, iov, &nr_segs,
892 pos, ppos, count,
893 ocount);
894 /*
895 * the generic O_DIRECT will update in-memory i_size after the
896 * DIOs are done. But our endio handlers that update the on
897 * disk i_size never update past the in memory i_size. So we
898 * need one more update here to catch any additions to the
899 * file
900 */
901 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
902 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
903 mark_inode_dirty(inode);
904 }
878 905
906 if (num_written < 0) {
907 ret = num_written;
908 num_written = 0;
909 goto out;
910 } else if (num_written == count) {
911 /* pick up pos changes done by the generic code */
912 pos = *ppos;
913 goto out;
914 }
915 /*
916 * We are going to do buffered for the rest of the range, so we
917 * need to make sure to invalidate the buffered pages when we're
918 * done.
919 */
920 buffered = 1;
921 pos += num_written;
922 }
923
924 iov_iter_init(&i, iov, nr_segs, count, num_written);
925 nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
926 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
927 (sizeof(struct page *)));
879 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 928 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
880 929
881 /* generic_write_checks can change our pos */ 930 /* generic_write_checks can change our pos */
882 start_pos = pos; 931 start_pos = pos;
883 932
884 BTRFS_I(inode)->sequence++;
885 first_index = pos >> PAGE_CACHE_SHIFT; 933 first_index = pos >> PAGE_CACHE_SHIFT;
886 last_index = (pos + count) >> PAGE_CACHE_SHIFT; 934 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
887 935
888 /* 936 /*
889 * there are lots of better ways to do this, but this code 937 * there are lots of better ways to do this, but this code
@@ -900,7 +948,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
900 unlock_page(pinned[0]); 948 unlock_page(pinned[0]);
901 } 949 }
902 } 950 }
903 if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { 951 if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
904 pinned[1] = grab_cache_page(inode->i_mapping, last_index); 952 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
905 if (!PageUptodate(pinned[1])) { 953 if (!PageUptodate(pinned[1])) {
906 ret = btrfs_readpage(NULL, pinned[1]); 954 ret = btrfs_readpage(NULL, pinned[1]);
@@ -911,10 +959,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
911 } 959 }
912 } 960 }
913 961
914 while (count > 0) { 962 while (iov_iter_count(&i) > 0) {
915 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 963 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
916 size_t write_bytes = min(count, nrptrs * 964 size_t write_bytes = min(iov_iter_count(&i),
917 (size_t)PAGE_CACHE_SIZE - 965 nrptrs * (size_t)PAGE_CACHE_SIZE -
918 offset); 966 offset);
919 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> 967 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
920 PAGE_CACHE_SHIFT; 968 PAGE_CACHE_SHIFT;
@@ -922,7 +970,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
922 WARN_ON(num_pages > nrptrs); 970 WARN_ON(num_pages > nrptrs);
923 memset(pages, 0, sizeof(struct page *) * nrptrs); 971 memset(pages, 0, sizeof(struct page *) * nrptrs);
924 972
925 ret = btrfs_check_data_free_space(root, inode, write_bytes); 973 ret = btrfs_delalloc_reserve_space(inode, write_bytes);
926 if (ret) 974 if (ret)
927 goto out; 975 goto out;
928 976
@@ -930,26 +978,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
930 pos, first_index, last_index, 978 pos, first_index, last_index,
931 write_bytes); 979 write_bytes);
932 if (ret) { 980 if (ret) {
933 btrfs_free_reserved_data_space(root, inode, 981 btrfs_delalloc_release_space(inode, write_bytes);
934 write_bytes);
935 goto out; 982 goto out;
936 } 983 }
937 984
938 ret = btrfs_copy_from_user(pos, num_pages, 985 ret = btrfs_copy_from_user(pos, num_pages,
939 write_bytes, pages, buf); 986 write_bytes, pages, &i);
940 if (ret) { 987 if (ret == 0) {
941 btrfs_free_reserved_data_space(root, inode, 988 dirty_and_release_pages(NULL, root, file, pages,
942 write_bytes); 989 num_pages, pos, write_bytes);
943 btrfs_drop_pages(pages, num_pages);
944 goto out;
945 } 990 }
946 991
947 ret = dirty_and_release_pages(NULL, root, file, pages,
948 num_pages, pos, write_bytes);
949 btrfs_drop_pages(pages, num_pages); 992 btrfs_drop_pages(pages, num_pages);
950 if (ret) { 993 if (ret) {
951 btrfs_free_reserved_data_space(root, inode, 994 btrfs_delalloc_release_space(inode, write_bytes);
952 write_bytes);
953 goto out; 995 goto out;
954 } 996 }
955 997
@@ -965,8 +1007,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
965 btrfs_throttle(root); 1007 btrfs_throttle(root);
966 } 1008 }
967 1009
968 buf += write_bytes;
969 count -= write_bytes;
970 pos += write_bytes; 1010 pos += write_bytes;
971 num_written += write_bytes; 1011 num_written += write_bytes;
972 1012
@@ -976,9 +1016,7 @@ out:
976 mutex_unlock(&inode->i_mutex); 1016 mutex_unlock(&inode->i_mutex);
977 if (ret) 1017 if (ret)
978 err = ret; 1018 err = ret;
979 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
980 1019
981out_nolock:
982 kfree(pages); 1020 kfree(pages);
983 if (pinned[0]) 1021 if (pinned[0])
984 page_cache_release(pinned[0]); 1022 page_cache_release(pinned[0]);
@@ -1008,7 +1046,7 @@ out_nolock:
1008 num_written = err; 1046 num_written = err;
1009 1047
1010 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 1048 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1011 trans = btrfs_start_transaction(root, 1); 1049 trans = btrfs_start_transaction(root, 0);
1012 ret = btrfs_log_dentry_safe(trans, root, 1050 ret = btrfs_log_dentry_safe(trans, root,
1013 file->f_dentry); 1051 file->f_dentry);
1014 if (ret == 0) { 1052 if (ret == 0) {
@@ -1023,7 +1061,7 @@ out_nolock:
1023 btrfs_end_transaction(trans, root); 1061 btrfs_end_transaction(trans, root);
1024 } 1062 }
1025 } 1063 }
1026 if (file->f_flags & O_DIRECT) { 1064 if (file->f_flags & O_DIRECT && buffered) {
1027 invalidate_mapping_pages(inode->i_mapping, 1065 invalidate_mapping_pages(inode->i_mapping,
1028 start_pos >> PAGE_CACHE_SHIFT, 1066 start_pos >> PAGE_CACHE_SHIFT,
1029 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); 1067 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
@@ -1063,8 +1101,9 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1063 * important optimization for directories because holding the mutex prevents 1101 * important optimization for directories because holding the mutex prevents
1064 * new operations on the dir while we write to disk. 1102 * new operations on the dir while we write to disk.
1065 */ 1103 */
1066int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) 1104int btrfs_sync_file(struct file *file, int datasync)
1067{ 1105{
1106 struct dentry *dentry = file->f_path.dentry;
1068 struct inode *inode = dentry->d_inode; 1107 struct inode *inode = dentry->d_inode;
1069 struct btrfs_root *root = BTRFS_I(inode)->root; 1108 struct btrfs_root *root = BTRFS_I(inode)->root;
1070 int ret = 0; 1109 int ret = 0;
@@ -1104,9 +1143,9 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1104 if (file && file->private_data) 1143 if (file && file->private_data)
1105 btrfs_ioctl_trans_end(file); 1144 btrfs_ioctl_trans_end(file);
1106 1145
1107 trans = btrfs_start_transaction(root, 1); 1146 trans = btrfs_start_transaction(root, 0);
1108 if (!trans) { 1147 if (IS_ERR(trans)) {
1109 ret = -ENOMEM; 1148 ret = PTR_ERR(trans);
1110 goto out; 1149 goto out;
1111 } 1150 }
1112 1151
@@ -1161,7 +1200,7 @@ const struct file_operations btrfs_file_operations = {
1161 .read = do_sync_read, 1200 .read = do_sync_read,
1162 .aio_read = generic_file_aio_read, 1201 .aio_read = generic_file_aio_read,
1163 .splice_read = generic_file_splice_read, 1202 .splice_read = generic_file_splice_read,
1164 .write = btrfs_file_write, 1203 .aio_write = btrfs_file_aio_write,
1165 .mmap = btrfs_file_mmap, 1204 .mmap = btrfs_file_mmap,
1166 .open = generic_file_open, 1205 .open = generic_file_open,
1167 .release = btrfs_release_file, 1206 .release = btrfs_release_file,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 72ce3c173d6a..64f1150bb48d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -49,6 +49,33 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
49 return 0; 49 return 0;
50} 50}
51 51
52struct btrfs_inode_ref *
53btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root,
55 struct btrfs_path *path,
56 const char *name, int name_len,
57 u64 inode_objectid, u64 ref_objectid, int mod)
58{
59 struct btrfs_key key;
60 struct btrfs_inode_ref *ref;
61 int ins_len = mod < 0 ? -1 : 0;
62 int cow = mod != 0;
63 int ret;
64
65 key.objectid = inode_objectid;
66 key.type = BTRFS_INODE_REF_KEY;
67 key.offset = ref_objectid;
68
69 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
70 if (ret < 0)
71 return ERR_PTR(ret);
72 if (ret > 0)
73 return NULL;
74 if (!find_name_in_backref(path, name, name_len, &ref))
75 return NULL;
76 return ref;
77}
78
52int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, 79int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
53 struct btrfs_root *root, 80 struct btrfs_root *root,
54 const char *name, int name_len, 81 const char *name, int name_len,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d601629b85d1..fa6ccc1bfe2a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -252,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
252 inline_len, compressed_size, 252 inline_len, compressed_size,
253 compressed_pages); 253 compressed_pages);
254 BUG_ON(ret); 254 BUG_ON(ret);
255 btrfs_delalloc_release_metadata(inode, end + 1 - start);
255 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 256 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
256 return 0; 257 return 0;
257} 258}
@@ -414,6 +415,7 @@ again:
414 trans = btrfs_join_transaction(root, 1); 415 trans = btrfs_join_transaction(root, 1);
415 BUG_ON(!trans); 416 BUG_ON(!trans);
416 btrfs_set_trans_block_group(trans, inode); 417 btrfs_set_trans_block_group(trans, inode);
418 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
417 419
418 /* lets try to make an inline extent */ 420 /* lets try to make an inline extent */
419 if (ret || total_in < (actual_end - start)) { 421 if (ret || total_in < (actual_end - start)) {
@@ -439,7 +441,6 @@ again:
439 start, end, NULL, 441 start, end, NULL,
440 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 442 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
441 EXTENT_CLEAR_DELALLOC | 443 EXTENT_CLEAR_DELALLOC |
442 EXTENT_CLEAR_ACCOUNTING |
443 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); 444 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
444 445
445 btrfs_end_transaction(trans, root); 446 btrfs_end_transaction(trans, root);
@@ -697,6 +698,38 @@ retry:
697 return 0; 698 return 0;
698} 699}
699 700
701static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
702 u64 num_bytes)
703{
704 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
705 struct extent_map *em;
706 u64 alloc_hint = 0;
707
708 read_lock(&em_tree->lock);
709 em = search_extent_mapping(em_tree, start, num_bytes);
710 if (em) {
711 /*
712 * if block start isn't an actual block number then find the
713 * first block in this inode and use that as a hint. If that
714 * block is also bogus then just don't worry about it.
715 */
716 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
717 free_extent_map(em);
718 em = search_extent_mapping(em_tree, 0, 0);
719 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
720 alloc_hint = em->block_start;
721 if (em)
722 free_extent_map(em);
723 } else {
724 alloc_hint = em->block_start;
725 free_extent_map(em);
726 }
727 }
728 read_unlock(&em_tree->lock);
729
730 return alloc_hint;
731}
732
700/* 733/*
701 * when extent_io.c finds a delayed allocation range in the file, 734 * when extent_io.c finds a delayed allocation range in the file,
702 * the call backs end up in this code. The basic idea is to 735 * the call backs end up in this code. The basic idea is to
@@ -734,6 +767,7 @@ static noinline int cow_file_range(struct inode *inode,
734 trans = btrfs_join_transaction(root, 1); 767 trans = btrfs_join_transaction(root, 1);
735 BUG_ON(!trans); 768 BUG_ON(!trans);
736 btrfs_set_trans_block_group(trans, inode); 769 btrfs_set_trans_block_group(trans, inode);
770 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
737 771
738 actual_end = min_t(u64, isize, end + 1); 772 actual_end = min_t(u64, isize, end + 1);
739 773
@@ -753,7 +787,6 @@ static noinline int cow_file_range(struct inode *inode,
753 EXTENT_CLEAR_UNLOCK_PAGE | 787 EXTENT_CLEAR_UNLOCK_PAGE |
754 EXTENT_CLEAR_UNLOCK | 788 EXTENT_CLEAR_UNLOCK |
755 EXTENT_CLEAR_DELALLOC | 789 EXTENT_CLEAR_DELALLOC |
756 EXTENT_CLEAR_ACCOUNTING |
757 EXTENT_CLEAR_DIRTY | 790 EXTENT_CLEAR_DIRTY |
758 EXTENT_SET_WRITEBACK | 791 EXTENT_SET_WRITEBACK |
759 EXTENT_END_WRITEBACK); 792 EXTENT_END_WRITEBACK);
@@ -769,29 +802,7 @@ static noinline int cow_file_range(struct inode *inode,
769 BUG_ON(disk_num_bytes > 802 BUG_ON(disk_num_bytes >
770 btrfs_super_total_bytes(&root->fs_info->super_copy)); 803 btrfs_super_total_bytes(&root->fs_info->super_copy));
771 804
772 805 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
773 read_lock(&BTRFS_I(inode)->extent_tree.lock);
774 em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
775 start, num_bytes);
776 if (em) {
777 /*
778 * if block start isn't an actual block number then find the
779 * first block in this inode and use that as a hint. If that
780 * block is also bogus then just don't worry about it.
781 */
782 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
783 free_extent_map(em);
784 em = search_extent_mapping(em_tree, 0, 0);
785 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
786 alloc_hint = em->block_start;
787 if (em)
788 free_extent_map(em);
789 } else {
790 alloc_hint = em->block_start;
791 free_extent_map(em);
792 }
793 }
794 read_unlock(&BTRFS_I(inode)->extent_tree.lock);
795 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 806 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
796 807
797 while (disk_num_bytes > 0) { 808 while (disk_num_bytes > 0) {
@@ -1174,6 +1185,13 @@ out_check:
1174 num_bytes, num_bytes, type); 1185 num_bytes, num_bytes, type);
1175 BUG_ON(ret); 1186 BUG_ON(ret);
1176 1187
1188 if (root->root_key.objectid ==
1189 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1190 ret = btrfs_reloc_clone_csums(inode, cur_offset,
1191 num_bytes);
1192 BUG_ON(ret);
1193 }
1194
1177 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1195 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1178 cur_offset, cur_offset + num_bytes - 1, 1196 cur_offset, cur_offset + num_bytes - 1,
1179 locked_page, EXTENT_CLEAR_UNLOCK_PAGE | 1197 locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@ -1226,15 +1244,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1226} 1244}
1227 1245
1228static int btrfs_split_extent_hook(struct inode *inode, 1246static int btrfs_split_extent_hook(struct inode *inode,
1229 struct extent_state *orig, u64 split) 1247 struct extent_state *orig, u64 split)
1230{ 1248{
1249 /* not delalloc, ignore it */
1231 if (!(orig->state & EXTENT_DELALLOC)) 1250 if (!(orig->state & EXTENT_DELALLOC))
1232 return 0; 1251 return 0;
1233 1252
1234 spin_lock(&BTRFS_I(inode)->accounting_lock); 1253 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
1235 BTRFS_I(inode)->outstanding_extents++;
1236 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1237
1238 return 0; 1254 return 0;
1239} 1255}
1240 1256
@@ -1252,10 +1268,7 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1252 if (!(other->state & EXTENT_DELALLOC)) 1268 if (!(other->state & EXTENT_DELALLOC))
1253 return 0; 1269 return 0;
1254 1270
1255 spin_lock(&BTRFS_I(inode)->accounting_lock); 1271 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
1256 BTRFS_I(inode)->outstanding_extents--;
1257 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1258
1259 return 0; 1272 return 0;
1260} 1273}
1261 1274
@@ -1264,8 +1277,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1264 * bytes in this file, and to maintain the list of inodes that 1277 * bytes in this file, and to maintain the list of inodes that
1265 * have pending delalloc work to be done. 1278 * have pending delalloc work to be done.
1266 */ 1279 */
1267static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, 1280static int btrfs_set_bit_hook(struct inode *inode,
1268 unsigned long old, unsigned long bits) 1281 struct extent_state *state, int *bits)
1269{ 1282{
1270 1283
1271 /* 1284 /*
@@ -1273,17 +1286,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1273 * but in this case, we are only testeing for the DELALLOC 1286 * but in this case, we are only testeing for the DELALLOC
1274 * bit, which is only set or cleared with irqs on 1287 * bit, which is only set or cleared with irqs on
1275 */ 1288 */
1276 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1289 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1277 struct btrfs_root *root = BTRFS_I(inode)->root; 1290 struct btrfs_root *root = BTRFS_I(inode)->root;
1291 u64 len = state->end + 1 - state->start;
1278 1292
1279 spin_lock(&BTRFS_I(inode)->accounting_lock); 1293 if (*bits & EXTENT_FIRST_DELALLOC)
1280 BTRFS_I(inode)->outstanding_extents++; 1294 *bits &= ~EXTENT_FIRST_DELALLOC;
1281 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1295 else
1282 btrfs_delalloc_reserve_space(root, inode, end - start + 1); 1296 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
1283 1297
1284 spin_lock(&root->fs_info->delalloc_lock); 1298 spin_lock(&root->fs_info->delalloc_lock);
1285 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1299 BTRFS_I(inode)->delalloc_bytes += len;
1286 root->fs_info->delalloc_bytes += end - start + 1; 1300 root->fs_info->delalloc_bytes += len;
1287 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1301 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1288 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1302 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1289 &root->fs_info->delalloc_inodes); 1303 &root->fs_info->delalloc_inodes);
@@ -1297,45 +1311,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1297 * extent_io.c clear_bit_hook, see set_bit_hook for why 1311 * extent_io.c clear_bit_hook, see set_bit_hook for why
1298 */ 1312 */
1299static int btrfs_clear_bit_hook(struct inode *inode, 1313static int btrfs_clear_bit_hook(struct inode *inode,
1300 struct extent_state *state, unsigned long bits) 1314 struct extent_state *state, int *bits)
1301{ 1315{
1302 /* 1316 /*
1303 * set_bit and clear bit hooks normally require _irqsave/restore 1317 * set_bit and clear bit hooks normally require _irqsave/restore
1304 * but in this case, we are only testeing for the DELALLOC 1318 * but in this case, we are only testeing for the DELALLOC
1305 * bit, which is only set or cleared with irqs on 1319 * bit, which is only set or cleared with irqs on
1306 */ 1320 */
1307 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1321 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1308 struct btrfs_root *root = BTRFS_I(inode)->root; 1322 struct btrfs_root *root = BTRFS_I(inode)->root;
1323 u64 len = state->end + 1 - state->start;
1309 1324
1310 if (bits & EXTENT_DO_ACCOUNTING) { 1325 if (*bits & EXTENT_FIRST_DELALLOC)
1311 spin_lock(&BTRFS_I(inode)->accounting_lock); 1326 *bits &= ~EXTENT_FIRST_DELALLOC;
1312 WARN_ON(!BTRFS_I(inode)->outstanding_extents); 1327 else if (!(*bits & EXTENT_DO_ACCOUNTING))
1313 BTRFS_I(inode)->outstanding_extents--; 1328 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
1314 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1329
1315 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 1330 if (*bits & EXTENT_DO_ACCOUNTING)
1316 } 1331 btrfs_delalloc_release_metadata(inode, len);
1332
1333 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
1334 btrfs_free_reserved_data_space(inode, len);
1317 1335
1318 spin_lock(&root->fs_info->delalloc_lock); 1336 spin_lock(&root->fs_info->delalloc_lock);
1319 if (state->end - state->start + 1 > 1337 root->fs_info->delalloc_bytes -= len;
1320 root->fs_info->delalloc_bytes) { 1338 BTRFS_I(inode)->delalloc_bytes -= len;
1321 printk(KERN_INFO "btrfs warning: delalloc account " 1339
1322 "%llu %llu\n",
1323 (unsigned long long)
1324 state->end - state->start + 1,
1325 (unsigned long long)
1326 root->fs_info->delalloc_bytes);
1327 btrfs_delalloc_free_space(root, inode, (u64)-1);
1328 root->fs_info->delalloc_bytes = 0;
1329 BTRFS_I(inode)->delalloc_bytes = 0;
1330 } else {
1331 btrfs_delalloc_free_space(root, inode,
1332 state->end -
1333 state->start + 1);
1334 root->fs_info->delalloc_bytes -= state->end -
1335 state->start + 1;
1336 BTRFS_I(inode)->delalloc_bytes -= state->end -
1337 state->start + 1;
1338 }
1339 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1340 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1340 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1341 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1341 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1342 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
@@ -1384,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1384 */ 1385 */
1385static int __btrfs_submit_bio_start(struct inode *inode, int rw, 1386static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1386 struct bio *bio, int mirror_num, 1387 struct bio *bio, int mirror_num,
1387 unsigned long bio_flags) 1388 unsigned long bio_flags,
1389 u64 bio_offset)
1388{ 1390{
1389 struct btrfs_root *root = BTRFS_I(inode)->root; 1391 struct btrfs_root *root = BTRFS_I(inode)->root;
1390 int ret = 0; 1392 int ret = 0;
@@ -1403,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1403 * are inserted into the btree 1405 * are inserted into the btree
1404 */ 1406 */
1405static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, 1407static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1406 int mirror_num, unsigned long bio_flags) 1408 int mirror_num, unsigned long bio_flags,
1409 u64 bio_offset)
1407{ 1410{
1408 struct btrfs_root *root = BTRFS_I(inode)->root; 1411 struct btrfs_root *root = BTRFS_I(inode)->root;
1409 return btrfs_map_bio(root, rw, bio, mirror_num, 1); 1412 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1414,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1414 * on write, or reading the csums from the tree before a read 1417 * on write, or reading the csums from the tree before a read
1415 */ 1418 */
1416static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 1419static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1417 int mirror_num, unsigned long bio_flags) 1420 int mirror_num, unsigned long bio_flags,
1421 u64 bio_offset)
1418{ 1422{
1419 struct btrfs_root *root = BTRFS_I(inode)->root; 1423 struct btrfs_root *root = BTRFS_I(inode)->root;
1420 int ret = 0; 1424 int ret = 0;
@@ -1439,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1439 /* we're doing a write, do the async checksumming */ 1443 /* we're doing a write, do the async checksumming */
1440 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1444 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1441 inode, rw, bio, mirror_num, 1445 inode, rw, bio, mirror_num,
1442 bio_flags, __btrfs_submit_bio_start, 1446 bio_flags, bio_offset,
1447 __btrfs_submit_bio_start,
1443 __btrfs_submit_bio_done); 1448 __btrfs_submit_bio_done);
1444 } 1449 }
1445 1450
@@ -1520,6 +1525,7 @@ again:
1520 goto again; 1525 goto again;
1521 } 1526 }
1522 1527
1528 BUG();
1523 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); 1529 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1524 ClearPageChecked(page); 1530 ClearPageChecked(page);
1525out: 1531out:
@@ -1650,7 +1656,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1650static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 1656static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1651{ 1657{
1652 struct btrfs_root *root = BTRFS_I(inode)->root; 1658 struct btrfs_root *root = BTRFS_I(inode)->root;
1653 struct btrfs_trans_handle *trans; 1659 struct btrfs_trans_handle *trans = NULL;
1654 struct btrfs_ordered_extent *ordered_extent = NULL; 1660 struct btrfs_ordered_extent *ordered_extent = NULL;
1655 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1661 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1656 struct extent_state *cached_state = NULL; 1662 struct extent_state *cached_state = NULL;
@@ -1668,9 +1674,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1668 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1674 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1669 if (!ret) { 1675 if (!ret) {
1670 trans = btrfs_join_transaction(root, 1); 1676 trans = btrfs_join_transaction(root, 1);
1677 btrfs_set_trans_block_group(trans, inode);
1678 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1671 ret = btrfs_update_inode(trans, root, inode); 1679 ret = btrfs_update_inode(trans, root, inode);
1672 BUG_ON(ret); 1680 BUG_ON(ret);
1673 btrfs_end_transaction(trans, root);
1674 } 1681 }
1675 goto out; 1682 goto out;
1676 } 1683 }
@@ -1680,6 +1687,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1680 0, &cached_state, GFP_NOFS); 1687 0, &cached_state, GFP_NOFS);
1681 1688
1682 trans = btrfs_join_transaction(root, 1); 1689 trans = btrfs_join_transaction(root, 1);
1690 btrfs_set_trans_block_group(trans, inode);
1691 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1683 1692
1684 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1693 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1685 compressed = 1; 1694 compressed = 1;
@@ -1711,12 +1720,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1711 add_pending_csums(trans, inode, ordered_extent->file_offset, 1720 add_pending_csums(trans, inode, ordered_extent->file_offset,
1712 &ordered_extent->list); 1721 &ordered_extent->list);
1713 1722
1714 /* this also removes the ordered extent from the tree */
1715 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1723 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1716 ret = btrfs_update_inode(trans, root, inode); 1724 ret = btrfs_update_inode(trans, root, inode);
1717 BUG_ON(ret); 1725 BUG_ON(ret);
1718 btrfs_end_transaction(trans, root);
1719out: 1726out:
1727 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1728 if (trans)
1729 btrfs_end_transaction(trans, root);
1720 /* once for us */ 1730 /* once for us */
1721 btrfs_put_ordered_extent(ordered_extent); 1731 btrfs_put_ordered_extent(ordered_extent);
1722 /* once for the tree */ 1732 /* once for the tree */
@@ -1838,7 +1848,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1838 1848
1839 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1849 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1840 failrec->last_mirror, 1850 failrec->last_mirror,
1841 failrec->bio_flags); 1851 failrec->bio_flags, 0);
1842 return 0; 1852 return 0;
1843} 1853}
1844 1854
@@ -1993,32 +2003,196 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
1993} 2003}
1994 2004
1995/* 2005/*
2006 * calculate extra metadata reservation when snapshotting a subvolume
2007 * that contains orphan files.
2008 */
2009void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2010 struct btrfs_pending_snapshot *pending,
2011 u64 *bytes_to_reserve)
2012{
2013 struct btrfs_root *root;
2014 struct btrfs_block_rsv *block_rsv;
2015 u64 num_bytes;
2016 int index;
2017
2018 root = pending->root;
2019 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2020 return;
2021
2022 block_rsv = root->orphan_block_rsv;
2023
2024 /* orphan block reservation for the snapshot */
2025 num_bytes = block_rsv->size;
2026
2027 /*
2028 * after the snapshot is created, COWing tree blocks may use more
2029 * space than it frees. So we should make sure there is enough
2030 * reserved space.
2031 */
2032 index = trans->transid & 0x1;
2033 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2034 num_bytes += block_rsv->size -
2035 (block_rsv->reserved + block_rsv->freed[index]);
2036 }
2037
2038 *bytes_to_reserve += num_bytes;
2039}
2040
2041void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2042 struct btrfs_pending_snapshot *pending)
2043{
2044 struct btrfs_root *root = pending->root;
2045 struct btrfs_root *snap = pending->snap;
2046 struct btrfs_block_rsv *block_rsv;
2047 u64 num_bytes;
2048 int index;
2049 int ret;
2050
2051 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2052 return;
2053
2054 /* refill source subvolume's orphan block reservation */
2055 block_rsv = root->orphan_block_rsv;
2056 index = trans->transid & 0x1;
2057 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2058 num_bytes = block_rsv->size -
2059 (block_rsv->reserved + block_rsv->freed[index]);
2060 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2061 root->orphan_block_rsv,
2062 num_bytes);
2063 BUG_ON(ret);
2064 }
2065
2066 /* setup orphan block reservation for the snapshot */
2067 block_rsv = btrfs_alloc_block_rsv(snap);
2068 BUG_ON(!block_rsv);
2069
2070 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2071 snap->orphan_block_rsv = block_rsv;
2072
2073 num_bytes = root->orphan_block_rsv->size;
2074 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2075 block_rsv, num_bytes);
2076 BUG_ON(ret);
2077
2078#if 0
2079 /* insert orphan item for the snapshot */
2080 WARN_ON(!root->orphan_item_inserted);
2081 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2082 snap->root_key.objectid);
2083 BUG_ON(ret);
2084 snap->orphan_item_inserted = 1;
2085#endif
2086}
2087
2088enum btrfs_orphan_cleanup_state {
2089 ORPHAN_CLEANUP_STARTED = 1,
2090 ORPHAN_CLEANUP_DONE = 2,
2091};
2092
2093/*
2094 * This is called in transaction commit time. If there are no orphan
2095 * files in the subvolume, it removes orphan item and frees block_rsv
2096 * structure.
2097 */
2098void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2099 struct btrfs_root *root)
2100{
2101 int ret;
2102
2103 if (!list_empty(&root->orphan_list) ||
2104 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2105 return;
2106
2107 if (root->orphan_item_inserted &&
2108 btrfs_root_refs(&root->root_item) > 0) {
2109 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2110 root->root_key.objectid);
2111 BUG_ON(ret);
2112 root->orphan_item_inserted = 0;
2113 }
2114
2115 if (root->orphan_block_rsv) {
2116 WARN_ON(root->orphan_block_rsv->size > 0);
2117 btrfs_free_block_rsv(root, root->orphan_block_rsv);
2118 root->orphan_block_rsv = NULL;
2119 }
2120}
2121
2122/*
1996 * This creates an orphan entry for the given inode in case something goes 2123 * This creates an orphan entry for the given inode in case something goes
1997 * wrong in the middle of an unlink/truncate. 2124 * wrong in the middle of an unlink/truncate.
2125 *
2126 * NOTE: caller of this function should reserve 5 units of metadata for
2127 * this function.
1998 */ 2128 */
1999int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 2129int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2000{ 2130{
2001 struct btrfs_root *root = BTRFS_I(inode)->root; 2131 struct btrfs_root *root = BTRFS_I(inode)->root;
2002 int ret = 0; 2132 struct btrfs_block_rsv *block_rsv = NULL;
2133 int reserve = 0;
2134 int insert = 0;
2135 int ret;
2003 2136
2004 spin_lock(&root->list_lock); 2137 if (!root->orphan_block_rsv) {
2138 block_rsv = btrfs_alloc_block_rsv(root);
2139 BUG_ON(!block_rsv);
2140 }
2005 2141
2006 /* already on the orphan list, we're good */ 2142 spin_lock(&root->orphan_lock);
2007 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 2143 if (!root->orphan_block_rsv) {
2008 spin_unlock(&root->list_lock); 2144 root->orphan_block_rsv = block_rsv;
2009 return 0; 2145 } else if (block_rsv) {
2146 btrfs_free_block_rsv(root, block_rsv);
2147 block_rsv = NULL;
2148 }
2149
2150 if (list_empty(&BTRFS_I(inode)->i_orphan)) {
2151 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2152#if 0
2153 /*
2154 * For proper ENOSPC handling, we should do orphan
2155 * cleanup when mounting. But this introduces backward
2156 * compatibility issue.
2157 */
2158 if (!xchg(&root->orphan_item_inserted, 1))
2159 insert = 2;
2160 else
2161 insert = 1;
2162#endif
2163 insert = 1;
2164 } else {
2165 WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
2010 } 2166 }
2011 2167
2012 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2168 if (!BTRFS_I(inode)->orphan_meta_reserved) {
2169 BTRFS_I(inode)->orphan_meta_reserved = 1;
2170 reserve = 1;
2171 }
2172 spin_unlock(&root->orphan_lock);
2013 2173
2014 spin_unlock(&root->list_lock); 2174 if (block_rsv)
2175 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2015 2176
2016 /* 2177 /* grab metadata reservation from transaction handle */
2017 * insert an orphan item to track this unlinked/truncated file 2178 if (reserve) {
2018 */ 2179 ret = btrfs_orphan_reserve_metadata(trans, inode);
2019 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); 2180 BUG_ON(ret);
2181 }
2020 2182
2021 return ret; 2183 /* insert an orphan item to track this unlinked/truncated file */
2184 if (insert >= 1) {
2185 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
2186 BUG_ON(ret);
2187 }
2188
2189 /* insert an orphan item to track that the subvolume contains orphan files */
2190 if (insert >= 2) {
2191 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2192 root->root_key.objectid);
2193 BUG_ON(ret);
2194 }
2195 return 0;
2022} 2196}
2023 2197
2024/* 2198/*
@@ -2028,26 +2202,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2028int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) 2202int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2029{ 2203{
2030 struct btrfs_root *root = BTRFS_I(inode)->root; 2204 struct btrfs_root *root = BTRFS_I(inode)->root;
2205 int delete_item = 0;
2206 int release_rsv = 0;
2031 int ret = 0; 2207 int ret = 0;
2032 2208
2033 spin_lock(&root->list_lock); 2209 spin_lock(&root->orphan_lock);
2034 2210 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
2035 if (list_empty(&BTRFS_I(inode)->i_orphan)) { 2211 list_del_init(&BTRFS_I(inode)->i_orphan);
2036 spin_unlock(&root->list_lock); 2212 delete_item = 1;
2037 return 0;
2038 } 2213 }
2039 2214
2040 list_del_init(&BTRFS_I(inode)->i_orphan); 2215 if (BTRFS_I(inode)->orphan_meta_reserved) {
2041 if (!trans) { 2216 BTRFS_I(inode)->orphan_meta_reserved = 0;
2042 spin_unlock(&root->list_lock); 2217 release_rsv = 1;
2043 return 0;
2044 } 2218 }
2219 spin_unlock(&root->orphan_lock);
2045 2220
2046 spin_unlock(&root->list_lock); 2221 if (trans && delete_item) {
2222 ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
2223 BUG_ON(ret);
2224 }
2047 2225
2048 ret = btrfs_del_orphan_item(trans, root, inode->i_ino); 2226 if (release_rsv)
2227 btrfs_orphan_release_metadata(inode);
2049 2228
2050 return ret; 2229 return 0;
2051} 2230}
2052 2231
2053/* 2232/*
@@ -2064,7 +2243,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2064 struct inode *inode; 2243 struct inode *inode;
2065 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2244 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2066 2245
2067 if (!xchg(&root->clean_orphans, 0)) 2246 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
2068 return; 2247 return;
2069 2248
2070 path = btrfs_alloc_path(); 2249 path = btrfs_alloc_path();
@@ -2117,16 +2296,15 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2117 found_key.type = BTRFS_INODE_ITEM_KEY; 2296 found_key.type = BTRFS_INODE_ITEM_KEY;
2118 found_key.offset = 0; 2297 found_key.offset = 0;
2119 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2298 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2120 if (IS_ERR(inode)) 2299 BUG_ON(IS_ERR(inode));
2121 break;
2122 2300
2123 /* 2301 /*
2124 * add this inode to the orphan list so btrfs_orphan_del does 2302 * add this inode to the orphan list so btrfs_orphan_del does
2125 * the proper thing when we hit it 2303 * the proper thing when we hit it
2126 */ 2304 */
2127 spin_lock(&root->list_lock); 2305 spin_lock(&root->orphan_lock);
2128 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2306 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2129 spin_unlock(&root->list_lock); 2307 spin_unlock(&root->orphan_lock);
2130 2308
2131 /* 2309 /*
2132 * if this is a bad inode, means we actually succeeded in 2310 * if this is a bad inode, means we actually succeeded in
@@ -2135,7 +2313,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2135 * do a destroy_inode 2313 * do a destroy_inode
2136 */ 2314 */
2137 if (is_bad_inode(inode)) { 2315 if (is_bad_inode(inode)) {
2138 trans = btrfs_start_transaction(root, 1); 2316 trans = btrfs_start_transaction(root, 0);
2139 btrfs_orphan_del(trans, inode); 2317 btrfs_orphan_del(trans, inode);
2140 btrfs_end_transaction(trans, root); 2318 btrfs_end_transaction(trans, root);
2141 iput(inode); 2319 iput(inode);
@@ -2153,13 +2331,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2153 /* this will do delete_inode and everything for us */ 2331 /* this will do delete_inode and everything for us */
2154 iput(inode); 2332 iput(inode);
2155 } 2333 }
2334 btrfs_free_path(path);
2335
2336 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2337
2338 if (root->orphan_block_rsv)
2339 btrfs_block_rsv_release(root, root->orphan_block_rsv,
2340 (u64)-1);
2341
2342 if (root->orphan_block_rsv || root->orphan_item_inserted) {
2343 trans = btrfs_join_transaction(root, 1);
2344 btrfs_end_transaction(trans, root);
2345 }
2156 2346
2157 if (nr_unlink) 2347 if (nr_unlink)
2158 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2348 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
2159 if (nr_truncate) 2349 if (nr_truncate)
2160 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2350 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
2161
2162 btrfs_free_path(path);
2163} 2351}
2164 2352
2165/* 2353/*
@@ -2478,29 +2666,201 @@ out:
2478 return ret; 2666 return ret;
2479} 2667}
2480 2668
2481static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2669/* helper to check if there is any shared block in the path */
2670static int check_path_shared(struct btrfs_root *root,
2671 struct btrfs_path *path)
2672{
2673 struct extent_buffer *eb;
2674 int level;
2675 int ret;
2676 u64 refs;
2677
2678 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2679 if (!path->nodes[level])
2680 break;
2681 eb = path->nodes[level];
2682 if (!btrfs_block_can_be_shared(root, eb))
2683 continue;
2684 ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
2685 &refs, NULL);
2686 if (refs > 1)
2687 return 1;
2688 }
2689 return 0;
2690}
2691
2692/*
2693 * helper to start transaction for unlink and rmdir.
2694 *
2695 * unlink and rmdir are special in btrfs, they do not always free space.
2696 * so in enospc case, we should make sure they will free space before
2697 * allowing them to use the global metadata reservation.
2698 */
2699static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2700 struct dentry *dentry)
2482{ 2701{
2483 struct btrfs_root *root;
2484 struct btrfs_trans_handle *trans; 2702 struct btrfs_trans_handle *trans;
2703 struct btrfs_root *root = BTRFS_I(dir)->root;
2704 struct btrfs_path *path;
2705 struct btrfs_inode_ref *ref;
2706 struct btrfs_dir_item *di;
2485 struct inode *inode = dentry->d_inode; 2707 struct inode *inode = dentry->d_inode;
2708 u64 index;
2709 int check_link = 1;
2710 int err = -ENOSPC;
2486 int ret; 2711 int ret;
2487 unsigned long nr = 0;
2488 2712
2489 root = BTRFS_I(dir)->root; 2713 trans = btrfs_start_transaction(root, 10);
2714 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2715 return trans;
2490 2716
2491 /* 2717 if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
2492 * 5 items for unlink inode 2718 return ERR_PTR(-ENOSPC);
2493 * 1 for orphan 2719
2494 */ 2720 /* check if there is someone else holds reference */
2495 ret = btrfs_reserve_metadata_space(root, 6); 2721 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
2496 if (ret) 2722 return ERR_PTR(-ENOSPC);
2497 return ret; 2723
2724 if (atomic_read(&inode->i_count) > 2)
2725 return ERR_PTR(-ENOSPC);
2726
2727 if (xchg(&root->fs_info->enospc_unlink, 1))
2728 return ERR_PTR(-ENOSPC);
2498 2729
2499 trans = btrfs_start_transaction(root, 1); 2730 path = btrfs_alloc_path();
2731 if (!path) {
2732 root->fs_info->enospc_unlink = 0;
2733 return ERR_PTR(-ENOMEM);
2734 }
2735
2736 trans = btrfs_start_transaction(root, 0);
2500 if (IS_ERR(trans)) { 2737 if (IS_ERR(trans)) {
2501 btrfs_unreserve_metadata_space(root, 6); 2738 btrfs_free_path(path);
2502 return PTR_ERR(trans); 2739 root->fs_info->enospc_unlink = 0;
2740 return trans;
2741 }
2742
2743 path->skip_locking = 1;
2744 path->search_commit_root = 1;
2745
2746 ret = btrfs_lookup_inode(trans, root, path,
2747 &BTRFS_I(dir)->location, 0);
2748 if (ret < 0) {
2749 err = ret;
2750 goto out;
2751 }
2752 if (ret == 0) {
2753 if (check_path_shared(root, path))
2754 goto out;
2755 } else {
2756 check_link = 0;
2757 }
2758 btrfs_release_path(root, path);
2759
2760 ret = btrfs_lookup_inode(trans, root, path,
2761 &BTRFS_I(inode)->location, 0);
2762 if (ret < 0) {
2763 err = ret;
2764 goto out;
2765 }
2766 if (ret == 0) {
2767 if (check_path_shared(root, path))
2768 goto out;
2769 } else {
2770 check_link = 0;
2771 }
2772 btrfs_release_path(root, path);
2773
2774 if (ret == 0 && S_ISREG(inode->i_mode)) {
2775 ret = btrfs_lookup_file_extent(trans, root, path,
2776 inode->i_ino, (u64)-1, 0);
2777 if (ret < 0) {
2778 err = ret;
2779 goto out;
2780 }
2781 BUG_ON(ret == 0);
2782 if (check_path_shared(root, path))
2783 goto out;
2784 btrfs_release_path(root, path);
2785 }
2786
2787 if (!check_link) {
2788 err = 0;
2789 goto out;
2790 }
2791
2792 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2793 dentry->d_name.name, dentry->d_name.len, 0);
2794 if (IS_ERR(di)) {
2795 err = PTR_ERR(di);
2796 goto out;
2797 }
2798 if (di) {
2799 if (check_path_shared(root, path))
2800 goto out;
2801 } else {
2802 err = 0;
2803 goto out;
2503 } 2804 }
2805 btrfs_release_path(root, path);
2806
2807 ref = btrfs_lookup_inode_ref(trans, root, path,
2808 dentry->d_name.name, dentry->d_name.len,
2809 inode->i_ino, dir->i_ino, 0);
2810 if (IS_ERR(ref)) {
2811 err = PTR_ERR(ref);
2812 goto out;
2813 }
2814 BUG_ON(!ref);
2815 if (check_path_shared(root, path))
2816 goto out;
2817 index = btrfs_inode_ref_index(path->nodes[0], ref);
2818 btrfs_release_path(root, path);
2819
2820 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
2821 dentry->d_name.name, dentry->d_name.len, 0);
2822 if (IS_ERR(di)) {
2823 err = PTR_ERR(di);
2824 goto out;
2825 }
2826 BUG_ON(ret == -ENOENT);
2827 if (check_path_shared(root, path))
2828 goto out;
2829
2830 err = 0;
2831out:
2832 btrfs_free_path(path);
2833 if (err) {
2834 btrfs_end_transaction(trans, root);
2835 root->fs_info->enospc_unlink = 0;
2836 return ERR_PTR(err);
2837 }
2838
2839 trans->block_rsv = &root->fs_info->global_block_rsv;
2840 return trans;
2841}
2842
2843static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2844 struct btrfs_root *root)
2845{
2846 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2847 BUG_ON(!root->fs_info->enospc_unlink);
2848 root->fs_info->enospc_unlink = 0;
2849 }
2850 btrfs_end_transaction_throttle(trans, root);
2851}
2852
2853static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2854{
2855 struct btrfs_root *root = BTRFS_I(dir)->root;
2856 struct btrfs_trans_handle *trans;
2857 struct inode *inode = dentry->d_inode;
2858 int ret;
2859 unsigned long nr = 0;
2860
2861 trans = __unlink_start_trans(dir, dentry);
2862 if (IS_ERR(trans))
2863 return PTR_ERR(trans);
2504 2864
2505 btrfs_set_trans_block_group(trans, dir); 2865 btrfs_set_trans_block_group(trans, dir);
2506 2866
@@ -2508,14 +2868,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2508 2868
2509 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2869 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2510 dentry->d_name.name, dentry->d_name.len); 2870 dentry->d_name.name, dentry->d_name.len);
2871 BUG_ON(ret);
2511 2872
2512 if (inode->i_nlink == 0) 2873 if (inode->i_nlink == 0) {
2513 ret = btrfs_orphan_add(trans, inode); 2874 ret = btrfs_orphan_add(trans, inode);
2875 BUG_ON(ret);
2876 }
2514 2877
2515 nr = trans->blocks_used; 2878 nr = trans->blocks_used;
2516 2879 __unlink_end_trans(trans, root);
2517 btrfs_end_transaction_throttle(trans, root);
2518 btrfs_unreserve_metadata_space(root, 6);
2519 btrfs_btree_balance_dirty(root, nr); 2880 btrfs_btree_balance_dirty(root, nr);
2520 return ret; 2881 return ret;
2521} 2882}
@@ -2587,7 +2948,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2587{ 2948{
2588 struct inode *inode = dentry->d_inode; 2949 struct inode *inode = dentry->d_inode;
2589 int err = 0; 2950 int err = 0;
2590 int ret;
2591 struct btrfs_root *root = BTRFS_I(dir)->root; 2951 struct btrfs_root *root = BTRFS_I(dir)->root;
2592 struct btrfs_trans_handle *trans; 2952 struct btrfs_trans_handle *trans;
2593 unsigned long nr = 0; 2953 unsigned long nr = 0;
@@ -2596,15 +2956,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2596 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 2956 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
2597 return -ENOTEMPTY; 2957 return -ENOTEMPTY;
2598 2958
2599 ret = btrfs_reserve_metadata_space(root, 5); 2959 trans = __unlink_start_trans(dir, dentry);
2600 if (ret) 2960 if (IS_ERR(trans))
2601 return ret;
2602
2603 trans = btrfs_start_transaction(root, 1);
2604 if (IS_ERR(trans)) {
2605 btrfs_unreserve_metadata_space(root, 5);
2606 return PTR_ERR(trans); 2961 return PTR_ERR(trans);
2607 }
2608 2962
2609 btrfs_set_trans_block_group(trans, dir); 2963 btrfs_set_trans_block_group(trans, dir);
2610 2964
@@ -2627,12 +2981,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2627 btrfs_i_size_write(inode, 0); 2981 btrfs_i_size_write(inode, 0);
2628out: 2982out:
2629 nr = trans->blocks_used; 2983 nr = trans->blocks_used;
2630 ret = btrfs_end_transaction_throttle(trans, root); 2984 __unlink_end_trans(trans, root);
2631 btrfs_unreserve_metadata_space(root, 5);
2632 btrfs_btree_balance_dirty(root, nr); 2985 btrfs_btree_balance_dirty(root, nr);
2633 2986
2634 if (ret && !err)
2635 err = ret;
2636 return err; 2987 return err;
2637} 2988}
2638 2989
@@ -3029,6 +3380,7 @@ out:
3029 if (pending_del_nr) { 3380 if (pending_del_nr) {
3030 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3381 ret = btrfs_del_items(trans, root, path, pending_del_slot,
3031 pending_del_nr); 3382 pending_del_nr);
3383 BUG_ON(ret);
3032 } 3384 }
3033 btrfs_free_path(path); 3385 btrfs_free_path(path);
3034 return err; 3386 return err;
@@ -3056,11 +3408,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3056 3408
3057 if ((offset & (blocksize - 1)) == 0) 3409 if ((offset & (blocksize - 1)) == 0)
3058 goto out; 3410 goto out;
3059 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 3411 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
3060 if (ret)
3061 goto out;
3062
3063 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
3064 if (ret) 3412 if (ret)
3065 goto out; 3413 goto out;
3066 3414
@@ -3068,8 +3416,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3068again: 3416again:
3069 page = grab_cache_page(mapping, index); 3417 page = grab_cache_page(mapping, index);
3070 if (!page) { 3418 if (!page) {
3071 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3419 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3072 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3073 goto out; 3420 goto out;
3074 } 3421 }
3075 3422
@@ -3132,8 +3479,7 @@ again:
3132 3479
3133out_unlock: 3480out_unlock:
3134 if (ret) 3481 if (ret)
3135 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3482 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3136 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3137 unlock_page(page); 3483 unlock_page(page);
3138 page_cache_release(page); 3484 page_cache_release(page);
3139out: 3485out:
@@ -3145,7 +3491,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3145 struct btrfs_trans_handle *trans; 3491 struct btrfs_trans_handle *trans;
3146 struct btrfs_root *root = BTRFS_I(inode)->root; 3492 struct btrfs_root *root = BTRFS_I(inode)->root;
3147 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3493 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3148 struct extent_map *em; 3494 struct extent_map *em = NULL;
3149 struct extent_state *cached_state = NULL; 3495 struct extent_state *cached_state = NULL;
3150 u64 mask = root->sectorsize - 1; 3496 u64 mask = root->sectorsize - 1;
3151 u64 hole_start = (inode->i_size + mask) & ~mask; 3497 u64 hole_start = (inode->i_size + mask) & ~mask;
@@ -3183,11 +3529,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3183 u64 hint_byte = 0; 3529 u64 hint_byte = 0;
3184 hole_size = last_byte - cur_offset; 3530 hole_size = last_byte - cur_offset;
3185 3531
3186 err = btrfs_reserve_metadata_space(root, 2); 3532 trans = btrfs_start_transaction(root, 2);
3187 if (err) 3533 if (IS_ERR(trans)) {
3534 err = PTR_ERR(trans);
3188 break; 3535 break;
3189 3536 }
3190 trans = btrfs_start_transaction(root, 1);
3191 btrfs_set_trans_block_group(trans, inode); 3537 btrfs_set_trans_block_group(trans, inode);
3192 3538
3193 err = btrfs_drop_extents(trans, inode, cur_offset, 3539 err = btrfs_drop_extents(trans, inode, cur_offset,
@@ -3205,14 +3551,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3205 last_byte - 1, 0); 3551 last_byte - 1, 0);
3206 3552
3207 btrfs_end_transaction(trans, root); 3553 btrfs_end_transaction(trans, root);
3208 btrfs_unreserve_metadata_space(root, 2);
3209 } 3554 }
3210 free_extent_map(em); 3555 free_extent_map(em);
3556 em = NULL;
3211 cur_offset = last_byte; 3557 cur_offset = last_byte;
3212 if (cur_offset >= block_end) 3558 if (cur_offset >= block_end)
3213 break; 3559 break;
3214 } 3560 }
3215 3561
3562 free_extent_map(em);
3216 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, 3563 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
3217 GFP_NOFS); 3564 GFP_NOFS);
3218 return err; 3565 return err;
@@ -3239,11 +3586,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3239 } 3586 }
3240 } 3587 }
3241 3588
3242 ret = btrfs_reserve_metadata_space(root, 1); 3589 trans = btrfs_start_transaction(root, 5);
3243 if (ret) 3590 if (IS_ERR(trans))
3244 return ret; 3591 return PTR_ERR(trans);
3245 3592
3246 trans = btrfs_start_transaction(root, 1);
3247 btrfs_set_trans_block_group(trans, inode); 3593 btrfs_set_trans_block_group(trans, inode);
3248 3594
3249 ret = btrfs_orphan_add(trans, inode); 3595 ret = btrfs_orphan_add(trans, inode);
@@ -3251,7 +3597,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3251 3597
3252 nr = trans->blocks_used; 3598 nr = trans->blocks_used;
3253 btrfs_end_transaction(trans, root); 3599 btrfs_end_transaction(trans, root);
3254 btrfs_unreserve_metadata_space(root, 1);
3255 btrfs_btree_balance_dirty(root, nr); 3600 btrfs_btree_balance_dirty(root, nr);
3256 3601
3257 if (attr->ia_size > inode->i_size) { 3602 if (attr->ia_size > inode->i_size) {
@@ -3264,8 +3609,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3264 i_size_write(inode, attr->ia_size); 3609 i_size_write(inode, attr->ia_size);
3265 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 3610 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
3266 3611
3267 trans = btrfs_start_transaction(root, 1); 3612 trans = btrfs_start_transaction(root, 0);
3613 BUG_ON(IS_ERR(trans));
3268 btrfs_set_trans_block_group(trans, inode); 3614 btrfs_set_trans_block_group(trans, inode);
3615 trans->block_rsv = root->orphan_block_rsv;
3616 BUG_ON(!trans->block_rsv);
3269 3617
3270 ret = btrfs_update_inode(trans, root, inode); 3618 ret = btrfs_update_inode(trans, root, inode);
3271 BUG_ON(ret); 3619 BUG_ON(ret);
@@ -3345,10 +3693,21 @@ void btrfs_delete_inode(struct inode *inode)
3345 btrfs_i_size_write(inode, 0); 3693 btrfs_i_size_write(inode, 0);
3346 3694
3347 while (1) { 3695 while (1) {
3348 trans = btrfs_start_transaction(root, 1); 3696 trans = btrfs_start_transaction(root, 0);
3697 BUG_ON(IS_ERR(trans));
3349 btrfs_set_trans_block_group(trans, inode); 3698 btrfs_set_trans_block_group(trans, inode);
3350 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3699 trans->block_rsv = root->orphan_block_rsv;
3700
3701 ret = btrfs_block_rsv_check(trans, root,
3702 root->orphan_block_rsv, 0, 5);
3703 if (ret) {
3704 BUG_ON(ret != -EAGAIN);
3705 ret = btrfs_commit_transaction(trans, root);
3706 BUG_ON(ret);
3707 continue;
3708 }
3351 3709
3710 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3352 if (ret != -EAGAIN) 3711 if (ret != -EAGAIN)
3353 break; 3712 break;
3354 3713
@@ -3356,6 +3715,7 @@ void btrfs_delete_inode(struct inode *inode)
3356 btrfs_end_transaction(trans, root); 3715 btrfs_end_transaction(trans, root);
3357 trans = NULL; 3716 trans = NULL;
3358 btrfs_btree_balance_dirty(root, nr); 3717 btrfs_btree_balance_dirty(root, nr);
3718
3359 } 3719 }
3360 3720
3361 if (ret == 0) { 3721 if (ret == 0) {
@@ -3596,40 +3956,10 @@ again:
3596 return 0; 3956 return 0;
3597} 3957}
3598 3958
3599static noinline void init_btrfs_i(struct inode *inode)
3600{
3601 struct btrfs_inode *bi = BTRFS_I(inode);
3602
3603 bi->generation = 0;
3604 bi->sequence = 0;
3605 bi->last_trans = 0;
3606 bi->last_sub_trans = 0;
3607 bi->logged_trans = 0;
3608 bi->delalloc_bytes = 0;
3609 bi->reserved_bytes = 0;
3610 bi->disk_i_size = 0;
3611 bi->flags = 0;
3612 bi->index_cnt = (u64)-1;
3613 bi->last_unlink_trans = 0;
3614 bi->ordered_data_close = 0;
3615 bi->force_compress = 0;
3616 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3617 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3618 inode->i_mapping, GFP_NOFS);
3619 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
3620 inode->i_mapping, GFP_NOFS);
3621 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3622 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3623 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3624 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3625 mutex_init(&BTRFS_I(inode)->log_mutex);
3626}
3627
3628static int btrfs_init_locked_inode(struct inode *inode, void *p) 3959static int btrfs_init_locked_inode(struct inode *inode, void *p)
3629{ 3960{
3630 struct btrfs_iget_args *args = p; 3961 struct btrfs_iget_args *args = p;
3631 inode->i_ino = args->ino; 3962 inode->i_ino = args->ino;
3632 init_btrfs_i(inode);
3633 BTRFS_I(inode)->root = args->root; 3963 BTRFS_I(inode)->root = args->root;
3634 btrfs_set_inode_space_info(args->root, inode); 3964 btrfs_set_inode_space_info(args->root, inode);
3635 return 0; 3965 return 0;
@@ -3692,8 +4022,6 @@ static struct inode *new_simple_dir(struct super_block *s,
3692 if (!inode) 4022 if (!inode)
3693 return ERR_PTR(-ENOMEM); 4023 return ERR_PTR(-ENOMEM);
3694 4024
3695 init_btrfs_i(inode);
3696
3697 BTRFS_I(inode)->root = root; 4025 BTRFS_I(inode)->root = root;
3698 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4026 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
3699 BTRFS_I(inode)->dummy_inode = 1; 4027 BTRFS_I(inode)->dummy_inode = 1;
@@ -3950,7 +4278,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
3950 struct btrfs_trans_handle *trans; 4278 struct btrfs_trans_handle *trans;
3951 int ret = 0; 4279 int ret = 0;
3952 4280
3953 if (root->fs_info->btree_inode == inode) 4281 if (BTRFS_I(inode)->dummy_inode)
3954 return 0; 4282 return 0;
3955 4283
3956 if (wbc->sync_mode == WB_SYNC_ALL) { 4284 if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -3971,10 +4299,38 @@ void btrfs_dirty_inode(struct inode *inode)
3971{ 4299{
3972 struct btrfs_root *root = BTRFS_I(inode)->root; 4300 struct btrfs_root *root = BTRFS_I(inode)->root;
3973 struct btrfs_trans_handle *trans; 4301 struct btrfs_trans_handle *trans;
4302 int ret;
4303
4304 if (BTRFS_I(inode)->dummy_inode)
4305 return;
3974 4306
3975 trans = btrfs_join_transaction(root, 1); 4307 trans = btrfs_join_transaction(root, 1);
3976 btrfs_set_trans_block_group(trans, inode); 4308 btrfs_set_trans_block_group(trans, inode);
3977 btrfs_update_inode(trans, root, inode); 4309
4310 ret = btrfs_update_inode(trans, root, inode);
4311 if (ret && ret == -ENOSPC) {
4312 /* whoops, lets try again with the full transaction */
4313 btrfs_end_transaction(trans, root);
4314 trans = btrfs_start_transaction(root, 1);
4315 if (IS_ERR(trans)) {
4316 if (printk_ratelimit()) {
4317 printk(KERN_ERR "btrfs: fail to "
4318 "dirty inode %lu error %ld\n",
4319 inode->i_ino, PTR_ERR(trans));
4320 }
4321 return;
4322 }
4323 btrfs_set_trans_block_group(trans, inode);
4324
4325 ret = btrfs_update_inode(trans, root, inode);
4326 if (ret) {
4327 if (printk_ratelimit()) {
4328 printk(KERN_ERR "btrfs: fail to "
4329 "dirty inode %lu error %d\n",
4330 inode->i_ino, ret);
4331 }
4332 }
4333 }
3978 btrfs_end_transaction(trans, root); 4334 btrfs_end_transaction(trans, root);
3979} 4335}
3980 4336
@@ -4092,7 +4448,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4092 * btrfs_get_inode_index_count has an explanation for the magic 4448 * btrfs_get_inode_index_count has an explanation for the magic
4093 * number 4449 * number
4094 */ 4450 */
4095 init_btrfs_i(inode);
4096 BTRFS_I(inode)->index_cnt = 2; 4451 BTRFS_I(inode)->index_cnt = 2;
4097 BTRFS_I(inode)->root = root; 4452 BTRFS_I(inode)->root = root;
4098 BTRFS_I(inode)->generation = trans->transid; 4453 BTRFS_I(inode)->generation = trans->transid;
@@ -4247,26 +4602,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4247 if (!new_valid_dev(rdev)) 4602 if (!new_valid_dev(rdev))
4248 return -EINVAL; 4603 return -EINVAL;
4249 4604
4605 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4606 if (err)
4607 return err;
4608
4250 /* 4609 /*
4251 * 2 for inode item and ref 4610 * 2 for inode item and ref
4252 * 2 for dir items 4611 * 2 for dir items
4253 * 1 for xattr if selinux is on 4612 * 1 for xattr if selinux is on
4254 */ 4613 */
4255 err = btrfs_reserve_metadata_space(root, 5); 4614 trans = btrfs_start_transaction(root, 5);
4256 if (err) 4615 if (IS_ERR(trans))
4257 return err; 4616 return PTR_ERR(trans);
4258 4617
4259 trans = btrfs_start_transaction(root, 1);
4260 if (!trans)
4261 goto fail;
4262 btrfs_set_trans_block_group(trans, dir); 4618 btrfs_set_trans_block_group(trans, dir);
4263 4619
4264 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4265 if (err) {
4266 err = -ENOSPC;
4267 goto out_unlock;
4268 }
4269
4270 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4620 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4271 dentry->d_name.len, 4621 dentry->d_name.len,
4272 dentry->d_parent->d_inode->i_ino, objectid, 4622 dentry->d_parent->d_inode->i_ino, objectid,
@@ -4295,13 +4645,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4295out_unlock: 4645out_unlock:
4296 nr = trans->blocks_used; 4646 nr = trans->blocks_used;
4297 btrfs_end_transaction_throttle(trans, root); 4647 btrfs_end_transaction_throttle(trans, root);
4298fail: 4648 btrfs_btree_balance_dirty(root, nr);
4299 btrfs_unreserve_metadata_space(root, 5);
4300 if (drop_inode) { 4649 if (drop_inode) {
4301 inode_dec_link_count(inode); 4650 inode_dec_link_count(inode);
4302 iput(inode); 4651 iput(inode);
4303 } 4652 }
4304 btrfs_btree_balance_dirty(root, nr);
4305 return err; 4653 return err;
4306} 4654}
4307 4655
@@ -4311,32 +4659,26 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4311 struct btrfs_trans_handle *trans; 4659 struct btrfs_trans_handle *trans;
4312 struct btrfs_root *root = BTRFS_I(dir)->root; 4660 struct btrfs_root *root = BTRFS_I(dir)->root;
4313 struct inode *inode = NULL; 4661 struct inode *inode = NULL;
4314 int err;
4315 int drop_inode = 0; 4662 int drop_inode = 0;
4663 int err;
4316 unsigned long nr = 0; 4664 unsigned long nr = 0;
4317 u64 objectid; 4665 u64 objectid;
4318 u64 index = 0; 4666 u64 index = 0;
4319 4667
4668 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4669 if (err)
4670 return err;
4320 /* 4671 /*
4321 * 2 for inode item and ref 4672 * 2 for inode item and ref
4322 * 2 for dir items 4673 * 2 for dir items
4323 * 1 for xattr if selinux is on 4674 * 1 for xattr if selinux is on
4324 */ 4675 */
4325 err = btrfs_reserve_metadata_space(root, 5); 4676 trans = btrfs_start_transaction(root, 5);
4326 if (err) 4677 if (IS_ERR(trans))
4327 return err; 4678 return PTR_ERR(trans);
4328 4679
4329 trans = btrfs_start_transaction(root, 1);
4330 if (!trans)
4331 goto fail;
4332 btrfs_set_trans_block_group(trans, dir); 4680 btrfs_set_trans_block_group(trans, dir);
4333 4681
4334 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4335 if (err) {
4336 err = -ENOSPC;
4337 goto out_unlock;
4338 }
4339
4340 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4682 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4341 dentry->d_name.len, 4683 dentry->d_name.len,
4342 dentry->d_parent->d_inode->i_ino, 4684 dentry->d_parent->d_inode->i_ino,
@@ -4368,8 +4710,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4368out_unlock: 4710out_unlock:
4369 nr = trans->blocks_used; 4711 nr = trans->blocks_used;
4370 btrfs_end_transaction_throttle(trans, root); 4712 btrfs_end_transaction_throttle(trans, root);
4371fail:
4372 btrfs_unreserve_metadata_space(root, 5);
4373 if (drop_inode) { 4713 if (drop_inode) {
4374 inode_dec_link_count(inode); 4714 inode_dec_link_count(inode);
4375 iput(inode); 4715 iput(inode);
@@ -4396,21 +4736,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4396 if (root->objectid != BTRFS_I(inode)->root->objectid) 4736 if (root->objectid != BTRFS_I(inode)->root->objectid)
4397 return -EPERM; 4737 return -EPERM;
4398 4738
4399 /*
4400 * 1 item for inode ref
4401 * 2 items for dir items
4402 */
4403 err = btrfs_reserve_metadata_space(root, 3);
4404 if (err)
4405 return err;
4406
4407 btrfs_inc_nlink(inode); 4739 btrfs_inc_nlink(inode);
4408 4740
4409 err = btrfs_set_inode_index(dir, &index); 4741 err = btrfs_set_inode_index(dir, &index);
4410 if (err) 4742 if (err)
4411 goto fail; 4743 goto fail;
4412 4744
4413 trans = btrfs_start_transaction(root, 1); 4745 /*
4746 * 1 item for inode ref
4747 * 2 items for dir items
4748 */
4749 trans = btrfs_start_transaction(root, 3);
4750 if (IS_ERR(trans)) {
4751 err = PTR_ERR(trans);
4752 goto fail;
4753 }
4414 4754
4415 btrfs_set_trans_block_group(trans, dir); 4755 btrfs_set_trans_block_group(trans, dir);
4416 atomic_inc(&inode->i_count); 4756 atomic_inc(&inode->i_count);
@@ -4429,7 +4769,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4429 nr = trans->blocks_used; 4769 nr = trans->blocks_used;
4430 btrfs_end_transaction_throttle(trans, root); 4770 btrfs_end_transaction_throttle(trans, root);
4431fail: 4771fail:
4432 btrfs_unreserve_metadata_space(root, 3);
4433 if (drop_inode) { 4772 if (drop_inode) {
4434 inode_dec_link_count(inode); 4773 inode_dec_link_count(inode);
4435 iput(inode); 4774 iput(inode);
@@ -4449,28 +4788,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4449 u64 index = 0; 4788 u64 index = 0;
4450 unsigned long nr = 1; 4789 unsigned long nr = 1;
4451 4790
4791 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4792 if (err)
4793 return err;
4794
4452 /* 4795 /*
4453 * 2 items for inode and ref 4796 * 2 items for inode and ref
4454 * 2 items for dir items 4797 * 2 items for dir items
4455 * 1 for xattr if selinux is on 4798 * 1 for xattr if selinux is on
4456 */ 4799 */
4457 err = btrfs_reserve_metadata_space(root, 5); 4800 trans = btrfs_start_transaction(root, 5);
4458 if (err) 4801 if (IS_ERR(trans))
4459 return err; 4802 return PTR_ERR(trans);
4460
4461 trans = btrfs_start_transaction(root, 1);
4462 if (!trans) {
4463 err = -ENOMEM;
4464 goto out_unlock;
4465 }
4466 btrfs_set_trans_block_group(trans, dir); 4803 btrfs_set_trans_block_group(trans, dir);
4467 4804
4468 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4469 if (err) {
4470 err = -ENOSPC;
4471 goto out_fail;
4472 }
4473
4474 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4805 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4475 dentry->d_name.len, 4806 dentry->d_name.len,
4476 dentry->d_parent->d_inode->i_ino, objectid, 4807 dentry->d_parent->d_inode->i_ino, objectid,
@@ -4510,9 +4841,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4510out_fail: 4841out_fail:
4511 nr = trans->blocks_used; 4842 nr = trans->blocks_used;
4512 btrfs_end_transaction_throttle(trans, root); 4843 btrfs_end_transaction_throttle(trans, root);
4513
4514out_unlock:
4515 btrfs_unreserve_metadata_space(root, 5);
4516 if (drop_on_err) 4844 if (drop_on_err)
4517 iput(inode); 4845 iput(inode);
4518 btrfs_btree_balance_dirty(root, nr); 4846 btrfs_btree_balance_dirty(root, nr);
@@ -4770,6 +5098,7 @@ again:
4770 } 5098 }
4771 flush_dcache_page(page); 5099 flush_dcache_page(page);
4772 } else if (create && PageUptodate(page)) { 5100 } else if (create && PageUptodate(page)) {
5101 WARN_ON(1);
4773 if (!trans) { 5102 if (!trans) {
4774 kunmap(page); 5103 kunmap(page);
4775 free_extent_map(em); 5104 free_extent_map(em);
@@ -4866,11 +5195,651 @@ out:
4866 return em; 5195 return em;
4867} 5196}
4868 5197
5198static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5199 u64 start, u64 len)
5200{
5201 struct btrfs_root *root = BTRFS_I(inode)->root;
5202 struct btrfs_trans_handle *trans;
5203 struct extent_map *em;
5204 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5205 struct btrfs_key ins;
5206 u64 alloc_hint;
5207 int ret;
5208
5209 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5210
5211 trans = btrfs_join_transaction(root, 0);
5212 if (!trans)
5213 return ERR_PTR(-ENOMEM);
5214
5215 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5216
5217 alloc_hint = get_extent_allocation_hint(inode, start, len);
5218 ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
5219 alloc_hint, (u64)-1, &ins, 1);
5220 if (ret) {
5221 em = ERR_PTR(ret);
5222 goto out;
5223 }
5224
5225 em = alloc_extent_map(GFP_NOFS);
5226 if (!em) {
5227 em = ERR_PTR(-ENOMEM);
5228 goto out;
5229 }
5230
5231 em->start = start;
5232 em->orig_start = em->start;
5233 em->len = ins.offset;
5234
5235 em->block_start = ins.objectid;
5236 em->block_len = ins.offset;
5237 em->bdev = root->fs_info->fs_devices->latest_bdev;
5238 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5239
5240 while (1) {
5241 write_lock(&em_tree->lock);
5242 ret = add_extent_mapping(em_tree, em);
5243 write_unlock(&em_tree->lock);
5244 if (ret != -EEXIST)
5245 break;
5246 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
5247 }
5248
5249 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
5250 ins.offset, ins.offset, 0);
5251 if (ret) {
5252 btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
5253 em = ERR_PTR(ret);
5254 }
5255out:
5256 btrfs_end_transaction(trans, root);
5257 return em;
5258}
5259
5260/*
5261 * returns 1 when the nocow is safe, < 1 on error, 0 if the
5262 * block must be cow'd
5263 */
5264static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
5265 struct inode *inode, u64 offset, u64 len)
5266{
5267 struct btrfs_path *path;
5268 int ret;
5269 struct extent_buffer *leaf;
5270 struct btrfs_root *root = BTRFS_I(inode)->root;
5271 struct btrfs_file_extent_item *fi;
5272 struct btrfs_key key;
5273 u64 disk_bytenr;
5274 u64 backref_offset;
5275 u64 extent_end;
5276 u64 num_bytes;
5277 int slot;
5278 int found_type;
5279
5280 path = btrfs_alloc_path();
5281 if (!path)
5282 return -ENOMEM;
5283
5284 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
5285 offset, 0);
5286 if (ret < 0)
5287 goto out;
5288
5289 slot = path->slots[0];
5290 if (ret == 1) {
5291 if (slot == 0) {
5292 /* can't find the item, must cow */
5293 ret = 0;
5294 goto out;
5295 }
5296 slot--;
5297 }
5298 ret = 0;
5299 leaf = path->nodes[0];
5300 btrfs_item_key_to_cpu(leaf, &key, slot);
5301 if (key.objectid != inode->i_ino ||
5302 key.type != BTRFS_EXTENT_DATA_KEY) {
5303 /* not our file or wrong item type, must cow */
5304 goto out;
5305 }
5306
5307 if (key.offset > offset) {
5308 /* Wrong offset, must cow */
5309 goto out;
5310 }
5311
5312 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5313 found_type = btrfs_file_extent_type(leaf, fi);
5314 if (found_type != BTRFS_FILE_EXTENT_REG &&
5315 found_type != BTRFS_FILE_EXTENT_PREALLOC) {
5316 /* not a regular extent, must cow */
5317 goto out;
5318 }
5319 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
5320 backref_offset = btrfs_file_extent_offset(leaf, fi);
5321
5322 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
5323 if (extent_end < offset + len) {
5324 /* extent doesn't include our full range, must cow */
5325 goto out;
5326 }
5327
5328 if (btrfs_extent_readonly(root, disk_bytenr))
5329 goto out;
5330
5331 /*
5332 * look for other files referencing this extent, if we
5333 * find any we must cow
5334 */
5335 if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
5336 key.offset - backref_offset, disk_bytenr))
5337 goto out;
5338
5339 /*
5340 * adjust disk_bytenr and num_bytes to cover just the bytes
5341 * in this extent we are about to write. If there
5342 * are any csums in that range we have to cow in order
5343 * to keep the csums correct
5344 */
5345 disk_bytenr += backref_offset;
5346 disk_bytenr += offset - key.offset;
5347 num_bytes = min(offset + len, extent_end) - offset;
5348 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
5349 goto out;
5350 /*
5351 * all of the above have passed, it is safe to overwrite this extent
5352 * without cow
5353 */
5354 ret = 1;
5355out:
5356 btrfs_free_path(path);
5357 return ret;
5358}
5359
5360static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5361 struct buffer_head *bh_result, int create)
5362{
5363 struct extent_map *em;
5364 struct btrfs_root *root = BTRFS_I(inode)->root;
5365 u64 start = iblock << inode->i_blkbits;
5366 u64 len = bh_result->b_size;
5367 struct btrfs_trans_handle *trans;
5368
5369 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
5370 if (IS_ERR(em))
5371 return PTR_ERR(em);
5372
5373 /*
5374 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
5375 * io. INLINE is special, and we could probably kludge it in here, but
5376 * it's still buffered so for safety lets just fall back to the generic
5377 * buffered path.
5378 *
5379 * For COMPRESSED we _have_ to read the entire extent in so we can
5380 * decompress it, so there will be buffering required no matter what we
5381 * do, so go ahead and fallback to buffered.
5382 *
5383 * We return -ENOTBLK because thats what makes DIO go ahead and go back
5384 * to buffered IO. Don't blame me, this is the price we pay for using
5385 * the generic code.
5386 */
5387 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
5388 em->block_start == EXTENT_MAP_INLINE) {
5389 free_extent_map(em);
5390 return -ENOTBLK;
5391 }
5392
5393 /* Just a good old fashioned hole, return */
5394 if (!create && (em->block_start == EXTENT_MAP_HOLE ||
5395 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5396 free_extent_map(em);
5397 /* DIO will do one hole at a time, so just unlock a sector */
5398 unlock_extent(&BTRFS_I(inode)->io_tree, start,
5399 start + root->sectorsize - 1, GFP_NOFS);
5400 return 0;
5401 }
5402
5403 /*
5404 * We don't allocate a new extent in the following cases
5405 *
5406 * 1) The inode is marked as NODATACOW. In this case we'll just use the
5407 * existing extent.
5408 * 2) The extent is marked as PREALLOC. We're good to go here and can
5409 * just use the extent.
5410 *
5411 */
5412 if (!create) {
5413 len = em->len - (start - em->start);
5414 goto map;
5415 }
5416
5417 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
5418 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
5419 em->block_start != EXTENT_MAP_HOLE)) {
5420 int type;
5421 int ret;
5422 u64 block_start;
5423
5424 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5425 type = BTRFS_ORDERED_PREALLOC;
5426 else
5427 type = BTRFS_ORDERED_NOCOW;
5428 len = min(len, em->len - (start - em->start));
5429 block_start = em->block_start + (start - em->start);
5430
5431 /*
5432 * we're not going to log anything, but we do need
5433 * to make sure the current transaction stays open
5434 * while we look for nocow cross refs
5435 */
5436 trans = btrfs_join_transaction(root, 0);
5437 if (!trans)
5438 goto must_cow;
5439
5440 if (can_nocow_odirect(trans, inode, start, len) == 1) {
5441 ret = btrfs_add_ordered_extent_dio(inode, start,
5442 block_start, len, len, type);
5443 btrfs_end_transaction(trans, root);
5444 if (ret) {
5445 free_extent_map(em);
5446 return ret;
5447 }
5448 goto unlock;
5449 }
5450 btrfs_end_transaction(trans, root);
5451 }
5452must_cow:
5453 /*
5454 * this will cow the extent, reset the len in case we changed
5455 * it above
5456 */
5457 len = bh_result->b_size;
5458 free_extent_map(em);
5459 em = btrfs_new_extent_direct(inode, start, len);
5460 if (IS_ERR(em))
5461 return PTR_ERR(em);
5462 len = min(len, em->len - (start - em->start));
5463unlock:
5464 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
5465 EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
5466 0, NULL, GFP_NOFS);
5467map:
5468 bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
5469 inode->i_blkbits;
5470 bh_result->b_size = len;
5471 bh_result->b_bdev = em->bdev;
5472 set_buffer_mapped(bh_result);
5473 if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5474 set_buffer_new(bh_result);
5475
5476 free_extent_map(em);
5477
5478 return 0;
5479}
5480
5481struct btrfs_dio_private {
5482 struct inode *inode;
5483 u64 logical_offset;
5484 u64 disk_bytenr;
5485 u64 bytes;
5486 u32 *csums;
5487 void *private;
5488};
5489
5490static void btrfs_endio_direct_read(struct bio *bio, int err)
5491{
5492 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
5493 struct bio_vec *bvec = bio->bi_io_vec;
5494 struct btrfs_dio_private *dip = bio->bi_private;
5495 struct inode *inode = dip->inode;
5496 struct btrfs_root *root = BTRFS_I(inode)->root;
5497 u64 start;
5498 u32 *private = dip->csums;
5499
5500 start = dip->logical_offset;
5501 do {
5502 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
5503 struct page *page = bvec->bv_page;
5504 char *kaddr;
5505 u32 csum = ~(u32)0;
5506 unsigned long flags;
5507
5508 local_irq_save(flags);
5509 kaddr = kmap_atomic(page, KM_IRQ0);
5510 csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
5511 csum, bvec->bv_len);
5512 btrfs_csum_final(csum, (char *)&csum);
5513 kunmap_atomic(kaddr, KM_IRQ0);
5514 local_irq_restore(flags);
5515
5516 flush_dcache_page(bvec->bv_page);
5517 if (csum != *private) {
5518 printk(KERN_ERR "btrfs csum failed ino %lu off"
5519 " %llu csum %u private %u\n",
5520 inode->i_ino, (unsigned long long)start,
5521 csum, *private);
5522 err = -EIO;
5523 }
5524 }
5525
5526 start += bvec->bv_len;
5527 private++;
5528 bvec++;
5529 } while (bvec <= bvec_end);
5530
5531 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
5532 dip->logical_offset + dip->bytes - 1, GFP_NOFS);
5533 bio->bi_private = dip->private;
5534
5535 kfree(dip->csums);
5536 kfree(dip);
5537 dio_end_io(bio, err);
5538}
5539
5540static void btrfs_endio_direct_write(struct bio *bio, int err)
5541{
5542 struct btrfs_dio_private *dip = bio->bi_private;
5543 struct inode *inode = dip->inode;
5544 struct btrfs_root *root = BTRFS_I(inode)->root;
5545 struct btrfs_trans_handle *trans;
5546 struct btrfs_ordered_extent *ordered = NULL;
5547 struct extent_state *cached_state = NULL;
5548 int ret;
5549
5550 if (err)
5551 goto out_done;
5552
5553 ret = btrfs_dec_test_ordered_pending(inode, &ordered,
5554 dip->logical_offset, dip->bytes);
5555 if (!ret)
5556 goto out_done;
5557
5558 BUG_ON(!ordered);
5559
5560 trans = btrfs_join_transaction(root, 1);
5561 if (!trans) {
5562 err = -ENOMEM;
5563 goto out;
5564 }
5565 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5566
5567 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5568 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5569 if (!ret)
5570 ret = btrfs_update_inode(trans, root, inode);
5571 err = ret;
5572 goto out;
5573 }
5574
5575 lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5576 ordered->file_offset + ordered->len - 1, 0,
5577 &cached_state, GFP_NOFS);
5578
5579 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
5580 ret = btrfs_mark_extent_written(trans, inode,
5581 ordered->file_offset,
5582 ordered->file_offset +
5583 ordered->len);
5584 if (ret) {
5585 err = ret;
5586 goto out_unlock;
5587 }
5588 } else {
5589 ret = insert_reserved_file_extent(trans, inode,
5590 ordered->file_offset,
5591 ordered->start,
5592 ordered->disk_len,
5593 ordered->len,
5594 ordered->len,
5595 0, 0, 0,
5596 BTRFS_FILE_EXTENT_REG);
5597 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
5598 ordered->file_offset, ordered->len);
5599 if (ret) {
5600 err = ret;
5601 WARN_ON(1);
5602 goto out_unlock;
5603 }
5604 }
5605
5606 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5607 btrfs_ordered_update_i_size(inode, 0, ordered);
5608 btrfs_update_inode(trans, root, inode);
5609out_unlock:
5610 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5611 ordered->file_offset + ordered->len - 1,
5612 &cached_state, GFP_NOFS);
5613out:
5614 btrfs_delalloc_release_metadata(inode, ordered->len);
5615 btrfs_end_transaction(trans, root);
5616 btrfs_put_ordered_extent(ordered);
5617 btrfs_put_ordered_extent(ordered);
5618out_done:
5619 bio->bi_private = dip->private;
5620
5621 kfree(dip->csums);
5622 kfree(dip);
5623 dio_end_io(bio, err);
5624}
5625
5626static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
5627 struct bio *bio, int mirror_num,
5628 unsigned long bio_flags, u64 offset)
5629{
5630 int ret;
5631 struct btrfs_root *root = BTRFS_I(inode)->root;
5632 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
5633 BUG_ON(ret);
5634 return 0;
5635}
5636
5637static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5638 loff_t file_offset)
5639{
5640 struct btrfs_root *root = BTRFS_I(inode)->root;
5641 struct btrfs_dio_private *dip;
5642 struct bio_vec *bvec = bio->bi_io_vec;
5643 u64 start;
5644 int skip_sum;
5645 int write = rw & (1 << BIO_RW);
5646 int ret = 0;
5647
5648 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
5649
5650 dip = kmalloc(sizeof(*dip), GFP_NOFS);
5651 if (!dip) {
5652 ret = -ENOMEM;
5653 goto free_ordered;
5654 }
5655 dip->csums = NULL;
5656
5657 if (!skip_sum) {
5658 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
5659 if (!dip->csums) {
5660 ret = -ENOMEM;
5661 goto free_ordered;
5662 }
5663 }
5664
5665 dip->private = bio->bi_private;
5666 dip->inode = inode;
5667 dip->logical_offset = file_offset;
5668
5669 start = dip->logical_offset;
5670 dip->bytes = 0;
5671 do {
5672 dip->bytes += bvec->bv_len;
5673 bvec++;
5674 } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
5675
5676 dip->disk_bytenr = (u64)bio->bi_sector << 9;
5677 bio->bi_private = dip;
5678
5679 if (write)
5680 bio->bi_end_io = btrfs_endio_direct_write;
5681 else
5682 bio->bi_end_io = btrfs_endio_direct_read;
5683
5684 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
5685 if (ret)
5686 goto out_err;
5687
5688 if (write && !skip_sum) {
5689 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
5690 inode, rw, bio, 0, 0,
5691 dip->logical_offset,
5692 __btrfs_submit_bio_start_direct_io,
5693 __btrfs_submit_bio_done);
5694 if (ret)
5695 goto out_err;
5696 return;
5697 } else if (!skip_sum)
5698 btrfs_lookup_bio_sums_dio(root, inode, bio,
5699 dip->logical_offset, dip->csums);
5700
5701 ret = btrfs_map_bio(root, rw, bio, 0, 1);
5702 if (ret)
5703 goto out_err;
5704 return;
5705out_err:
5706 kfree(dip->csums);
5707 kfree(dip);
5708free_ordered:
5709 /*
5710 * If this is a write, we need to clean up the reserved space and kill
5711 * the ordered extent.
5712 */
5713 if (write) {
5714 struct btrfs_ordered_extent *ordered;
5715 ordered = btrfs_lookup_ordered_extent(inode,
5716 dip->logical_offset);
5717 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
5718 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
5719 btrfs_free_reserved_extent(root, ordered->start,
5720 ordered->disk_len);
5721 btrfs_put_ordered_extent(ordered);
5722 btrfs_put_ordered_extent(ordered);
5723 }
5724 bio_endio(bio, ret);
5725}
5726
5727static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
5728 const struct iovec *iov, loff_t offset,
5729 unsigned long nr_segs)
5730{
5731 int seg;
5732 size_t size;
5733 unsigned long addr;
5734 unsigned blocksize_mask = root->sectorsize - 1;
5735 ssize_t retval = -EINVAL;
5736 loff_t end = offset;
5737
5738 if (offset & blocksize_mask)
5739 goto out;
5740
5741 /* Check the memory alignment. Blocks cannot straddle pages */
5742 for (seg = 0; seg < nr_segs; seg++) {
5743 addr = (unsigned long)iov[seg].iov_base;
5744 size = iov[seg].iov_len;
5745 end += size;
5746 if ((addr & blocksize_mask) || (size & blocksize_mask))
5747 goto out;
5748 }
5749 retval = 0;
5750out:
5751 return retval;
5752}
4869static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, 5753static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4870 const struct iovec *iov, loff_t offset, 5754 const struct iovec *iov, loff_t offset,
4871 unsigned long nr_segs) 5755 unsigned long nr_segs)
4872{ 5756{
4873 return -EINVAL; 5757 struct file *file = iocb->ki_filp;
5758 struct inode *inode = file->f_mapping->host;
5759 struct btrfs_ordered_extent *ordered;
5760 struct extent_state *cached_state = NULL;
5761 u64 lockstart, lockend;
5762 ssize_t ret;
5763 int writing = rw & WRITE;
5764 int write_bits = 0;
5765 size_t count = iov_length(iov, nr_segs);
5766
5767 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
5768 offset, nr_segs)) {
5769 return 0;
5770 }
5771
5772 lockstart = offset;
5773 lockend = offset + count - 1;
5774
5775 if (writing) {
5776 ret = btrfs_delalloc_reserve_space(inode, count);
5777 if (ret)
5778 goto out;
5779 }
5780
5781 while (1) {
5782 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5783 0, &cached_state, GFP_NOFS);
5784 /*
5785 * We're concerned with the entire range that we're going to be
5786 * doing DIO to, so we need to make sure theres no ordered
5787 * extents in this range.
5788 */
5789 ordered = btrfs_lookup_ordered_range(inode, lockstart,
5790 lockend - lockstart + 1);
5791 if (!ordered)
5792 break;
5793 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5794 &cached_state, GFP_NOFS);
5795 btrfs_start_ordered_extent(inode, ordered, 1);
5796 btrfs_put_ordered_extent(ordered);
5797 cond_resched();
5798 }
5799
5800 /*
5801 * we don't use btrfs_set_extent_delalloc because we don't want
5802 * the dirty or uptodate bits
5803 */
5804 if (writing) {
5805 write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
5806 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5807 EXTENT_DELALLOC, 0, NULL, &cached_state,
5808 GFP_NOFS);
5809 if (ret) {
5810 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
5811 lockend, EXTENT_LOCKED | write_bits,
5812 1, 0, &cached_state, GFP_NOFS);
5813 goto out;
5814 }
5815 }
5816
5817 free_extent_state(cached_state);
5818 cached_state = NULL;
5819
5820 ret = __blockdev_direct_IO(rw, iocb, inode,
5821 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
5822 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
5823 btrfs_submit_direct, 0);
5824
5825 if (ret < 0 && ret != -EIOCBQUEUED) {
5826 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
5827 offset + iov_length(iov, nr_segs) - 1,
5828 EXTENT_LOCKED | write_bits, 1, 0,
5829 &cached_state, GFP_NOFS);
5830 } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
5831 /*
5832 * We're falling back to buffered, unlock the section we didn't
5833 * do IO on.
5834 */
5835 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
5836 offset + iov_length(iov, nr_segs) - 1,
5837 EXTENT_LOCKED | write_bits, 1, 0,
5838 &cached_state, GFP_NOFS);
5839 }
5840out:
5841 free_extent_state(cached_state);
5842 return ret;
4874} 5843}
4875 5844
4876static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 5845static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -5034,7 +6003,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5034 u64 page_start; 6003 u64 page_start;
5035 u64 page_end; 6004 u64 page_end;
5036 6005
5037 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 6006 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
5038 if (ret) { 6007 if (ret) {
5039 if (ret == -ENOMEM) 6008 if (ret == -ENOMEM)
5040 ret = VM_FAULT_OOM; 6009 ret = VM_FAULT_OOM;
@@ -5043,13 +6012,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5043 goto out; 6012 goto out;
5044 } 6013 }
5045 6014
5046 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
5047 if (ret) {
5048 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5049 ret = VM_FAULT_SIGBUS;
5050 goto out;
5051 }
5052
5053 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 6015 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
5054again: 6016again:
5055 lock_page(page); 6017 lock_page(page);
@@ -5059,7 +6021,6 @@ again:
5059 6021
5060 if ((page->mapping != inode->i_mapping) || 6022 if ((page->mapping != inode->i_mapping) ||
5061 (page_start >= size)) { 6023 (page_start >= size)) {
5062 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5063 /* page got truncated out from underneath us */ 6024 /* page got truncated out from underneath us */
5064 goto out_unlock; 6025 goto out_unlock;
5065 } 6026 }
@@ -5100,7 +6061,6 @@ again:
5100 unlock_extent_cached(io_tree, page_start, page_end, 6061 unlock_extent_cached(io_tree, page_start, page_end,
5101 &cached_state, GFP_NOFS); 6062 &cached_state, GFP_NOFS);
5102 ret = VM_FAULT_SIGBUS; 6063 ret = VM_FAULT_SIGBUS;
5103 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5104 goto out_unlock; 6064 goto out_unlock;
5105 } 6065 }
5106 ret = 0; 6066 ret = 0;
@@ -5127,10 +6087,10 @@ again:
5127 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 6087 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
5128 6088
5129out_unlock: 6089out_unlock:
5130 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
5131 if (!ret) 6090 if (!ret)
5132 return VM_FAULT_LOCKED; 6091 return VM_FAULT_LOCKED;
5133 unlock_page(page); 6092 unlock_page(page);
6093 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
5134out: 6094out:
5135 return ret; 6095 return ret;
5136} 6096}
@@ -5155,8 +6115,10 @@ static void btrfs_truncate(struct inode *inode)
5155 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 6115 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
5156 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 6116 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
5157 6117
5158 trans = btrfs_start_transaction(root, 1); 6118 trans = btrfs_start_transaction(root, 0);
6119 BUG_ON(IS_ERR(trans));
5159 btrfs_set_trans_block_group(trans, inode); 6120 btrfs_set_trans_block_group(trans, inode);
6121 trans->block_rsv = root->orphan_block_rsv;
5160 6122
5161 /* 6123 /*
5162 * setattr is responsible for setting the ordered_data_close flag, 6124 * setattr is responsible for setting the ordered_data_close flag,
@@ -5179,6 +6141,23 @@ static void btrfs_truncate(struct inode *inode)
5179 btrfs_add_ordered_operation(trans, root, inode); 6141 btrfs_add_ordered_operation(trans, root, inode);
5180 6142
5181 while (1) { 6143 while (1) {
6144 if (!trans) {
6145 trans = btrfs_start_transaction(root, 0);
6146 BUG_ON(IS_ERR(trans));
6147 btrfs_set_trans_block_group(trans, inode);
6148 trans->block_rsv = root->orphan_block_rsv;
6149 }
6150
6151 ret = btrfs_block_rsv_check(trans, root,
6152 root->orphan_block_rsv, 0, 5);
6153 if (ret) {
6154 BUG_ON(ret != -EAGAIN);
6155 ret = btrfs_commit_transaction(trans, root);
6156 BUG_ON(ret);
6157 trans = NULL;
6158 continue;
6159 }
6160
5182 ret = btrfs_truncate_inode_items(trans, root, inode, 6161 ret = btrfs_truncate_inode_items(trans, root, inode,
5183 inode->i_size, 6162 inode->i_size,
5184 BTRFS_EXTENT_DATA_KEY); 6163 BTRFS_EXTENT_DATA_KEY);
@@ -5190,10 +6169,8 @@ static void btrfs_truncate(struct inode *inode)
5190 6169
5191 nr = trans->blocks_used; 6170 nr = trans->blocks_used;
5192 btrfs_end_transaction(trans, root); 6171 btrfs_end_transaction(trans, root);
6172 trans = NULL;
5193 btrfs_btree_balance_dirty(root, nr); 6173 btrfs_btree_balance_dirty(root, nr);
5194
5195 trans = btrfs_start_transaction(root, 1);
5196 btrfs_set_trans_block_group(trans, inode);
5197 } 6174 }
5198 6175
5199 if (ret == 0 && inode->i_nlink > 0) { 6176 if (ret == 0 && inode->i_nlink > 0) {
@@ -5254,21 +6231,47 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
5254struct inode *btrfs_alloc_inode(struct super_block *sb) 6231struct inode *btrfs_alloc_inode(struct super_block *sb)
5255{ 6232{
5256 struct btrfs_inode *ei; 6233 struct btrfs_inode *ei;
6234 struct inode *inode;
5257 6235
5258 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); 6236 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
5259 if (!ei) 6237 if (!ei)
5260 return NULL; 6238 return NULL;
6239
6240 ei->root = NULL;
6241 ei->space_info = NULL;
6242 ei->generation = 0;
6243 ei->sequence = 0;
5261 ei->last_trans = 0; 6244 ei->last_trans = 0;
5262 ei->last_sub_trans = 0; 6245 ei->last_sub_trans = 0;
5263 ei->logged_trans = 0; 6246 ei->logged_trans = 0;
5264 ei->outstanding_extents = 0; 6247 ei->delalloc_bytes = 0;
5265 ei->reserved_extents = 0; 6248 ei->reserved_bytes = 0;
5266 ei->root = NULL; 6249 ei->disk_i_size = 0;
6250 ei->flags = 0;
6251 ei->index_cnt = (u64)-1;
6252 ei->last_unlink_trans = 0;
6253
5267 spin_lock_init(&ei->accounting_lock); 6254 spin_lock_init(&ei->accounting_lock);
6255 atomic_set(&ei->outstanding_extents, 0);
6256 ei->reserved_extents = 0;
6257
6258 ei->ordered_data_close = 0;
6259 ei->orphan_meta_reserved = 0;
6260 ei->dummy_inode = 0;
6261 ei->force_compress = 0;
6262
6263 inode = &ei->vfs_inode;
6264 extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
6265 extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
6266 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
6267 mutex_init(&ei->log_mutex);
5268 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6268 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
5269 INIT_LIST_HEAD(&ei->i_orphan); 6269 INIT_LIST_HEAD(&ei->i_orphan);
6270 INIT_LIST_HEAD(&ei->delalloc_inodes);
5270 INIT_LIST_HEAD(&ei->ordered_operations); 6271 INIT_LIST_HEAD(&ei->ordered_operations);
5271 return &ei->vfs_inode; 6272 RB_CLEAR_NODE(&ei->rb_node);
6273
6274 return inode;
5272} 6275}
5273 6276
5274void btrfs_destroy_inode(struct inode *inode) 6277void btrfs_destroy_inode(struct inode *inode)
@@ -5278,6 +6281,8 @@ void btrfs_destroy_inode(struct inode *inode)
5278 6281
5279 WARN_ON(!list_empty(&inode->i_dentry)); 6282 WARN_ON(!list_empty(&inode->i_dentry));
5280 WARN_ON(inode->i_data.nrpages); 6283 WARN_ON(inode->i_data.nrpages);
6284 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
6285 WARN_ON(BTRFS_I(inode)->reserved_extents);
5281 6286
5282 /* 6287 /*
5283 * This can happen where we create an inode, but somebody else also 6288 * This can happen where we create an inode, but somebody else also
@@ -5298,13 +6303,13 @@ void btrfs_destroy_inode(struct inode *inode)
5298 spin_unlock(&root->fs_info->ordered_extent_lock); 6303 spin_unlock(&root->fs_info->ordered_extent_lock);
5299 } 6304 }
5300 6305
5301 spin_lock(&root->list_lock); 6306 spin_lock(&root->orphan_lock);
5302 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 6307 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
5303 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", 6308 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
5304 inode->i_ino); 6309 inode->i_ino);
5305 list_del_init(&BTRFS_I(inode)->i_orphan); 6310 list_del_init(&BTRFS_I(inode)->i_orphan);
5306 } 6311 }
5307 spin_unlock(&root->list_lock); 6312 spin_unlock(&root->orphan_lock);
5308 6313
5309 while (1) { 6314 while (1) {
5310 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 6315 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -5425,19 +6430,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5425 if (S_ISDIR(old_inode->i_mode) && new_inode && 6430 if (S_ISDIR(old_inode->i_mode) && new_inode &&
5426 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 6431 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
5427 return -ENOTEMPTY; 6432 return -ENOTEMPTY;
5428
5429 /*
5430 * We want to reserve the absolute worst case amount of items. So if
5431 * both inodes are subvols and we need to unlink them then that would
5432 * require 4 item modifications, but if they are both normal inodes it
5433 * would require 5 item modifications, so we'll assume their normal
5434 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
5435 * should cover the worst case number of items we'll modify.
5436 */
5437 ret = btrfs_reserve_metadata_space(root, 11);
5438 if (ret)
5439 return ret;
5440
5441 /* 6433 /*
5442 * we're using rename to replace one file with another. 6434 * we're using rename to replace one file with another.
5443 * and the replacement file is large. Start IO on it now so 6435 * and the replacement file is large. Start IO on it now so
@@ -5450,8 +6442,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5450 /* close the racy window with snapshot create/destroy ioctl */ 6442 /* close the racy window with snapshot create/destroy ioctl */
5451 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 6443 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5452 down_read(&root->fs_info->subvol_sem); 6444 down_read(&root->fs_info->subvol_sem);
6445 /*
6446 * We want to reserve the absolute worst case amount of items. So if
6447 * both inodes are subvols and we need to unlink them then that would
6448 * require 4 item modifications, but if they are both normal inodes it
6449 * would require 5 item modifications, so we'll assume their normal
6450 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
6451 * should cover the worst case number of items we'll modify.
6452 */
6453 trans = btrfs_start_transaction(root, 20);
6454 if (IS_ERR(trans))
6455 return PTR_ERR(trans);
5453 6456
5454 trans = btrfs_start_transaction(root, 1);
5455 btrfs_set_trans_block_group(trans, new_dir); 6457 btrfs_set_trans_block_group(trans, new_dir);
5456 6458
5457 if (dest != root) 6459 if (dest != root)
@@ -5550,7 +6552,6 @@ out_fail:
5550 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 6552 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5551 up_read(&root->fs_info->subvol_sem); 6553 up_read(&root->fs_info->subvol_sem);
5552 6554
5553 btrfs_unreserve_metadata_space(root, 11);
5554 return ret; 6555 return ret;
5555} 6556}
5556 6557
@@ -5602,6 +6603,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
5602 return 0; 6603 return 0;
5603} 6604}
5604 6605
6606int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
6607{
6608 struct btrfs_inode *binode;
6609 struct inode *inode = NULL;
6610
6611 spin_lock(&root->fs_info->delalloc_lock);
6612 while (!list_empty(&root->fs_info->delalloc_inodes)) {
6613 binode = list_entry(root->fs_info->delalloc_inodes.next,
6614 struct btrfs_inode, delalloc_inodes);
6615 inode = igrab(&binode->vfs_inode);
6616 if (inode) {
6617 list_move_tail(&binode->delalloc_inodes,
6618 &root->fs_info->delalloc_inodes);
6619 break;
6620 }
6621
6622 list_del_init(&binode->delalloc_inodes);
6623 cond_resched_lock(&root->fs_info->delalloc_lock);
6624 }
6625 spin_unlock(&root->fs_info->delalloc_lock);
6626
6627 if (inode) {
6628 write_inode_now(inode, 0);
6629 if (delay_iput)
6630 btrfs_add_delayed_iput(inode);
6631 else
6632 iput(inode);
6633 return 1;
6634 }
6635 return 0;
6636}
6637
5605static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 6638static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5606 const char *symname) 6639 const char *symname)
5607{ 6640{
@@ -5625,26 +6658,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5625 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 6658 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
5626 return -ENAMETOOLONG; 6659 return -ENAMETOOLONG;
5627 6660
6661 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
6662 if (err)
6663 return err;
5628 /* 6664 /*
5629 * 2 items for inode item and ref 6665 * 2 items for inode item and ref
5630 * 2 items for dir items 6666 * 2 items for dir items
5631 * 1 item for xattr if selinux is on 6667 * 1 item for xattr if selinux is on
5632 */ 6668 */
5633 err = btrfs_reserve_metadata_space(root, 5); 6669 trans = btrfs_start_transaction(root, 5);
5634 if (err) 6670 if (IS_ERR(trans))
5635 return err; 6671 return PTR_ERR(trans);
5636 6672
5637 trans = btrfs_start_transaction(root, 1);
5638 if (!trans)
5639 goto out_fail;
5640 btrfs_set_trans_block_group(trans, dir); 6673 btrfs_set_trans_block_group(trans, dir);
5641 6674
5642 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
5643 if (err) {
5644 err = -ENOSPC;
5645 goto out_unlock;
5646 }
5647
5648 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6675 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5649 dentry->d_name.len, 6676 dentry->d_name.len,
5650 dentry->d_parent->d_inode->i_ino, objectid, 6677 dentry->d_parent->d_inode->i_ino, objectid,
@@ -5716,8 +6743,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5716out_unlock: 6743out_unlock:
5717 nr = trans->blocks_used; 6744 nr = trans->blocks_used;
5718 btrfs_end_transaction_throttle(trans, root); 6745 btrfs_end_transaction_throttle(trans, root);
5719out_fail:
5720 btrfs_unreserve_metadata_space(root, 5);
5721 if (drop_inode) { 6746 if (drop_inode) {
5722 inode_dec_link_count(inode); 6747 inode_dec_link_count(inode);
5723 iput(inode); 6748 iput(inode);
@@ -5726,33 +6751,28 @@ out_fail:
5726 return err; 6751 return err;
5727} 6752}
5728 6753
5729static int prealloc_file_range(struct inode *inode, u64 start, u64 end, 6754int btrfs_prealloc_file_range(struct inode *inode, int mode,
5730 u64 alloc_hint, int mode, loff_t actual_len) 6755 u64 start, u64 num_bytes, u64 min_size,
6756 loff_t actual_len, u64 *alloc_hint)
5731{ 6757{
5732 struct btrfs_trans_handle *trans; 6758 struct btrfs_trans_handle *trans;
5733 struct btrfs_root *root = BTRFS_I(inode)->root; 6759 struct btrfs_root *root = BTRFS_I(inode)->root;
5734 struct btrfs_key ins; 6760 struct btrfs_key ins;
5735 u64 cur_offset = start; 6761 u64 cur_offset = start;
5736 u64 num_bytes = end - start;
5737 int ret = 0; 6762 int ret = 0;
5738 u64 i_size;
5739 6763
5740 while (num_bytes > 0) { 6764 while (num_bytes > 0) {
5741 trans = btrfs_start_transaction(root, 1); 6765 trans = btrfs_start_transaction(root, 3);
5742 6766 if (IS_ERR(trans)) {
5743 ret = btrfs_reserve_extent(trans, root, num_bytes, 6767 ret = PTR_ERR(trans);
5744 root->sectorsize, 0, alloc_hint, 6768 break;
5745 (u64)-1, &ins, 1);
5746 if (ret) {
5747 WARN_ON(1);
5748 goto stop_trans;
5749 } 6769 }
5750 6770
5751 ret = btrfs_reserve_metadata_space(root, 3); 6771 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
6772 0, *alloc_hint, (u64)-1, &ins, 1);
5752 if (ret) { 6773 if (ret) {
5753 btrfs_free_reserved_extent(root, ins.objectid, 6774 btrfs_end_transaction(trans, root);
5754 ins.offset); 6775 break;
5755 goto stop_trans;
5756 } 6776 }
5757 6777
5758 ret = insert_reserved_file_extent(trans, inode, 6778 ret = insert_reserved_file_extent(trans, inode,
@@ -5766,34 +6786,27 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
5766 6786
5767 num_bytes -= ins.offset; 6787 num_bytes -= ins.offset;
5768 cur_offset += ins.offset; 6788 cur_offset += ins.offset;
5769 alloc_hint = ins.objectid + ins.offset; 6789 *alloc_hint = ins.objectid + ins.offset;
5770 6790
5771 inode->i_ctime = CURRENT_TIME; 6791 inode->i_ctime = CURRENT_TIME;
5772 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 6792 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
5773 if (!(mode & FALLOC_FL_KEEP_SIZE) && 6793 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
5774 (actual_len > inode->i_size) && 6794 (actual_len > inode->i_size) &&
5775 (cur_offset > inode->i_size)) { 6795 (cur_offset > inode->i_size)) {
5776
5777 if (cur_offset > actual_len) 6796 if (cur_offset > actual_len)
5778 i_size = actual_len; 6797 i_size_write(inode, actual_len);
5779 else 6798 else
5780 i_size = cur_offset; 6799 i_size_write(inode, cur_offset);
5781 i_size_write(inode, i_size); 6800 i_size_write(inode, cur_offset);
5782 btrfs_ordered_update_i_size(inode, i_size, NULL); 6801 btrfs_ordered_update_i_size(inode, cur_offset, NULL);
5783 } 6802 }
5784 6803
5785 ret = btrfs_update_inode(trans, root, inode); 6804 ret = btrfs_update_inode(trans, root, inode);
5786 BUG_ON(ret); 6805 BUG_ON(ret);
5787 6806
5788 btrfs_end_transaction(trans, root); 6807 btrfs_end_transaction(trans, root);
5789 btrfs_unreserve_metadata_space(root, 3);
5790 } 6808 }
5791 return ret; 6809 return ret;
5792
5793stop_trans:
5794 btrfs_end_transaction(trans, root);
5795 return ret;
5796
5797} 6810}
5798 6811
5799static long btrfs_fallocate(struct inode *inode, int mode, 6812static long btrfs_fallocate(struct inode *inode, int mode,
@@ -5826,8 +6839,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5826 goto out; 6839 goto out;
5827 } 6840 }
5828 6841
5829 ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode, 6842 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
5830 alloc_end - alloc_start);
5831 if (ret) 6843 if (ret)
5832 goto out; 6844 goto out;
5833 6845
@@ -5872,16 +6884,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5872 if (em->block_start == EXTENT_MAP_HOLE || 6884 if (em->block_start == EXTENT_MAP_HOLE ||
5873 (cur_offset >= inode->i_size && 6885 (cur_offset >= inode->i_size &&
5874 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 6886 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5875 ret = prealloc_file_range(inode, 6887 ret = btrfs_prealloc_file_range(inode, 0, cur_offset,
5876 cur_offset, last_byte, 6888 last_byte - cur_offset,
5877 alloc_hint, mode, offset+len); 6889 1 << inode->i_blkbits,
6890 offset + len,
6891 &alloc_hint);
5878 if (ret < 0) { 6892 if (ret < 0) {
5879 free_extent_map(em); 6893 free_extent_map(em);
5880 break; 6894 break;
5881 } 6895 }
5882 } 6896 }
5883 if (em->block_start <= EXTENT_MAP_LAST_BYTE)
5884 alloc_hint = em->block_start;
5885 free_extent_map(em); 6897 free_extent_map(em);
5886 6898
5887 cur_offset = last_byte; 6899 cur_offset = last_byte;
@@ -5893,8 +6905,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5893 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 6905 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5894 &cached_state, GFP_NOFS); 6906 &cached_state, GFP_NOFS);
5895 6907
5896 btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, 6908 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
5897 alloc_end - alloc_start);
5898out: 6909out:
5899 mutex_unlock(&inode->i_mutex); 6910 mutex_unlock(&inode->i_mutex);
5900 return ret; 6911 return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 97a97839a867..4cdb98cf26de 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -239,23 +239,19 @@ static noinline int create_subvol(struct btrfs_root *root,
239 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; 239 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
240 u64 index = 0; 240 u64 index = 0;
241 241
242 ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
243 0, &objectid);
244 if (ret)
245 return ret;
242 /* 246 /*
243 * 1 - inode item 247 * 1 - inode item
244 * 2 - refs 248 * 2 - refs
245 * 1 - root item 249 * 1 - root item
246 * 2 - dir items 250 * 2 - dir items
247 */ 251 */
248 ret = btrfs_reserve_metadata_space(root, 6); 252 trans = btrfs_start_transaction(root, 6);
249 if (ret) 253 if (IS_ERR(trans))
250 return ret; 254 return PTR_ERR(trans);
251
252 trans = btrfs_start_transaction(root, 1);
253 BUG_ON(!trans);
254
255 ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
256 0, &objectid);
257 if (ret)
258 goto fail;
259 255
260 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 256 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
261 0, objectid, NULL, 0, 0, 0); 257 0, objectid, NULL, 0, 0, 0);
@@ -345,13 +341,10 @@ fail:
345 err = btrfs_commit_transaction(trans, root); 341 err = btrfs_commit_transaction(trans, root);
346 if (err && !ret) 342 if (err && !ret)
347 ret = err; 343 ret = err;
348
349 btrfs_unreserve_metadata_space(root, 6);
350 return ret; 344 return ret;
351} 345}
352 346
353static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 347static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
354 char *name, int namelen)
355{ 348{
356 struct inode *inode; 349 struct inode *inode;
357 struct btrfs_pending_snapshot *pending_snapshot; 350 struct btrfs_pending_snapshot *pending_snapshot;
@@ -361,40 +354,33 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
361 if (!root->ref_cows) 354 if (!root->ref_cows)
362 return -EINVAL; 355 return -EINVAL;
363 356
364 /*
365 * 1 - inode item
366 * 2 - refs
367 * 1 - root item
368 * 2 - dir items
369 */
370 ret = btrfs_reserve_metadata_space(root, 6);
371 if (ret)
372 goto fail;
373
374 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 357 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
375 if (!pending_snapshot) { 358 if (!pending_snapshot)
376 ret = -ENOMEM; 359 return -ENOMEM;
377 btrfs_unreserve_metadata_space(root, 6); 360
378 goto fail; 361 btrfs_init_block_rsv(&pending_snapshot->block_rsv);
379 }
380 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
381 if (!pending_snapshot->name) {
382 ret = -ENOMEM;
383 kfree(pending_snapshot);
384 btrfs_unreserve_metadata_space(root, 6);
385 goto fail;
386 }
387 memcpy(pending_snapshot->name, name, namelen);
388 pending_snapshot->name[namelen] = '\0';
389 pending_snapshot->dentry = dentry; 362 pending_snapshot->dentry = dentry;
390 trans = btrfs_start_transaction(root, 1);
391 BUG_ON(!trans);
392 pending_snapshot->root = root; 363 pending_snapshot->root = root;
364
365 trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
366 if (IS_ERR(trans)) {
367 ret = PTR_ERR(trans);
368 goto fail;
369 }
370
371 ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
372 BUG_ON(ret);
373
393 list_add(&pending_snapshot->list, 374 list_add(&pending_snapshot->list,
394 &trans->transaction->pending_snapshots); 375 &trans->transaction->pending_snapshots);
395 ret = btrfs_commit_transaction(trans, root); 376 ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
396 BUG_ON(ret); 377 BUG_ON(ret);
397 btrfs_unreserve_metadata_space(root, 6); 378
379 ret = pending_snapshot->error;
380 if (ret)
381 goto fail;
382
383 btrfs_orphan_cleanup(pending_snapshot->snap);
398 384
399 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 385 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
400 if (IS_ERR(inode)) { 386 if (IS_ERR(inode)) {
@@ -405,6 +391,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
405 d_instantiate(dentry, inode); 391 d_instantiate(dentry, inode);
406 ret = 0; 392 ret = 0;
407fail: 393fail:
394 kfree(pending_snapshot);
408 return ret; 395 return ret;
409} 396}
410 397
@@ -456,8 +443,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
456 goto out_up_read; 443 goto out_up_read;
457 444
458 if (snap_src) { 445 if (snap_src) {
459 error = create_snapshot(snap_src, dentry, 446 error = create_snapshot(snap_src, dentry);
460 name, namelen);
461 } else { 447 } else {
462 error = create_subvol(BTRFS_I(dir)->root, dentry, 448 error = create_subvol(BTRFS_I(dir)->root, dentry,
463 name, namelen); 449 name, namelen);
@@ -601,19 +587,9 @@ static int btrfs_defrag_file(struct file *file,
601 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 587 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
602 BTRFS_I(inode)->force_compress = 1; 588 BTRFS_I(inode)->force_compress = 1;
603 589
604 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 590 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
605 if (ret) { 591 if (ret)
606 ret = -ENOSPC; 592 goto err_unlock;
607 break;
608 }
609
610 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
611 if (ret) {
612 btrfs_free_reserved_data_space(root, inode,
613 PAGE_CACHE_SIZE);
614 ret = -ENOSPC;
615 break;
616 }
617again: 593again:
618 if (inode->i_size == 0 || 594 if (inode->i_size == 0 ||
619 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) { 595 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
@@ -622,8 +598,10 @@ again:
622 } 598 }
623 599
624 page = grab_cache_page(inode->i_mapping, i); 600 page = grab_cache_page(inode->i_mapping, i);
625 if (!page) 601 if (!page) {
602 ret = -ENOMEM;
626 goto err_reservations; 603 goto err_reservations;
604 }
627 605
628 if (!PageUptodate(page)) { 606 if (!PageUptodate(page)) {
629 btrfs_readpage(NULL, page); 607 btrfs_readpage(NULL, page);
@@ -631,6 +609,7 @@ again:
631 if (!PageUptodate(page)) { 609 if (!PageUptodate(page)) {
632 unlock_page(page); 610 unlock_page(page);
633 page_cache_release(page); 611 page_cache_release(page);
612 ret = -EIO;
634 goto err_reservations; 613 goto err_reservations;
635 } 614 }
636 } 615 }
@@ -644,8 +623,7 @@ again:
644 wait_on_page_writeback(page); 623 wait_on_page_writeback(page);
645 624
646 if (PageDirty(page)) { 625 if (PageDirty(page)) {
647 btrfs_free_reserved_data_space(root, inode, 626 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
648 PAGE_CACHE_SIZE);
649 goto loop_unlock; 627 goto loop_unlock;
650 } 628 }
651 629
@@ -683,7 +661,6 @@ loop_unlock:
683 page_cache_release(page); 661 page_cache_release(page);
684 mutex_unlock(&inode->i_mutex); 662 mutex_unlock(&inode->i_mutex);
685 663
686 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
687 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); 664 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
688 i++; 665 i++;
689 } 666 }
@@ -713,9 +690,9 @@ loop_unlock:
713 return 0; 690 return 0;
714 691
715err_reservations: 692err_reservations:
693 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
694err_unlock:
716 mutex_unlock(&inode->i_mutex); 695 mutex_unlock(&inode->i_mutex);
717 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
718 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
719 return ret; 696 return ret;
720} 697}
721 698
@@ -811,7 +788,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
811 device->name, (unsigned long long)new_size); 788 device->name, (unsigned long long)new_size);
812 789
813 if (new_size > old_size) { 790 if (new_size > old_size) {
814 trans = btrfs_start_transaction(root, 1); 791 trans = btrfs_start_transaction(root, 0);
815 ret = btrfs_grow_device(trans, device, new_size); 792 ret = btrfs_grow_device(trans, device, new_size);
816 btrfs_commit_transaction(trans, root); 793 btrfs_commit_transaction(trans, root);
817 } else { 794 } else {
@@ -1300,7 +1277,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1300 if (err) 1277 if (err)
1301 goto out_up_write; 1278 goto out_up_write;
1302 1279
1303 trans = btrfs_start_transaction(root, 1); 1280 trans = btrfs_start_transaction(root, 0);
1281 if (IS_ERR(trans)) {
1282 err = PTR_ERR(trans);
1283 goto out;
1284 }
1285 trans->block_rsv = &root->fs_info->global_block_rsv;
1286
1304 ret = btrfs_unlink_subvol(trans, root, dir, 1287 ret = btrfs_unlink_subvol(trans, root, dir,
1305 dest->root_key.objectid, 1288 dest->root_key.objectid,
1306 dentry->d_name.name, 1289 dentry->d_name.name,
@@ -1314,10 +1297,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1314 dest->root_item.drop_level = 0; 1297 dest->root_item.drop_level = 0;
1315 btrfs_set_root_refs(&dest->root_item, 0); 1298 btrfs_set_root_refs(&dest->root_item, 0);
1316 1299
1317 ret = btrfs_insert_orphan_item(trans, 1300 if (!xchg(&dest->orphan_item_inserted, 1)) {
1318 root->fs_info->tree_root, 1301 ret = btrfs_insert_orphan_item(trans,
1319 dest->root_key.objectid); 1302 root->fs_info->tree_root,
1320 BUG_ON(ret); 1303 dest->root_key.objectid);
1304 BUG_ON(ret);
1305 }
1321 1306
1322 ret = btrfs_commit_transaction(trans, root); 1307 ret = btrfs_commit_transaction(trans, root);
1323 BUG_ON(ret); 1308 BUG_ON(ret);
@@ -1358,8 +1343,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1358 ret = -EPERM; 1343 ret = -EPERM;
1359 goto out; 1344 goto out;
1360 } 1345 }
1361 btrfs_defrag_root(root, 0); 1346 ret = btrfs_defrag_root(root, 0);
1362 btrfs_defrag_root(root->fs_info->extent_root, 0); 1347 if (ret)
1348 goto out;
1349 ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
1363 break; 1350 break;
1364 case S_IFREG: 1351 case S_IFREG:
1365 if (!(file->f_mode & FMODE_WRITE)) { 1352 if (!(file->f_mode & FMODE_WRITE)) {
@@ -1389,9 +1376,11 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1389 /* the rest are all set to zero by kzalloc */ 1376 /* the rest are all set to zero by kzalloc */
1390 range->len = (u64)-1; 1377 range->len = (u64)-1;
1391 } 1378 }
1392 btrfs_defrag_file(file, range); 1379 ret = btrfs_defrag_file(file, range);
1393 kfree(range); 1380 kfree(range);
1394 break; 1381 break;
1382 default:
1383 ret = -EINVAL;
1395 } 1384 }
1396out: 1385out:
1397 mnt_drop_write(file->f_path.mnt); 1386 mnt_drop_write(file->f_path.mnt);
@@ -1550,12 +1539,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1550 btrfs_wait_ordered_range(src, off, off+len); 1539 btrfs_wait_ordered_range(src, off, off+len);
1551 } 1540 }
1552 1541
1553 trans = btrfs_start_transaction(root, 1);
1554 BUG_ON(!trans);
1555
1556 /* punch hole in destination first */
1557 btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
1558
1559 /* clone data */ 1542 /* clone data */
1560 key.objectid = src->i_ino; 1543 key.objectid = src->i_ino;
1561 key.type = BTRFS_EXTENT_DATA_KEY; 1544 key.type = BTRFS_EXTENT_DATA_KEY;
@@ -1566,7 +1549,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1566 * note the key will change type as we walk through the 1549 * note the key will change type as we walk through the
1567 * tree. 1550 * tree.
1568 */ 1551 */
1569 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 1552 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1570 if (ret < 0) 1553 if (ret < 0)
1571 goto out; 1554 goto out;
1572 1555
@@ -1629,12 +1612,31 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1629 new_key.objectid = inode->i_ino; 1612 new_key.objectid = inode->i_ino;
1630 new_key.offset = key.offset + destoff - off; 1613 new_key.offset = key.offset + destoff - off;
1631 1614
1615 trans = btrfs_start_transaction(root, 1);
1616 if (IS_ERR(trans)) {
1617 ret = PTR_ERR(trans);
1618 goto out;
1619 }
1620
1632 if (type == BTRFS_FILE_EXTENT_REG || 1621 if (type == BTRFS_FILE_EXTENT_REG ||
1633 type == BTRFS_FILE_EXTENT_PREALLOC) { 1622 type == BTRFS_FILE_EXTENT_PREALLOC) {
1623 if (off > key.offset) {
1624 datao += off - key.offset;
1625 datal -= off - key.offset;
1626 }
1627
1628 if (key.offset + datal > off + len)
1629 datal = off + len - key.offset;
1630
1631 ret = btrfs_drop_extents(trans, inode,
1632 new_key.offset,
1633 new_key.offset + datal,
1634 &hint_byte, 1);
1635 BUG_ON(ret);
1636
1634 ret = btrfs_insert_empty_item(trans, root, path, 1637 ret = btrfs_insert_empty_item(trans, root, path,
1635 &new_key, size); 1638 &new_key, size);
1636 if (ret) 1639 BUG_ON(ret);
1637 goto out;
1638 1640
1639 leaf = path->nodes[0]; 1641 leaf = path->nodes[0];
1640 slot = path->slots[0]; 1642 slot = path->slots[0];
@@ -1645,14 +1647,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1645 extent = btrfs_item_ptr(leaf, slot, 1647 extent = btrfs_item_ptr(leaf, slot,
1646 struct btrfs_file_extent_item); 1648 struct btrfs_file_extent_item);
1647 1649
1648 if (off > key.offset) {
1649 datao += off - key.offset;
1650 datal -= off - key.offset;
1651 }
1652
1653 if (key.offset + datal > off + len)
1654 datal = off + len - key.offset;
1655
1656 /* disko == 0 means it's a hole */ 1650 /* disko == 0 means it's a hole */
1657 if (!disko) 1651 if (!disko)
1658 datao = 0; 1652 datao = 0;
@@ -1683,14 +1677,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1683 1677
1684 if (comp && (skip || trim)) { 1678 if (comp && (skip || trim)) {
1685 ret = -EINVAL; 1679 ret = -EINVAL;
1680 btrfs_end_transaction(trans, root);
1686 goto out; 1681 goto out;
1687 } 1682 }
1688 size -= skip + trim; 1683 size -= skip + trim;
1689 datal -= skip + trim; 1684 datal -= skip + trim;
1685
1686 ret = btrfs_drop_extents(trans, inode,
1687 new_key.offset,
1688 new_key.offset + datal,
1689 &hint_byte, 1);
1690 BUG_ON(ret);
1691
1690 ret = btrfs_insert_empty_item(trans, root, path, 1692 ret = btrfs_insert_empty_item(trans, root, path,
1691 &new_key, size); 1693 &new_key, size);
1692 if (ret) 1694 BUG_ON(ret);
1693 goto out;
1694 1695
1695 if (skip) { 1696 if (skip) {
1696 u32 start = 1697 u32 start =
@@ -1708,8 +1709,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1708 } 1709 }
1709 1710
1710 btrfs_mark_buffer_dirty(leaf); 1711 btrfs_mark_buffer_dirty(leaf);
1711 } 1712 btrfs_release_path(root, path);
1712 1713
1714 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1715 if (new_key.offset + datal > inode->i_size)
1716 btrfs_i_size_write(inode,
1717 new_key.offset + datal);
1718 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1719 ret = btrfs_update_inode(trans, root, inode);
1720 BUG_ON(ret);
1721 btrfs_end_transaction(trans, root);
1722 }
1713next: 1723next:
1714 btrfs_release_path(root, path); 1724 btrfs_release_path(root, path);
1715 key.offset++; 1725 key.offset++;
@@ -1717,17 +1727,7 @@ next:
1717 ret = 0; 1727 ret = 0;
1718out: 1728out:
1719 btrfs_release_path(root, path); 1729 btrfs_release_path(root, path);
1720 if (ret == 0) {
1721 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1722 if (destoff + olen > inode->i_size)
1723 btrfs_i_size_write(inode, destoff + olen);
1724 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1725 ret = btrfs_update_inode(trans, root, inode);
1726 }
1727 btrfs_end_transaction(trans, root);
1728 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 1730 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1729 if (ret)
1730 vmtruncate(inode, 0);
1731out_unlock: 1731out_unlock:
1732 mutex_unlock(&src->i_mutex); 1732 mutex_unlock(&src->i_mutex);
1733 mutex_unlock(&inode->i_mutex); 1733 mutex_unlock(&inode->i_mutex);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a127c0ebb2dc..e56c72bc5add 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -124,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
124 return 1; 124 return 1;
125} 125}
126 126
127static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
128 u64 len)
129{
130 if (file_offset + len <= entry->file_offset ||
131 entry->file_offset + entry->len <= file_offset)
132 return 0;
133 return 1;
134}
135
127/* 136/*
128 * look find the first ordered struct that has this offset, otherwise 137 * look find the first ordered struct that has this offset, otherwise
129 * the first one less than this offset 138 * the first one less than this offset
@@ -161,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
161 * The tree is given a single reference on the ordered extent that was 170 * The tree is given a single reference on the ordered extent that was
162 * inserted. 171 * inserted.
163 */ 172 */
164int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
165 u64 start, u64 len, u64 disk_len, int type) 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio)
166{ 176{
167 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
168 struct rb_node *node; 178 struct rb_node *node;
@@ -182,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
182 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
183 set_bit(type, &entry->flags); 193 set_bit(type, &entry->flags);
184 194
195 if (dio)
196 set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
197
185 /* one ref for the tree */ 198 /* one ref for the tree */
186 atomic_set(&entry->refs, 1); 199 atomic_set(&entry->refs, 1);
187 init_waitqueue_head(&entry->wait); 200 init_waitqueue_head(&entry->wait);
@@ -203,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
203 return 0; 216 return 0;
204} 217}
205 218
219int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type)
221{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0);
224}
225
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type)
228{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1);
231}
232
206/* 233/*
207 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted 234 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
208 * when an ordered extent is finished. If the list covers more than one 235 * when an ordered extent is finished. If the list covers more than one
@@ -311,13 +338,6 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
311 tree->last = NULL; 338 tree->last = NULL;
312 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 339 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
313 340
314 spin_lock(&BTRFS_I(inode)->accounting_lock);
315 WARN_ON(!BTRFS_I(inode)->outstanding_extents);
316 BTRFS_I(inode)->outstanding_extents--;
317 spin_unlock(&BTRFS_I(inode)->accounting_lock);
318 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
319 inode, 1);
320
321 spin_lock(&root->fs_info->ordered_extent_lock); 341 spin_lock(&root->fs_info->ordered_extent_lock);
322 list_del_init(&entry->root_extent_list); 342 list_del_init(&entry->root_extent_list);
323 343
@@ -491,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
491 * start IO on any dirty ones so the wait doesn't stall waiting 511 * start IO on any dirty ones so the wait doesn't stall waiting
492 * for pdflush to find them 512 * for pdflush to find them
493 */ 513 */
494 filemap_fdatawrite_range(inode->i_mapping, start, end); 514 if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
515 filemap_fdatawrite_range(inode->i_mapping, start, end);
495 if (wait) { 516 if (wait) {
496 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 517 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
497 &entry->flags)); 518 &entry->flags));
@@ -588,6 +609,47 @@ out:
588 return entry; 609 return entry;
589} 610}
590 611
612/* Since the DIO code tries to lock a wide area we need to look for any ordered
613 * extents that exist in the range, rather than just the start of the range.
614 */
615struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
616 u64 file_offset,
617 u64 len)
618{
619 struct btrfs_ordered_inode_tree *tree;
620 struct rb_node *node;
621 struct btrfs_ordered_extent *entry = NULL;
622
623 tree = &BTRFS_I(inode)->ordered_tree;
624 spin_lock(&tree->lock);
625 node = tree_search(tree, file_offset);
626 if (!node) {
627 node = tree_search(tree, file_offset + len);
628 if (!node)
629 goto out;
630 }
631
632 while (1) {
633 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
634 if (range_overlaps(entry, file_offset, len))
635 break;
636
637 if (entry->file_offset >= file_offset + len) {
638 entry = NULL;
639 break;
640 }
641 entry = NULL;
642 node = rb_next(node);
643 if (!node)
644 break;
645 }
646out:
647 if (entry)
648 atomic_inc(&entry->refs);
649 spin_unlock(&tree->lock);
650 return entry;
651}
652
591/* 653/*
592 * lookup and return any extent before 'file_offset'. NULL is returned 654 * lookup and return any extent before 'file_offset'. NULL is returned
593 * if none is found 655 * if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c82f76a9f040..8ac365492a3f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -72,6 +72,8 @@ struct btrfs_ordered_sum {
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
75#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
76
75struct btrfs_ordered_extent { 77struct btrfs_ordered_extent {
76 /* logical offset in the file */ 78 /* logical offset in the file */
77 u64 file_offset; 79 u64 file_offset;
@@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
140 struct btrfs_ordered_extent **cached, 142 struct btrfs_ordered_extent **cached,
141 u64 file_offset, u64 io_size); 143 u64 file_offset, u64 io_size);
142int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 144int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
143 u64 start, u64 len, u64 disk_len, int tyep); 145 u64 start, u64 len, u64 disk_len, int type);
146int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
147 u64 start, u64 len, u64 disk_len, int type);
144int btrfs_add_ordered_sum(struct inode *inode, 148int btrfs_add_ordered_sum(struct inode *inode,
145 struct btrfs_ordered_extent *entry, 149 struct btrfs_ordered_extent *entry,
146 struct btrfs_ordered_sum *sum); 150 struct btrfs_ordered_sum *sum);
@@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
151int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); 155int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
152struct btrfs_ordered_extent * 156struct btrfs_ordered_extent *
153btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); 157btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
158struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
159 u64 file_offset,
160 u64 len);
154int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 161int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
155 struct btrfs_ordered_extent *ordered); 162 struct btrfs_ordered_extent *ordered);
156int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 163int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e558dd941ded..05d41e569236 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -44,8 +44,12 @@ struct tree_entry {
44struct backref_node { 44struct backref_node {
45 struct rb_node rb_node; 45 struct rb_node rb_node;
46 u64 bytenr; 46 u64 bytenr;
47 /* objectid tree block owner */ 47
48 u64 new_bytenr;
49 /* objectid of tree block owner, can be not uptodate */
48 u64 owner; 50 u64 owner;
51 /* link to pending, changed or detached list */
52 struct list_head list;
49 /* list of upper level blocks reference this block */ 53 /* list of upper level blocks reference this block */
50 struct list_head upper; 54 struct list_head upper;
51 /* list of child blocks in the cache */ 55 /* list of child blocks in the cache */
@@ -56,9 +60,9 @@ struct backref_node {
56 struct extent_buffer *eb; 60 struct extent_buffer *eb;
57 /* level of tree block */ 61 /* level of tree block */
58 unsigned int level:8; 62 unsigned int level:8;
59 /* 1 if the block is root of old snapshot */ 63 /* is the block in non-reference counted tree */
60 unsigned int old_root:1; 64 unsigned int cowonly:1;
61 /* 1 if no child blocks in the cache */ 65 /* 1 if no child node in the cache */
62 unsigned int lowest:1; 66 unsigned int lowest:1;
63 /* is the extent buffer locked */ 67 /* is the extent buffer locked */
64 unsigned int locked:1; 68 unsigned int locked:1;
@@ -66,6 +70,16 @@ struct backref_node {
66 unsigned int processed:1; 70 unsigned int processed:1;
67 /* have backrefs of this block been checked */ 71 /* have backrefs of this block been checked */
68 unsigned int checked:1; 72 unsigned int checked:1;
73 /*
74 * 1 if corresponding block has been cowed but some upper
75 * level block pointers may not point to the new location
76 */
77 unsigned int pending:1;
78 /*
79 * 1 if the backref node isn't connected to any other
80 * backref node.
81 */
82 unsigned int detached:1;
69}; 83};
70 84
71/* 85/*
@@ -74,7 +88,6 @@ struct backref_node {
74struct backref_edge { 88struct backref_edge {
75 struct list_head list[2]; 89 struct list_head list[2];
76 struct backref_node *node[2]; 90 struct backref_node *node[2];
77 u64 blockptr;
78}; 91};
79 92
80#define LOWER 0 93#define LOWER 0
@@ -83,9 +96,25 @@ struct backref_edge {
83struct backref_cache { 96struct backref_cache {
84 /* red black tree of all backref nodes in the cache */ 97 /* red black tree of all backref nodes in the cache */
85 struct rb_root rb_root; 98 struct rb_root rb_root;
86 /* list of backref nodes with no child block in the cache */ 99 /* for passing backref nodes to btrfs_reloc_cow_block */
100 struct backref_node *path[BTRFS_MAX_LEVEL];
101 /*
102 * list of blocks that have been cowed but some block
103 * pointers in upper level blocks may not reflect the
104 * new location
105 */
87 struct list_head pending[BTRFS_MAX_LEVEL]; 106 struct list_head pending[BTRFS_MAX_LEVEL];
88 spinlock_t lock; 107 /* list of backref nodes with no child node */
108 struct list_head leaves;
109 /* list of blocks that have been cowed in current transaction */
110 struct list_head changed;
111 /* list of detached backref node. */
112 struct list_head detached;
113
114 u64 last_trans;
115
116 int nr_nodes;
117 int nr_edges;
89}; 118};
90 119
91/* 120/*
@@ -113,15 +142,6 @@ struct tree_block {
113 unsigned int key_ready:1; 142 unsigned int key_ready:1;
114}; 143};
115 144
116/* inode vector */
117#define INODEVEC_SIZE 16
118
119struct inodevec {
120 struct list_head list;
121 struct inode *inode[INODEVEC_SIZE];
122 int nr;
123};
124
125#define MAX_EXTENTS 128 145#define MAX_EXTENTS 128
126 146
127struct file_extent_cluster { 147struct file_extent_cluster {
@@ -138,36 +158,43 @@ struct reloc_control {
138 struct btrfs_root *extent_root; 158 struct btrfs_root *extent_root;
139 /* inode for moving data */ 159 /* inode for moving data */
140 struct inode *data_inode; 160 struct inode *data_inode;
141 struct btrfs_workers workers; 161
162 struct btrfs_block_rsv *block_rsv;
163
164 struct backref_cache backref_cache;
165
166 struct file_extent_cluster cluster;
142 /* tree blocks have been processed */ 167 /* tree blocks have been processed */
143 struct extent_io_tree processed_blocks; 168 struct extent_io_tree processed_blocks;
144 /* map start of tree root to corresponding reloc tree */ 169 /* map start of tree root to corresponding reloc tree */
145 struct mapping_tree reloc_root_tree; 170 struct mapping_tree reloc_root_tree;
146 /* list of reloc trees */ 171 /* list of reloc trees */
147 struct list_head reloc_roots; 172 struct list_head reloc_roots;
173 /* size of metadata reservation for merging reloc trees */
174 u64 merging_rsv_size;
175 /* size of relocated tree nodes */
176 u64 nodes_relocated;
177
148 u64 search_start; 178 u64 search_start;
149 u64 extents_found; 179 u64 extents_found;
150 u64 extents_skipped; 180
151 int stage; 181 int block_rsv_retries;
152 int create_reloc_root; 182
183 unsigned int stage:8;
184 unsigned int create_reloc_tree:1;
185 unsigned int merge_reloc_tree:1;
153 unsigned int found_file_extent:1; 186 unsigned int found_file_extent:1;
154 unsigned int found_old_snapshot:1; 187 unsigned int commit_transaction:1;
155}; 188};
156 189
157/* stages of data relocation */ 190/* stages of data relocation */
158#define MOVE_DATA_EXTENTS 0 191#define MOVE_DATA_EXTENTS 0
159#define UPDATE_DATA_PTRS 1 192#define UPDATE_DATA_PTRS 1
160 193
161/* 194static void remove_backref_node(struct backref_cache *cache,
162 * merge reloc tree to corresponding fs tree in worker threads 195 struct backref_node *node);
163 */ 196static void __mark_block_processed(struct reloc_control *rc,
164struct async_merge { 197 struct backref_node *node);
165 struct btrfs_work work;
166 struct reloc_control *rc;
167 struct btrfs_root *root;
168 struct completion *done;
169 atomic_t *num_pending;
170};
171 198
172static void mapping_tree_init(struct mapping_tree *tree) 199static void mapping_tree_init(struct mapping_tree *tree)
173{ 200{
@@ -181,15 +208,80 @@ static void backref_cache_init(struct backref_cache *cache)
181 cache->rb_root = RB_ROOT; 208 cache->rb_root = RB_ROOT;
182 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 209 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
183 INIT_LIST_HEAD(&cache->pending[i]); 210 INIT_LIST_HEAD(&cache->pending[i]);
184 spin_lock_init(&cache->lock); 211 INIT_LIST_HEAD(&cache->changed);
212 INIT_LIST_HEAD(&cache->detached);
213 INIT_LIST_HEAD(&cache->leaves);
214}
215
216static void backref_cache_cleanup(struct backref_cache *cache)
217{
218 struct backref_node *node;
219 int i;
220
221 while (!list_empty(&cache->detached)) {
222 node = list_entry(cache->detached.next,
223 struct backref_node, list);
224 remove_backref_node(cache, node);
225 }
226
227 while (!list_empty(&cache->leaves)) {
228 node = list_entry(cache->leaves.next,
229 struct backref_node, lower);
230 remove_backref_node(cache, node);
231 }
232
233 cache->last_trans = 0;
234
235 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
236 BUG_ON(!list_empty(&cache->pending[i]));
237 BUG_ON(!list_empty(&cache->changed));
238 BUG_ON(!list_empty(&cache->detached));
239 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
240 BUG_ON(cache->nr_nodes);
241 BUG_ON(cache->nr_edges);
242}
243
244static struct backref_node *alloc_backref_node(struct backref_cache *cache)
245{
246 struct backref_node *node;
247
248 node = kzalloc(sizeof(*node), GFP_NOFS);
249 if (node) {
250 INIT_LIST_HEAD(&node->list);
251 INIT_LIST_HEAD(&node->upper);
252 INIT_LIST_HEAD(&node->lower);
253 RB_CLEAR_NODE(&node->rb_node);
254 cache->nr_nodes++;
255 }
256 return node;
257}
258
259static void free_backref_node(struct backref_cache *cache,
260 struct backref_node *node)
261{
262 if (node) {
263 cache->nr_nodes--;
264 kfree(node);
265 }
266}
267
268static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
269{
270 struct backref_edge *edge;
271
272 edge = kzalloc(sizeof(*edge), GFP_NOFS);
273 if (edge)
274 cache->nr_edges++;
275 return edge;
185} 276}
186 277
187static void backref_node_init(struct backref_node *node) 278static void free_backref_edge(struct backref_cache *cache,
279 struct backref_edge *edge)
188{ 280{
189 memset(node, 0, sizeof(*node)); 281 if (edge) {
190 INIT_LIST_HEAD(&node->upper); 282 cache->nr_edges--;
191 INIT_LIST_HEAD(&node->lower); 283 kfree(edge);
192 RB_CLEAR_NODE(&node->rb_node); 284 }
193} 285}
194 286
195static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, 287static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
@@ -250,6 +342,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node,
250 edges[idx++] = edge; 342 edges[idx++] = edge;
251 node = edge->node[UPPER]; 343 node = edge->node[UPPER];
252 } 344 }
345 BUG_ON(node->detached);
253 *index = idx; 346 *index = idx;
254 return node; 347 return node;
255} 348}
@@ -281,13 +374,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[],
281 return NULL; 374 return NULL;
282} 375}
283 376
377static void unlock_node_buffer(struct backref_node *node)
378{
379 if (node->locked) {
380 btrfs_tree_unlock(node->eb);
381 node->locked = 0;
382 }
383}
384
284static void drop_node_buffer(struct backref_node *node) 385static void drop_node_buffer(struct backref_node *node)
285{ 386{
286 if (node->eb) { 387 if (node->eb) {
287 if (node->locked) { 388 unlock_node_buffer(node);
288 btrfs_tree_unlock(node->eb);
289 node->locked = 0;
290 }
291 free_extent_buffer(node->eb); 389 free_extent_buffer(node->eb);
292 node->eb = NULL; 390 node->eb = NULL;
293 } 391 }
@@ -296,14 +394,14 @@ static void drop_node_buffer(struct backref_node *node)
296static void drop_backref_node(struct backref_cache *tree, 394static void drop_backref_node(struct backref_cache *tree,
297 struct backref_node *node) 395 struct backref_node *node)
298{ 396{
299 BUG_ON(!node->lowest);
300 BUG_ON(!list_empty(&node->upper)); 397 BUG_ON(!list_empty(&node->upper));
301 398
302 drop_node_buffer(node); 399 drop_node_buffer(node);
400 list_del(&node->list);
303 list_del(&node->lower); 401 list_del(&node->lower);
304 402 if (!RB_EMPTY_NODE(&node->rb_node))
305 rb_erase(&node->rb_node, &tree->rb_root); 403 rb_erase(&node->rb_node, &tree->rb_root);
306 kfree(node); 404 free_backref_node(tree, node);
307} 405}
308 406
309/* 407/*
@@ -318,27 +416,121 @@ static void remove_backref_node(struct backref_cache *cache,
318 if (!node) 416 if (!node)
319 return; 417 return;
320 418
321 BUG_ON(!node->lowest); 419 BUG_ON(!node->lowest && !node->detached);
322 while (!list_empty(&node->upper)) { 420 while (!list_empty(&node->upper)) {
323 edge = list_entry(node->upper.next, struct backref_edge, 421 edge = list_entry(node->upper.next, struct backref_edge,
324 list[LOWER]); 422 list[LOWER]);
325 upper = edge->node[UPPER]; 423 upper = edge->node[UPPER];
326 list_del(&edge->list[LOWER]); 424 list_del(&edge->list[LOWER]);
327 list_del(&edge->list[UPPER]); 425 list_del(&edge->list[UPPER]);
328 kfree(edge); 426 free_backref_edge(cache, edge);
427
428 if (RB_EMPTY_NODE(&upper->rb_node)) {
429 BUG_ON(!list_empty(&node->upper));
430 drop_backref_node(cache, node);
431 node = upper;
432 node->lowest = 1;
433 continue;
434 }
329 /* 435 /*
330 * add the node to pending list if no other 436 * add the node to leaf node list if no other
331 * child block cached. 437 * child block cached.
332 */ 438 */
333 if (list_empty(&upper->lower)) { 439 if (list_empty(&upper->lower)) {
334 list_add_tail(&upper->lower, 440 list_add_tail(&upper->lower, &cache->leaves);
335 &cache->pending[upper->level]);
336 upper->lowest = 1; 441 upper->lowest = 1;
337 } 442 }
338 } 443 }
444
339 drop_backref_node(cache, node); 445 drop_backref_node(cache, node);
340} 446}
341 447
448static void update_backref_node(struct backref_cache *cache,
449 struct backref_node *node, u64 bytenr)
450{
451 struct rb_node *rb_node;
452 rb_erase(&node->rb_node, &cache->rb_root);
453 node->bytenr = bytenr;
454 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
455 BUG_ON(rb_node);
456}
457
458/*
459 * update backref cache after a transaction commit
460 */
461static int update_backref_cache(struct btrfs_trans_handle *trans,
462 struct backref_cache *cache)
463{
464 struct backref_node *node;
465 int level = 0;
466
467 if (cache->last_trans == 0) {
468 cache->last_trans = trans->transid;
469 return 0;
470 }
471
472 if (cache->last_trans == trans->transid)
473 return 0;
474
475 /*
476 * detached nodes are used to avoid unnecessary backref
477 * lookup. transaction commit changes the extent tree.
478 * so the detached nodes are no longer useful.
479 */
480 while (!list_empty(&cache->detached)) {
481 node = list_entry(cache->detached.next,
482 struct backref_node, list);
483 remove_backref_node(cache, node);
484 }
485
486 while (!list_empty(&cache->changed)) {
487 node = list_entry(cache->changed.next,
488 struct backref_node, list);
489 list_del_init(&node->list);
490 BUG_ON(node->pending);
491 update_backref_node(cache, node, node->new_bytenr);
492 }
493
494 /*
495 * some nodes can be left in the pending list if there were
496 * errors during processing the pending nodes.
497 */
498 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
499 list_for_each_entry(node, &cache->pending[level], list) {
500 BUG_ON(!node->pending);
501 if (node->bytenr == node->new_bytenr)
502 continue;
503 update_backref_node(cache, node, node->new_bytenr);
504 }
505 }
506
507 cache->last_trans = 0;
508 return 1;
509}
510
511static int should_ignore_root(struct btrfs_root *root)
512{
513 struct btrfs_root *reloc_root;
514
515 if (!root->ref_cows)
516 return 0;
517
518 reloc_root = root->reloc_root;
519 if (!reloc_root)
520 return 0;
521
522 if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
523 root->fs_info->running_transaction->transid - 1)
524 return 0;
525 /*
526 * if there is reloc tree and it was created in previous
527 * transaction backref lookup can find the reloc tree,
528 * so backref node for the fs tree root is useless for
529 * relocation.
530 */
531 return 1;
532}
533
342/* 534/*
343 * find reloc tree by address of tree root 535 * find reloc tree by address of tree root
344 */ 536 */
@@ -453,11 +645,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
453 * for all upper level blocks that directly/indirectly reference the 645 * for all upper level blocks that directly/indirectly reference the
454 * block are also cached. 646 * block are also cached.
455 */ 647 */
456static struct backref_node *build_backref_tree(struct reloc_control *rc, 648static noinline_for_stack
457 struct backref_cache *cache, 649struct backref_node *build_backref_tree(struct reloc_control *rc,
458 struct btrfs_key *node_key, 650 struct btrfs_key *node_key,
459 int level, u64 bytenr) 651 int level, u64 bytenr)
460{ 652{
653 struct backref_cache *cache = &rc->backref_cache;
461 struct btrfs_path *path1; 654 struct btrfs_path *path1;
462 struct btrfs_path *path2; 655 struct btrfs_path *path2;
463 struct extent_buffer *eb; 656 struct extent_buffer *eb;
@@ -473,6 +666,8 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
473 unsigned long end; 666 unsigned long end;
474 unsigned long ptr; 667 unsigned long ptr;
475 LIST_HEAD(list); 668 LIST_HEAD(list);
669 LIST_HEAD(useless);
670 int cowonly;
476 int ret; 671 int ret;
477 int err = 0; 672 int err = 0;
478 673
@@ -483,15 +678,13 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
483 goto out; 678 goto out;
484 } 679 }
485 680
486 node = kmalloc(sizeof(*node), GFP_NOFS); 681 node = alloc_backref_node(cache);
487 if (!node) { 682 if (!node) {
488 err = -ENOMEM; 683 err = -ENOMEM;
489 goto out; 684 goto out;
490 } 685 }
491 686
492 backref_node_init(node);
493 node->bytenr = bytenr; 687 node->bytenr = bytenr;
494 node->owner = 0;
495 node->level = level; 688 node->level = level;
496 node->lowest = 1; 689 node->lowest = 1;
497 cur = node; 690 cur = node;
@@ -587,17 +780,20 @@ again:
587#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 780#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
588 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || 781 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
589 key.type == BTRFS_EXTENT_REF_V0_KEY) { 782 key.type == BTRFS_EXTENT_REF_V0_KEY) {
590 if (key.objectid == key.offset && 783 if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
591 key.type == BTRFS_EXTENT_REF_V0_KEY) {
592 struct btrfs_extent_ref_v0 *ref0; 784 struct btrfs_extent_ref_v0 *ref0;
593 ref0 = btrfs_item_ptr(eb, path1->slots[0], 785 ref0 = btrfs_item_ptr(eb, path1->slots[0],
594 struct btrfs_extent_ref_v0); 786 struct btrfs_extent_ref_v0);
595 root = find_tree_root(rc, eb, ref0); 787 root = find_tree_root(rc, eb, ref0);
596 if (root) 788 if (!root->ref_cows)
597 cur->root = root; 789 cur->cowonly = 1;
598 else 790 if (key.objectid == key.offset) {
599 cur->old_root = 1; 791 if (root && !should_ignore_root(root))
600 break; 792 cur->root = root;
793 else
794 list_add(&cur->list, &useless);
795 break;
796 }
601 } 797 }
602#else 798#else
603 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 799 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
@@ -614,22 +810,20 @@ again:
614 break; 810 break;
615 } 811 }
616 812
617 edge = kzalloc(sizeof(*edge), GFP_NOFS); 813 edge = alloc_backref_edge(cache);
618 if (!edge) { 814 if (!edge) {
619 err = -ENOMEM; 815 err = -ENOMEM;
620 goto out; 816 goto out;
621 } 817 }
622 rb_node = tree_search(&cache->rb_root, key.offset); 818 rb_node = tree_search(&cache->rb_root, key.offset);
623 if (!rb_node) { 819 if (!rb_node) {
624 upper = kmalloc(sizeof(*upper), GFP_NOFS); 820 upper = alloc_backref_node(cache);
625 if (!upper) { 821 if (!upper) {
626 kfree(edge); 822 free_backref_edge(cache, edge);
627 err = -ENOMEM; 823 err = -ENOMEM;
628 goto out; 824 goto out;
629 } 825 }
630 backref_node_init(upper);
631 upper->bytenr = key.offset; 826 upper->bytenr = key.offset;
632 upper->owner = 0;
633 upper->level = cur->level + 1; 827 upper->level = cur->level + 1;
634 /* 828 /*
635 * backrefs for the upper level block isn't 829 * backrefs for the upper level block isn't
@@ -639,11 +833,12 @@ again:
639 } else { 833 } else {
640 upper = rb_entry(rb_node, struct backref_node, 834 upper = rb_entry(rb_node, struct backref_node,
641 rb_node); 835 rb_node);
836 BUG_ON(!upper->checked);
642 INIT_LIST_HEAD(&edge->list[UPPER]); 837 INIT_LIST_HEAD(&edge->list[UPPER]);
643 } 838 }
644 list_add(&edge->list[LOWER], &cur->upper); 839 list_add_tail(&edge->list[LOWER], &cur->upper);
645 edge->node[UPPER] = upper;
646 edge->node[LOWER] = cur; 840 edge->node[LOWER] = cur;
841 edge->node[UPPER] = upper;
647 842
648 goto next; 843 goto next;
649 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { 844 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
@@ -657,11 +852,17 @@ again:
657 goto out; 852 goto out;
658 } 853 }
659 854
855 if (!root->ref_cows)
856 cur->cowonly = 1;
857
660 if (btrfs_root_level(&root->root_item) == cur->level) { 858 if (btrfs_root_level(&root->root_item) == cur->level) {
661 /* tree root */ 859 /* tree root */
662 BUG_ON(btrfs_root_bytenr(&root->root_item) != 860 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
663 cur->bytenr); 861 cur->bytenr);
664 cur->root = root; 862 if (should_ignore_root(root))
863 list_add(&cur->list, &useless);
864 else
865 cur->root = root;
665 break; 866 break;
666 } 867 }
667 868
@@ -692,11 +893,14 @@ again:
692 if (!path2->nodes[level]) { 893 if (!path2->nodes[level]) {
693 BUG_ON(btrfs_root_bytenr(&root->root_item) != 894 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
694 lower->bytenr); 895 lower->bytenr);
695 lower->root = root; 896 if (should_ignore_root(root))
897 list_add(&lower->list, &useless);
898 else
899 lower->root = root;
696 break; 900 break;
697 } 901 }
698 902
699 edge = kzalloc(sizeof(*edge), GFP_NOFS); 903 edge = alloc_backref_edge(cache);
700 if (!edge) { 904 if (!edge) {
701 err = -ENOMEM; 905 err = -ENOMEM;
702 goto out; 906 goto out;
@@ -705,16 +909,17 @@ again:
705 eb = path2->nodes[level]; 909 eb = path2->nodes[level];
706 rb_node = tree_search(&cache->rb_root, eb->start); 910 rb_node = tree_search(&cache->rb_root, eb->start);
707 if (!rb_node) { 911 if (!rb_node) {
708 upper = kmalloc(sizeof(*upper), GFP_NOFS); 912 upper = alloc_backref_node(cache);
709 if (!upper) { 913 if (!upper) {
710 kfree(edge); 914 free_backref_edge(cache, edge);
711 err = -ENOMEM; 915 err = -ENOMEM;
712 goto out; 916 goto out;
713 } 917 }
714 backref_node_init(upper);
715 upper->bytenr = eb->start; 918 upper->bytenr = eb->start;
716 upper->owner = btrfs_header_owner(eb); 919 upper->owner = btrfs_header_owner(eb);
717 upper->level = lower->level + 1; 920 upper->level = lower->level + 1;
921 if (!root->ref_cows)
922 upper->cowonly = 1;
718 923
719 /* 924 /*
720 * if we know the block isn't shared 925 * if we know the block isn't shared
@@ -744,10 +949,12 @@ again:
744 rb_node); 949 rb_node);
745 BUG_ON(!upper->checked); 950 BUG_ON(!upper->checked);
746 INIT_LIST_HEAD(&edge->list[UPPER]); 951 INIT_LIST_HEAD(&edge->list[UPPER]);
952 if (!upper->owner)
953 upper->owner = btrfs_header_owner(eb);
747 } 954 }
748 list_add_tail(&edge->list[LOWER], &lower->upper); 955 list_add_tail(&edge->list[LOWER], &lower->upper);
749 edge->node[UPPER] = upper;
750 edge->node[LOWER] = lower; 956 edge->node[LOWER] = lower;
957 edge->node[UPPER] = upper;
751 958
752 if (rb_node) 959 if (rb_node)
753 break; 960 break;
@@ -785,8 +992,13 @@ next:
785 * into the cache. 992 * into the cache.
786 */ 993 */
787 BUG_ON(!node->checked); 994 BUG_ON(!node->checked);
788 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); 995 cowonly = node->cowonly;
789 BUG_ON(rb_node); 996 if (!cowonly) {
997 rb_node = tree_insert(&cache->rb_root, node->bytenr,
998 &node->rb_node);
999 BUG_ON(rb_node);
1000 list_add_tail(&node->lower, &cache->leaves);
1001 }
790 1002
791 list_for_each_entry(edge, &node->upper, list[LOWER]) 1003 list_for_each_entry(edge, &node->upper, list[LOWER])
792 list_add_tail(&edge->list[UPPER], &list); 1004 list_add_tail(&edge->list[UPPER], &list);
@@ -795,6 +1007,14 @@ next:
795 edge = list_entry(list.next, struct backref_edge, list[UPPER]); 1007 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
796 list_del_init(&edge->list[UPPER]); 1008 list_del_init(&edge->list[UPPER]);
797 upper = edge->node[UPPER]; 1009 upper = edge->node[UPPER];
1010 if (upper->detached) {
1011 list_del(&edge->list[LOWER]);
1012 lower = edge->node[LOWER];
1013 free_backref_edge(cache, edge);
1014 if (list_empty(&lower->upper))
1015 list_add(&lower->list, &useless);
1016 continue;
1017 }
798 1018
799 if (!RB_EMPTY_NODE(&upper->rb_node)) { 1019 if (!RB_EMPTY_NODE(&upper->rb_node)) {
800 if (upper->lowest) { 1020 if (upper->lowest) {
@@ -807,25 +1027,69 @@ next:
807 } 1027 }
808 1028
809 BUG_ON(!upper->checked); 1029 BUG_ON(!upper->checked);
810 rb_node = tree_insert(&cache->rb_root, upper->bytenr, 1030 BUG_ON(cowonly != upper->cowonly);
811 &upper->rb_node); 1031 if (!cowonly) {
812 BUG_ON(rb_node); 1032 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1033 &upper->rb_node);
1034 BUG_ON(rb_node);
1035 }
813 1036
814 list_add_tail(&edge->list[UPPER], &upper->lower); 1037 list_add_tail(&edge->list[UPPER], &upper->lower);
815 1038
816 list_for_each_entry(edge, &upper->upper, list[LOWER]) 1039 list_for_each_entry(edge, &upper->upper, list[LOWER])
817 list_add_tail(&edge->list[UPPER], &list); 1040 list_add_tail(&edge->list[UPPER], &list);
818 } 1041 }
1042 /*
1043 * process useless backref nodes. backref nodes for tree leaves
1044 * are deleted from the cache. backref nodes for upper level
1045 * tree blocks are left in the cache to avoid unnecessary backref
1046 * lookup.
1047 */
1048 while (!list_empty(&useless)) {
1049 upper = list_entry(useless.next, struct backref_node, list);
1050 list_del_init(&upper->list);
1051 BUG_ON(!list_empty(&upper->upper));
1052 if (upper == node)
1053 node = NULL;
1054 if (upper->lowest) {
1055 list_del_init(&upper->lower);
1056 upper->lowest = 0;
1057 }
1058 while (!list_empty(&upper->lower)) {
1059 edge = list_entry(upper->lower.next,
1060 struct backref_edge, list[UPPER]);
1061 list_del(&edge->list[UPPER]);
1062 list_del(&edge->list[LOWER]);
1063 lower = edge->node[LOWER];
1064 free_backref_edge(cache, edge);
1065
1066 if (list_empty(&lower->upper))
1067 list_add(&lower->list, &useless);
1068 }
1069 __mark_block_processed(rc, upper);
1070 if (upper->level > 0) {
1071 list_add(&upper->list, &cache->detached);
1072 upper->detached = 1;
1073 } else {
1074 rb_erase(&upper->rb_node, &cache->rb_root);
1075 free_backref_node(cache, upper);
1076 }
1077 }
819out: 1078out:
820 btrfs_free_path(path1); 1079 btrfs_free_path(path1);
821 btrfs_free_path(path2); 1080 btrfs_free_path(path2);
822 if (err) { 1081 if (err) {
823 INIT_LIST_HEAD(&list); 1082 while (!list_empty(&useless)) {
1083 lower = list_entry(useless.next,
1084 struct backref_node, upper);
1085 list_del_init(&lower->upper);
1086 }
824 upper = node; 1087 upper = node;
1088 INIT_LIST_HEAD(&list);
825 while (upper) { 1089 while (upper) {
826 if (RB_EMPTY_NODE(&upper->rb_node)) { 1090 if (RB_EMPTY_NODE(&upper->rb_node)) {
827 list_splice_tail(&upper->upper, &list); 1091 list_splice_tail(&upper->upper, &list);
828 kfree(upper); 1092 free_backref_node(cache, upper);
829 } 1093 }
830 1094
831 if (list_empty(&list)) 1095 if (list_empty(&list))
@@ -833,15 +1097,104 @@ out:
833 1097
834 edge = list_entry(list.next, struct backref_edge, 1098 edge = list_entry(list.next, struct backref_edge,
835 list[LOWER]); 1099 list[LOWER]);
1100 list_del(&edge->list[LOWER]);
836 upper = edge->node[UPPER]; 1101 upper = edge->node[UPPER];
837 kfree(edge); 1102 free_backref_edge(cache, edge);
838 } 1103 }
839 return ERR_PTR(err); 1104 return ERR_PTR(err);
840 } 1105 }
1106 BUG_ON(node && node->detached);
841 return node; 1107 return node;
842} 1108}
843 1109
844/* 1110/*
1111 * helper to add backref node for the newly created snapshot.
1112 * the backref node is created by cloning backref node that
1113 * corresponds to root of source tree
1114 */
1115static int clone_backref_node(struct btrfs_trans_handle *trans,
1116 struct reloc_control *rc,
1117 struct btrfs_root *src,
1118 struct btrfs_root *dest)
1119{
1120 struct btrfs_root *reloc_root = src->reloc_root;
1121 struct backref_cache *cache = &rc->backref_cache;
1122 struct backref_node *node = NULL;
1123 struct backref_node *new_node;
1124 struct backref_edge *edge;
1125 struct backref_edge *new_edge;
1126 struct rb_node *rb_node;
1127
1128 if (cache->last_trans > 0)
1129 update_backref_cache(trans, cache);
1130
1131 rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1132 if (rb_node) {
1133 node = rb_entry(rb_node, struct backref_node, rb_node);
1134 if (node->detached)
1135 node = NULL;
1136 else
1137 BUG_ON(node->new_bytenr != reloc_root->node->start);
1138 }
1139
1140 if (!node) {
1141 rb_node = tree_search(&cache->rb_root,
1142 reloc_root->commit_root->start);
1143 if (rb_node) {
1144 node = rb_entry(rb_node, struct backref_node,
1145 rb_node);
1146 BUG_ON(node->detached);
1147 }
1148 }
1149
1150 if (!node)
1151 return 0;
1152
1153 new_node = alloc_backref_node(cache);
1154 if (!new_node)
1155 return -ENOMEM;
1156
1157 new_node->bytenr = dest->node->start;
1158 new_node->level = node->level;
1159 new_node->lowest = node->lowest;
1160 new_node->root = dest;
1161
1162 if (!node->lowest) {
1163 list_for_each_entry(edge, &node->lower, list[UPPER]) {
1164 new_edge = alloc_backref_edge(cache);
1165 if (!new_edge)
1166 goto fail;
1167
1168 new_edge->node[UPPER] = new_node;
1169 new_edge->node[LOWER] = edge->node[LOWER];
1170 list_add_tail(&new_edge->list[UPPER],
1171 &new_node->lower);
1172 }
1173 }
1174
1175 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1176 &new_node->rb_node);
1177 BUG_ON(rb_node);
1178
1179 if (!new_node->lowest) {
1180 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1181 list_add_tail(&new_edge->list[LOWER],
1182 &new_edge->node[LOWER]->upper);
1183 }
1184 }
1185 return 0;
1186fail:
1187 while (!list_empty(&new_node->lower)) {
1188 new_edge = list_entry(new_node->lower.next,
1189 struct backref_edge, list[UPPER]);
1190 list_del(&new_edge->list[UPPER]);
1191 free_backref_edge(cache, new_edge);
1192 }
1193 free_backref_node(cache, new_node);
1194 return -ENOMEM;
1195}
1196
1197/*
845 * helper to add 'address of tree root -> reloc tree' mapping 1198 * helper to add 'address of tree root -> reloc tree' mapping
846 */ 1199 */
847static int __add_reloc_root(struct btrfs_root *root) 1200static int __add_reloc_root(struct btrfs_root *root)
@@ -901,12 +1254,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
901 return 0; 1254 return 0;
902} 1255}
903 1256
904/* 1257static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
905 * create reloc tree for a given fs tree. reloc tree is just a 1258 struct btrfs_root *root, u64 objectid)
906 * snapshot of the fs tree with special root objectid.
907 */
908int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
909 struct btrfs_root *root)
910{ 1259{
911 struct btrfs_root *reloc_root; 1260 struct btrfs_root *reloc_root;
912 struct extent_buffer *eb; 1261 struct extent_buffer *eb;
@@ -914,36 +1263,45 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
914 struct btrfs_key root_key; 1263 struct btrfs_key root_key;
915 int ret; 1264 int ret;
916 1265
917 if (root->reloc_root) {
918 reloc_root = root->reloc_root;
919 reloc_root->last_trans = trans->transid;
920 return 0;
921 }
922
923 if (!root->fs_info->reloc_ctl ||
924 !root->fs_info->reloc_ctl->create_reloc_root ||
925 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
926 return 0;
927
928 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 1266 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
929 BUG_ON(!root_item); 1267 BUG_ON(!root_item);
930 1268
931 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 1269 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
932 root_key.type = BTRFS_ROOT_ITEM_KEY; 1270 root_key.type = BTRFS_ROOT_ITEM_KEY;
933 root_key.offset = root->root_key.objectid; 1271 root_key.offset = objectid;
934 1272
935 ret = btrfs_copy_root(trans, root, root->commit_root, &eb, 1273 if (root->root_key.objectid == objectid) {
936 BTRFS_TREE_RELOC_OBJECTID); 1274 /* called by btrfs_init_reloc_root */
937 BUG_ON(ret); 1275 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1276 BTRFS_TREE_RELOC_OBJECTID);
1277 BUG_ON(ret);
1278
1279 btrfs_set_root_last_snapshot(&root->root_item,
1280 trans->transid - 1);
1281 } else {
1282 /*
1283 * called by btrfs_reloc_post_snapshot_hook.
1284 * the source tree is a reloc tree, all tree blocks
1285 * modified after it was created have RELOC flag
1286 * set in their headers. so it's OK to not update
1287 * the 'last_snapshot'.
1288 */
1289 ret = btrfs_copy_root(trans, root, root->node, &eb,
1290 BTRFS_TREE_RELOC_OBJECTID);
1291 BUG_ON(ret);
1292 }
938 1293
939 btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
940 memcpy(root_item, &root->root_item, sizeof(*root_item)); 1294 memcpy(root_item, &root->root_item, sizeof(*root_item));
941 btrfs_set_root_refs(root_item, 1);
942 btrfs_set_root_bytenr(root_item, eb->start); 1295 btrfs_set_root_bytenr(root_item, eb->start);
943 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 1296 btrfs_set_root_level(root_item, btrfs_header_level(eb));
944 btrfs_set_root_generation(root_item, trans->transid); 1297 btrfs_set_root_generation(root_item, trans->transid);
945 memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); 1298
946 root_item->drop_level = 0; 1299 if (root->root_key.objectid == objectid) {
1300 btrfs_set_root_refs(root_item, 0);
1301 memset(&root_item->drop_progress, 0,
1302 sizeof(struct btrfs_disk_key));
1303 root_item->drop_level = 0;
1304 }
947 1305
948 btrfs_tree_unlock(eb); 1306 btrfs_tree_unlock(eb);
949 free_extent_buffer(eb); 1307 free_extent_buffer(eb);
@@ -957,6 +1315,37 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
957 &root_key); 1315 &root_key);
958 BUG_ON(IS_ERR(reloc_root)); 1316 BUG_ON(IS_ERR(reloc_root));
959 reloc_root->last_trans = trans->transid; 1317 reloc_root->last_trans = trans->transid;
1318 return reloc_root;
1319}
1320
1321/*
1322 * create reloc tree for a given fs tree. reloc tree is just a
1323 * snapshot of the fs tree with special root objectid.
1324 */
1325int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1326 struct btrfs_root *root)
1327{
1328 struct btrfs_root *reloc_root;
1329 struct reloc_control *rc = root->fs_info->reloc_ctl;
1330 int clear_rsv = 0;
1331
1332 if (root->reloc_root) {
1333 reloc_root = root->reloc_root;
1334 reloc_root->last_trans = trans->transid;
1335 return 0;
1336 }
1337
1338 if (!rc || !rc->create_reloc_tree ||
1339 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1340 return 0;
1341
1342 if (!trans->block_rsv) {
1343 trans->block_rsv = rc->block_rsv;
1344 clear_rsv = 1;
1345 }
1346 reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1347 if (clear_rsv)
1348 trans->block_rsv = NULL;
960 1349
961 __add_reloc_root(reloc_root); 1350 __add_reloc_root(reloc_root);
962 root->reloc_root = reloc_root; 1351 root->reloc_root = reloc_root;
@@ -980,7 +1369,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
980 reloc_root = root->reloc_root; 1369 reloc_root = root->reloc_root;
981 root_item = &reloc_root->root_item; 1370 root_item = &reloc_root->root_item;
982 1371
983 if (btrfs_root_refs(root_item) == 0) { 1372 if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1373 btrfs_root_refs(root_item) == 0) {
984 root->reloc_root = NULL; 1374 root->reloc_root = NULL;
985 del = 1; 1375 del = 1;
986 } 1376 }
@@ -1102,8 +1492,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1102 goto out; 1492 goto out;
1103 } 1493 }
1104 1494
1105 if (new_bytenr) 1495 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1106 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1107 ret = 0; 1496 ret = 0;
1108out: 1497out:
1109 btrfs_free_path(path); 1498 btrfs_free_path(path);
@@ -1114,19 +1503,18 @@ out:
1114 * update file extent items in the tree leaf to point to 1503 * update file extent items in the tree leaf to point to
1115 * the new locations. 1504 * the new locations.
1116 */ 1505 */
1117static int replace_file_extents(struct btrfs_trans_handle *trans, 1506static noinline_for_stack
1118 struct reloc_control *rc, 1507int replace_file_extents(struct btrfs_trans_handle *trans,
1119 struct btrfs_root *root, 1508 struct reloc_control *rc,
1120 struct extent_buffer *leaf, 1509 struct btrfs_root *root,
1121 struct list_head *inode_list) 1510 struct extent_buffer *leaf)
1122{ 1511{
1123 struct btrfs_key key; 1512 struct btrfs_key key;
1124 struct btrfs_file_extent_item *fi; 1513 struct btrfs_file_extent_item *fi;
1125 struct inode *inode = NULL; 1514 struct inode *inode = NULL;
1126 struct inodevec *ivec = NULL;
1127 u64 parent; 1515 u64 parent;
1128 u64 bytenr; 1516 u64 bytenr;
1129 u64 new_bytenr; 1517 u64 new_bytenr = 0;
1130 u64 num_bytes; 1518 u64 num_bytes;
1131 u64 end; 1519 u64 end;
1132 u32 nritems; 1520 u32 nritems;
@@ -1166,21 +1554,12 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1166 * to complete and drop the extent cache 1554 * to complete and drop the extent cache
1167 */ 1555 */
1168 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 1556 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1169 if (!ivec || ivec->nr == INODEVEC_SIZE) {
1170 ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1171 BUG_ON(!ivec);
1172 ivec->nr = 0;
1173 list_add_tail(&ivec->list, inode_list);
1174 }
1175 if (first) { 1557 if (first) {
1176 inode = find_next_inode(root, key.objectid); 1558 inode = find_next_inode(root, key.objectid);
1177 if (inode)
1178 ivec->inode[ivec->nr++] = inode;
1179 first = 0; 1559 first = 0;
1180 } else if (inode && inode->i_ino < key.objectid) { 1560 } else if (inode && inode->i_ino < key.objectid) {
1561 btrfs_add_delayed_iput(inode);
1181 inode = find_next_inode(root, key.objectid); 1562 inode = find_next_inode(root, key.objectid);
1182 if (inode)
1183 ivec->inode[ivec->nr++] = inode;
1184 } 1563 }
1185 if (inode && inode->i_ino == key.objectid) { 1564 if (inode && inode->i_ino == key.objectid) {
1186 end = key.offset + 1565 end = key.offset +
@@ -1204,8 +1583,10 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1204 1583
1205 ret = get_new_location(rc->data_inode, &new_bytenr, 1584 ret = get_new_location(rc->data_inode, &new_bytenr,
1206 bytenr, num_bytes); 1585 bytenr, num_bytes);
1207 if (ret > 0) 1586 if (ret > 0) {
1587 WARN_ON(1);
1208 continue; 1588 continue;
1589 }
1209 BUG_ON(ret < 0); 1590 BUG_ON(ret < 0);
1210 1591
1211 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); 1592 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
@@ -1225,6 +1606,8 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1225 } 1606 }
1226 if (dirty) 1607 if (dirty)
1227 btrfs_mark_buffer_dirty(leaf); 1608 btrfs_mark_buffer_dirty(leaf);
1609 if (inode)
1610 btrfs_add_delayed_iput(inode);
1228 return 0; 1611 return 0;
1229} 1612}
1230 1613
@@ -1248,11 +1631,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
1248 * if no block got replaced, 0 is returned. if there are other 1631 * if no block got replaced, 0 is returned. if there are other
1249 * errors, a negative error number is returned. 1632 * errors, a negative error number is returned.
1250 */ 1633 */
1251static int replace_path(struct btrfs_trans_handle *trans, 1634static noinline_for_stack
1252 struct btrfs_root *dest, struct btrfs_root *src, 1635int replace_path(struct btrfs_trans_handle *trans,
1253 struct btrfs_path *path, struct btrfs_key *next_key, 1636 struct btrfs_root *dest, struct btrfs_root *src,
1254 struct extent_buffer **leaf, 1637 struct btrfs_path *path, struct btrfs_key *next_key,
1255 int lowest_level, int max_level) 1638 int lowest_level, int max_level)
1256{ 1639{
1257 struct extent_buffer *eb; 1640 struct extent_buffer *eb;
1258 struct extent_buffer *parent; 1641 struct extent_buffer *parent;
@@ -1263,16 +1646,16 @@ static int replace_path(struct btrfs_trans_handle *trans,
1263 u64 new_ptr_gen; 1646 u64 new_ptr_gen;
1264 u64 last_snapshot; 1647 u64 last_snapshot;
1265 u32 blocksize; 1648 u32 blocksize;
1649 int cow = 0;
1266 int level; 1650 int level;
1267 int ret; 1651 int ret;
1268 int slot; 1652 int slot;
1269 1653
1270 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 1654 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1271 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); 1655 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1272 BUG_ON(lowest_level > 1 && leaf);
1273 1656
1274 last_snapshot = btrfs_root_last_snapshot(&src->root_item); 1657 last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1275 1658again:
1276 slot = path->slots[lowest_level]; 1659 slot = path->slots[lowest_level];
1277 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); 1660 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1278 1661
@@ -1286,8 +1669,10 @@ static int replace_path(struct btrfs_trans_handle *trans,
1286 return 0; 1669 return 0;
1287 } 1670 }
1288 1671
1289 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); 1672 if (cow) {
1290 BUG_ON(ret); 1673 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1674 BUG_ON(ret);
1675 }
1291 btrfs_set_lock_blocking(eb); 1676 btrfs_set_lock_blocking(eb);
1292 1677
1293 if (next_key) { 1678 if (next_key) {
@@ -1331,7 +1716,7 @@ static int replace_path(struct btrfs_trans_handle *trans,
1331 1716
1332 if (new_bytenr == 0 || old_ptr_gen > last_snapshot || 1717 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1333 memcmp_node_keys(parent, slot, path, level)) { 1718 memcmp_node_keys(parent, slot, path, level)) {
1334 if (level <= lowest_level && !leaf) { 1719 if (level <= lowest_level) {
1335 ret = 0; 1720 ret = 0;
1336 break; 1721 break;
1337 } 1722 }
@@ -1339,16 +1724,12 @@ static int replace_path(struct btrfs_trans_handle *trans,
1339 eb = read_tree_block(dest, old_bytenr, blocksize, 1724 eb = read_tree_block(dest, old_bytenr, blocksize,
1340 old_ptr_gen); 1725 old_ptr_gen);
1341 btrfs_tree_lock(eb); 1726 btrfs_tree_lock(eb);
1342 ret = btrfs_cow_block(trans, dest, eb, parent, 1727 if (cow) {
1343 slot, &eb); 1728 ret = btrfs_cow_block(trans, dest, eb, parent,
1344 BUG_ON(ret); 1729 slot, &eb);
1345 btrfs_set_lock_blocking(eb); 1730 BUG_ON(ret);
1346
1347 if (level <= lowest_level) {
1348 *leaf = eb;
1349 ret = 0;
1350 break;
1351 } 1731 }
1732 btrfs_set_lock_blocking(eb);
1352 1733
1353 btrfs_tree_unlock(parent); 1734 btrfs_tree_unlock(parent);
1354 free_extent_buffer(parent); 1735 free_extent_buffer(parent);
@@ -1357,6 +1738,13 @@ static int replace_path(struct btrfs_trans_handle *trans,
1357 continue; 1738 continue;
1358 } 1739 }
1359 1740
1741 if (!cow) {
1742 btrfs_tree_unlock(parent);
1743 free_extent_buffer(parent);
1744 cow = 1;
1745 goto again;
1746 }
1747
1360 btrfs_node_key_to_cpu(path->nodes[level], &key, 1748 btrfs_node_key_to_cpu(path->nodes[level], &key,
1361 path->slots[level]); 1749 path->slots[level]);
1362 btrfs_release_path(src, path); 1750 btrfs_release_path(src, path);
@@ -1562,20 +1950,6 @@ static int invalidate_extent_cache(struct btrfs_root *root,
1562 return 0; 1950 return 0;
1563} 1951}
1564 1952
1565static void put_inodes(struct list_head *list)
1566{
1567 struct inodevec *ivec;
1568 while (!list_empty(list)) {
1569 ivec = list_entry(list->next, struct inodevec, list);
1570 list_del(&ivec->list);
1571 while (ivec->nr > 0) {
1572 ivec->nr--;
1573 iput(ivec->inode[ivec->nr]);
1574 }
1575 kfree(ivec);
1576 }
1577}
1578
1579static int find_next_key(struct btrfs_path *path, int level, 1953static int find_next_key(struct btrfs_path *path, int level,
1580 struct btrfs_key *key) 1954 struct btrfs_key *key)
1581 1955
@@ -1608,13 +1982,14 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1608 struct btrfs_root *reloc_root; 1982 struct btrfs_root *reloc_root;
1609 struct btrfs_root_item *root_item; 1983 struct btrfs_root_item *root_item;
1610 struct btrfs_path *path; 1984 struct btrfs_path *path;
1611 struct extent_buffer *leaf = NULL; 1985 struct extent_buffer *leaf;
1612 unsigned long nr; 1986 unsigned long nr;
1613 int level; 1987 int level;
1614 int max_level; 1988 int max_level;
1615 int replaced = 0; 1989 int replaced = 0;
1616 int ret; 1990 int ret;
1617 int err = 0; 1991 int err = 0;
1992 u32 min_reserved;
1618 1993
1619 path = btrfs_alloc_path(); 1994 path = btrfs_alloc_path();
1620 if (!path) 1995 if (!path)
@@ -1648,34 +2023,23 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1648 btrfs_unlock_up_safe(path, 0); 2023 btrfs_unlock_up_safe(path, 0);
1649 } 2024 }
1650 2025
1651 if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { 2026 min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
1652 trans = btrfs_start_transaction(root, 1); 2027 memset(&next_key, 0, sizeof(next_key));
1653 2028
1654 leaf = path->nodes[0]; 2029 while (1) {
1655 btrfs_item_key_to_cpu(leaf, &key, 0); 2030 trans = btrfs_start_transaction(root, 0);
1656 btrfs_release_path(reloc_root, path); 2031 trans->block_rsv = rc->block_rsv;
1657 2032
1658 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2033 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
1659 if (ret < 0) { 2034 min_reserved, 0);
1660 err = ret; 2035 if (ret) {
1661 goto out; 2036 BUG_ON(ret != -EAGAIN);
2037 ret = btrfs_commit_transaction(trans, root);
2038 BUG_ON(ret);
2039 continue;
1662 } 2040 }
1663 2041
1664 leaf = path->nodes[0];
1665 btrfs_unlock_up_safe(path, 1);
1666 ret = replace_file_extents(trans, rc, root, leaf,
1667 &inode_list);
1668 if (ret < 0)
1669 err = ret;
1670 goto out;
1671 }
1672
1673 memset(&next_key, 0, sizeof(next_key));
1674
1675 while (1) {
1676 leaf = NULL;
1677 replaced = 0; 2042 replaced = 0;
1678 trans = btrfs_start_transaction(root, 1);
1679 max_level = level; 2043 max_level = level;
1680 2044
1681 ret = walk_down_reloc_tree(reloc_root, path, &level); 2045 ret = walk_down_reloc_tree(reloc_root, path, &level);
@@ -1689,14 +2053,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1689 if (!find_next_key(path, level, &key) && 2053 if (!find_next_key(path, level, &key) &&
1690 btrfs_comp_cpu_keys(&next_key, &key) >= 0) { 2054 btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1691 ret = 0; 2055 ret = 0;
1692 } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1693 ret = replace_path(trans, root, reloc_root,
1694 path, &next_key, &leaf,
1695 level, max_level);
1696 } else { 2056 } else {
1697 ret = replace_path(trans, root, reloc_root, 2057 ret = replace_path(trans, root, reloc_root, path,
1698 path, &next_key, NULL, 2058 &next_key, level, max_level);
1699 level, max_level);
1700 } 2059 }
1701 if (ret < 0) { 2060 if (ret < 0) {
1702 err = ret; 2061 err = ret;
@@ -1708,16 +2067,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1708 btrfs_node_key_to_cpu(path->nodes[level], &key, 2067 btrfs_node_key_to_cpu(path->nodes[level], &key,
1709 path->slots[level]); 2068 path->slots[level]);
1710 replaced = 1; 2069 replaced = 1;
1711 } else if (leaf) {
1712 /*
1713 * no block got replaced, try replacing file extents
1714 */
1715 btrfs_item_key_to_cpu(leaf, &key, 0);
1716 ret = replace_file_extents(trans, rc, root, leaf,
1717 &inode_list);
1718 btrfs_tree_unlock(leaf);
1719 free_extent_buffer(leaf);
1720 BUG_ON(ret < 0);
1721 } 2070 }
1722 2071
1723 ret = walk_up_reloc_tree(reloc_root, path, &level); 2072 ret = walk_up_reloc_tree(reloc_root, path, &level);
@@ -1734,15 +2083,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1734 root_item->drop_level = level; 2083 root_item->drop_level = level;
1735 2084
1736 nr = trans->blocks_used; 2085 nr = trans->blocks_used;
1737 btrfs_end_transaction(trans, root); 2086 btrfs_end_transaction_throttle(trans, root);
1738 2087
1739 btrfs_btree_balance_dirty(root, nr); 2088 btrfs_btree_balance_dirty(root, nr);
1740 2089
1741 /*
1742 * put inodes outside transaction, otherwise we may deadlock.
1743 */
1744 put_inodes(&inode_list);
1745
1746 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2090 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1747 invalidate_extent_cache(root, &key, &next_key); 2091 invalidate_extent_cache(root, &key, &next_key);
1748 } 2092 }
@@ -1765,87 +2109,125 @@ out:
1765 sizeof(root_item->drop_progress)); 2109 sizeof(root_item->drop_progress));
1766 root_item->drop_level = 0; 2110 root_item->drop_level = 0;
1767 btrfs_set_root_refs(root_item, 0); 2111 btrfs_set_root_refs(root_item, 0);
2112 btrfs_update_reloc_root(trans, root);
1768 } 2113 }
1769 2114
1770 nr = trans->blocks_used; 2115 nr = trans->blocks_used;
1771 btrfs_end_transaction(trans, root); 2116 btrfs_end_transaction_throttle(trans, root);
1772 2117
1773 btrfs_btree_balance_dirty(root, nr); 2118 btrfs_btree_balance_dirty(root, nr);
1774 2119
1775 put_inodes(&inode_list);
1776
1777 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2120 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1778 invalidate_extent_cache(root, &key, &next_key); 2121 invalidate_extent_cache(root, &key, &next_key);
1779 2122
1780 return err; 2123 return err;
1781} 2124}
1782 2125
1783/* 2126static noinline_for_stack
1784 * callback for the work threads. 2127int prepare_to_merge(struct reloc_control *rc, int err)
1785 * this function merges reloc tree with corresponding fs tree,
1786 * and then drops the reloc tree.
1787 */
1788static void merge_func(struct btrfs_work *work)
1789{ 2128{
1790 struct btrfs_trans_handle *trans; 2129 struct btrfs_root *root = rc->extent_root;
1791 struct btrfs_root *root;
1792 struct btrfs_root *reloc_root; 2130 struct btrfs_root *reloc_root;
1793 struct async_merge *async; 2131 struct btrfs_trans_handle *trans;
2132 LIST_HEAD(reloc_roots);
2133 u64 num_bytes = 0;
2134 int ret;
2135 int retries = 0;
2136
2137 mutex_lock(&root->fs_info->trans_mutex);
2138 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2139 rc->merging_rsv_size += rc->nodes_relocated * 2;
2140 mutex_unlock(&root->fs_info->trans_mutex);
2141again:
2142 if (!err) {
2143 num_bytes = rc->merging_rsv_size;
2144 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
2145 num_bytes, &retries);
2146 if (ret)
2147 err = ret;
2148 }
2149
2150 trans = btrfs_join_transaction(rc->extent_root, 1);
2151
2152 if (!err) {
2153 if (num_bytes != rc->merging_rsv_size) {
2154 btrfs_end_transaction(trans, rc->extent_root);
2155 btrfs_block_rsv_release(rc->extent_root,
2156 rc->block_rsv, num_bytes);
2157 retries = 0;
2158 goto again;
2159 }
2160 }
1794 2161
1795 async = container_of(work, struct async_merge, work); 2162 rc->merge_reloc_tree = 1;
1796 reloc_root = async->root; 2163
2164 while (!list_empty(&rc->reloc_roots)) {
2165 reloc_root = list_entry(rc->reloc_roots.next,
2166 struct btrfs_root, root_list);
2167 list_del_init(&reloc_root->root_list);
1797 2168
1798 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1799 root = read_fs_root(reloc_root->fs_info, 2169 root = read_fs_root(reloc_root->fs_info,
1800 reloc_root->root_key.offset); 2170 reloc_root->root_key.offset);
1801 BUG_ON(IS_ERR(root)); 2171 BUG_ON(IS_ERR(root));
1802 BUG_ON(root->reloc_root != reloc_root); 2172 BUG_ON(root->reloc_root != reloc_root);
1803 2173
1804 merge_reloc_root(async->rc, root); 2174 /*
1805 2175 * set reference count to 1, so btrfs_recover_relocation
1806 trans = btrfs_start_transaction(root, 1); 2176 * knows it should resumes merging
2177 */
2178 if (!err)
2179 btrfs_set_root_refs(&reloc_root->root_item, 1);
1807 btrfs_update_reloc_root(trans, root); 2180 btrfs_update_reloc_root(trans, root);
1808 btrfs_end_transaction(trans, root);
1809 }
1810 2181
1811 btrfs_drop_snapshot(reloc_root, 0); 2182 list_add(&reloc_root->root_list, &reloc_roots);
2183 }
1812 2184
1813 if (atomic_dec_and_test(async->num_pending)) 2185 list_splice(&reloc_roots, &rc->reloc_roots);
1814 complete(async->done);
1815 2186
1816 kfree(async); 2187 if (!err)
2188 btrfs_commit_transaction(trans, rc->extent_root);
2189 else
2190 btrfs_end_transaction(trans, rc->extent_root);
2191 return err;
1817} 2192}
1818 2193
1819static int merge_reloc_roots(struct reloc_control *rc) 2194static noinline_for_stack
2195int merge_reloc_roots(struct reloc_control *rc)
1820{ 2196{
1821 struct async_merge *async;
1822 struct btrfs_root *root; 2197 struct btrfs_root *root;
1823 struct completion done; 2198 struct btrfs_root *reloc_root;
1824 atomic_t num_pending; 2199 LIST_HEAD(reloc_roots);
2200 int found = 0;
2201 int ret;
2202again:
2203 root = rc->extent_root;
2204 mutex_lock(&root->fs_info->trans_mutex);
2205 list_splice_init(&rc->reloc_roots, &reloc_roots);
2206 mutex_unlock(&root->fs_info->trans_mutex);
1825 2207
1826 init_completion(&done); 2208 while (!list_empty(&reloc_roots)) {
1827 atomic_set(&num_pending, 1); 2209 found = 1;
2210 reloc_root = list_entry(reloc_roots.next,
2211 struct btrfs_root, root_list);
1828 2212
1829 while (!list_empty(&rc->reloc_roots)) { 2213 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1830 root = list_entry(rc->reloc_roots.next, 2214 root = read_fs_root(reloc_root->fs_info,
1831 struct btrfs_root, root_list); 2215 reloc_root->root_key.offset);
1832 list_del_init(&root->root_list); 2216 BUG_ON(IS_ERR(root));
2217 BUG_ON(root->reloc_root != reloc_root);
1833 2218
1834 async = kmalloc(sizeof(*async), GFP_NOFS); 2219 ret = merge_reloc_root(rc, root);
1835 BUG_ON(!async); 2220 BUG_ON(ret);
1836 async->work.func = merge_func; 2221 } else {
1837 async->work.flags = 0; 2222 list_del_init(&reloc_root->root_list);
1838 async->rc = rc; 2223 }
1839 async->root = root; 2224 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
1840 async->done = &done;
1841 async->num_pending = &num_pending;
1842 atomic_inc(&num_pending);
1843 btrfs_queue_worker(&rc->workers, &async->work);
1844 } 2225 }
1845 2226
1846 if (!atomic_dec_and_test(&num_pending)) 2227 if (found) {
1847 wait_for_completion(&done); 2228 found = 0;
1848 2229 goto again;
2230 }
1849 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2231 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1850 return 0; 2232 return 0;
1851} 2233}
@@ -1876,119 +2258,169 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1876 return btrfs_record_root_in_trans(trans, root); 2258 return btrfs_record_root_in_trans(trans, root);
1877} 2259}
1878 2260
1879/* 2261static noinline_for_stack
1880 * select one tree from trees that references the block. 2262struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1881 * for blocks in refernce counted trees, we preper reloc tree. 2263 struct reloc_control *rc,
1882 * if no reloc tree found and reloc_only is true, NULL is returned. 2264 struct backref_node *node,
1883 */ 2265 struct backref_edge *edges[], int *nr)
1884static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1885 struct backref_node *node,
1886 struct backref_edge *edges[],
1887 int *nr, int reloc_only)
1888{ 2266{
1889 struct backref_node *next; 2267 struct backref_node *next;
1890 struct btrfs_root *root; 2268 struct btrfs_root *root;
1891 int index; 2269 int index = 0;
1892 int loop = 0; 2270
1893again:
1894 index = 0;
1895 next = node; 2271 next = node;
1896 while (1) { 2272 while (1) {
1897 cond_resched(); 2273 cond_resched();
1898 next = walk_up_backref(next, edges, &index); 2274 next = walk_up_backref(next, edges, &index);
1899 root = next->root; 2275 root = next->root;
1900 if (!root) { 2276 BUG_ON(!root);
1901 BUG_ON(!node->old_root); 2277 BUG_ON(!root->ref_cows);
1902 goto skip;
1903 }
1904
1905 /* no other choice for non-refernce counted tree */
1906 if (!root->ref_cows) {
1907 BUG_ON(reloc_only);
1908 break;
1909 }
1910 2278
1911 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2279 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1912 record_reloc_root_in_trans(trans, root); 2280 record_reloc_root_in_trans(trans, root);
1913 break; 2281 break;
1914 } 2282 }
1915 2283
1916 if (loop) { 2284 btrfs_record_root_in_trans(trans, root);
1917 btrfs_record_root_in_trans(trans, root); 2285 root = root->reloc_root;
2286
2287 if (next->new_bytenr != root->node->start) {
2288 BUG_ON(next->new_bytenr);
2289 BUG_ON(!list_empty(&next->list));
2290 next->new_bytenr = root->node->start;
2291 next->root = root;
2292 list_add_tail(&next->list,
2293 &rc->backref_cache.changed);
2294 __mark_block_processed(rc, next);
1918 break; 2295 break;
1919 } 2296 }
1920 2297
1921 if (reloc_only || next != node) { 2298 WARN_ON(1);
1922 if (!root->reloc_root)
1923 btrfs_record_root_in_trans(trans, root);
1924 root = root->reloc_root;
1925 /*
1926 * if the reloc tree was created in current
1927 * transation, there is no node in backref tree
1928 * corresponds to the root of the reloc tree.
1929 */
1930 if (btrfs_root_last_snapshot(&root->root_item) ==
1931 trans->transid - 1)
1932 break;
1933 }
1934skip:
1935 root = NULL; 2299 root = NULL;
1936 next = walk_down_backref(edges, &index); 2300 next = walk_down_backref(edges, &index);
1937 if (!next || next->level <= node->level) 2301 if (!next || next->level <= node->level)
1938 break; 2302 break;
1939 } 2303 }
2304 if (!root)
2305 return NULL;
1940 2306
1941 if (!root && !loop && !reloc_only) { 2307 *nr = index;
1942 loop = 1; 2308 next = node;
1943 goto again; 2309 /* setup backref node path for btrfs_reloc_cow_block */
2310 while (1) {
2311 rc->backref_cache.path[next->level] = next;
2312 if (--index < 0)
2313 break;
2314 next = edges[index]->node[UPPER];
1944 } 2315 }
1945
1946 if (root)
1947 *nr = index;
1948 else
1949 *nr = 0;
1950
1951 return root; 2316 return root;
1952} 2317}
1953 2318
2319/*
2320 * select a tree root for relocation. return NULL if the block
2321 * is reference counted. we should use do_relocation() in this
2322 * case. return a tree root pointer if the block isn't reference
2323 * counted. return -ENOENT if the block is root of reloc tree.
2324 */
1954static noinline_for_stack 2325static noinline_for_stack
1955struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, 2326struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1956 struct backref_node *node) 2327 struct backref_node *node)
1957{ 2328{
2329 struct backref_node *next;
2330 struct btrfs_root *root;
2331 struct btrfs_root *fs_root = NULL;
1958 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2332 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1959 int nr; 2333 int index = 0;
1960 return __select_one_root(trans, node, edges, &nr, 0); 2334
2335 next = node;
2336 while (1) {
2337 cond_resched();
2338 next = walk_up_backref(next, edges, &index);
2339 root = next->root;
2340 BUG_ON(!root);
2341
2342 /* no other choice for non-refernce counted tree */
2343 if (!root->ref_cows)
2344 return root;
2345
2346 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2347 fs_root = root;
2348
2349 if (next != node)
2350 return NULL;
2351
2352 next = walk_down_backref(edges, &index);
2353 if (!next || next->level <= node->level)
2354 break;
2355 }
2356
2357 if (!fs_root)
2358 return ERR_PTR(-ENOENT);
2359 return fs_root;
1961} 2360}
1962 2361
1963static noinline_for_stack 2362static noinline_for_stack
1964struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2363u64 calcu_metadata_size(struct reloc_control *rc,
1965 struct backref_node *node, 2364 struct backref_node *node, int reserve)
1966 struct backref_edge *edges[], int *nr)
1967{ 2365{
1968 return __select_one_root(trans, node, edges, nr, 1); 2366 struct backref_node *next = node;
2367 struct backref_edge *edge;
2368 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2369 u64 num_bytes = 0;
2370 int index = 0;
2371
2372 BUG_ON(reserve && node->processed);
2373
2374 while (next) {
2375 cond_resched();
2376 while (1) {
2377 if (next->processed && (reserve || next != node))
2378 break;
2379
2380 num_bytes += btrfs_level_size(rc->extent_root,
2381 next->level);
2382
2383 if (list_empty(&next->upper))
2384 break;
2385
2386 edge = list_entry(next->upper.next,
2387 struct backref_edge, list[LOWER]);
2388 edges[index++] = edge;
2389 next = edge->node[UPPER];
2390 }
2391 next = walk_down_backref(edges, &index);
2392 }
2393 return num_bytes;
1969} 2394}
1970 2395
1971static void grab_path_buffers(struct btrfs_path *path, 2396static int reserve_metadata_space(struct btrfs_trans_handle *trans,
1972 struct backref_node *node, 2397 struct reloc_control *rc,
1973 struct backref_edge *edges[], int nr) 2398 struct backref_node *node)
1974{ 2399{
1975 int i = 0; 2400 struct btrfs_root *root = rc->extent_root;
1976 while (1) { 2401 u64 num_bytes;
1977 drop_node_buffer(node); 2402 int ret;
1978 node->eb = path->nodes[node->level]; 2403
1979 BUG_ON(!node->eb); 2404 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
1980 if (path->locks[node->level])
1981 node->locked = 1;
1982 path->nodes[node->level] = NULL;
1983 path->locks[node->level] = 0;
1984
1985 if (i >= nr)
1986 break;
1987 2405
1988 edges[i]->blockptr = node->eb->start; 2406 trans->block_rsv = rc->block_rsv;
1989 node = edges[i]->node[UPPER]; 2407 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
1990 i++; 2408 &rc->block_rsv_retries);
2409 if (ret) {
2410 if (ret == -EAGAIN)
2411 rc->commit_transaction = 1;
2412 return ret;
1991 } 2413 }
2414
2415 rc->block_rsv_retries = 0;
2416 return 0;
2417}
2418
2419static void release_metadata_space(struct reloc_control *rc,
2420 struct backref_node *node)
2421{
2422 u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
2423 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
1992} 2424}
1993 2425
1994/* 2426/*
@@ -1999,6 +2431,7 @@ static void grab_path_buffers(struct btrfs_path *path,
1999 * in that case this function just updates pointers. 2431 * in that case this function just updates pointers.
2000 */ 2432 */
2001static int do_relocation(struct btrfs_trans_handle *trans, 2433static int do_relocation(struct btrfs_trans_handle *trans,
2434 struct reloc_control *rc,
2002 struct backref_node *node, 2435 struct backref_node *node,
2003 struct btrfs_key *key, 2436 struct btrfs_key *key,
2004 struct btrfs_path *path, int lowest) 2437 struct btrfs_path *path, int lowest)
@@ -2019,18 +2452,25 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2019 BUG_ON(lowest && node->eb); 2452 BUG_ON(lowest && node->eb);
2020 2453
2021 path->lowest_level = node->level + 1; 2454 path->lowest_level = node->level + 1;
2455 rc->backref_cache.path[node->level] = node;
2022 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2456 list_for_each_entry(edge, &node->upper, list[LOWER]) {
2023 cond_resched(); 2457 cond_resched();
2024 if (node->eb && node->eb->start == edge->blockptr)
2025 continue;
2026 2458
2027 upper = edge->node[UPPER]; 2459 upper = edge->node[UPPER];
2028 root = select_reloc_root(trans, upper, edges, &nr); 2460 root = select_reloc_root(trans, rc, upper, edges, &nr);
2029 if (!root) 2461 BUG_ON(!root);
2030 continue; 2462
2031 2463 if (upper->eb && !upper->locked) {
2032 if (upper->eb && !upper->locked) 2464 if (!lowest) {
2465 ret = btrfs_bin_search(upper->eb, key,
2466 upper->level, &slot);
2467 BUG_ON(ret);
2468 bytenr = btrfs_node_blockptr(upper->eb, slot);
2469 if (node->eb->start == bytenr)
2470 goto next;
2471 }
2033 drop_node_buffer(upper); 2472 drop_node_buffer(upper);
2473 }
2034 2474
2035 if (!upper->eb) { 2475 if (!upper->eb) {
2036 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2476 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
@@ -2040,11 +2480,17 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2040 } 2480 }
2041 BUG_ON(ret > 0); 2481 BUG_ON(ret > 0);
2042 2482
2043 slot = path->slots[upper->level]; 2483 if (!upper->eb) {
2484 upper->eb = path->nodes[upper->level];
2485 path->nodes[upper->level] = NULL;
2486 } else {
2487 BUG_ON(upper->eb != path->nodes[upper->level]);
2488 }
2044 2489
2045 btrfs_unlock_up_safe(path, upper->level + 1); 2490 upper->locked = 1;
2046 grab_path_buffers(path, upper, edges, nr); 2491 path->locks[upper->level] = 0;
2047 2492
2493 slot = path->slots[upper->level];
2048 btrfs_release_path(NULL, path); 2494 btrfs_release_path(NULL, path);
2049 } else { 2495 } else {
2050 ret = btrfs_bin_search(upper->eb, key, upper->level, 2496 ret = btrfs_bin_search(upper->eb, key, upper->level,
@@ -2053,14 +2499,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2053 } 2499 }
2054 2500
2055 bytenr = btrfs_node_blockptr(upper->eb, slot); 2501 bytenr = btrfs_node_blockptr(upper->eb, slot);
2056 if (!lowest) { 2502 if (lowest) {
2057 if (node->eb->start == bytenr) { 2503 BUG_ON(bytenr != node->bytenr);
2058 btrfs_tree_unlock(upper->eb);
2059 upper->locked = 0;
2060 continue;
2061 }
2062 } else { 2504 } else {
2063 BUG_ON(node->bytenr != bytenr); 2505 if (node->eb->start == bytenr)
2506 goto next;
2064 } 2507 }
2065 2508
2066 blocksize = btrfs_level_size(root, node->level); 2509 blocksize = btrfs_level_size(root, node->level);
@@ -2072,13 +2515,13 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2072 if (!node->eb) { 2515 if (!node->eb) {
2073 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2516 ret = btrfs_cow_block(trans, root, eb, upper->eb,
2074 slot, &eb); 2517 slot, &eb);
2518 btrfs_tree_unlock(eb);
2519 free_extent_buffer(eb);
2075 if (ret < 0) { 2520 if (ret < 0) {
2076 err = ret; 2521 err = ret;
2077 break; 2522 goto next;
2078 } 2523 }
2079 btrfs_set_lock_blocking(eb); 2524 BUG_ON(node->eb != eb);
2080 node->eb = eb;
2081 node->locked = 1;
2082 } else { 2525 } else {
2083 btrfs_set_node_blockptr(upper->eb, slot, 2526 btrfs_set_node_blockptr(upper->eb, slot,
2084 node->eb->start); 2527 node->eb->start);
@@ -2096,67 +2539,80 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2096 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2539 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2097 BUG_ON(ret); 2540 BUG_ON(ret);
2098 } 2541 }
2099 if (!lowest) { 2542next:
2100 btrfs_tree_unlock(upper->eb); 2543 if (!upper->pending)
2101 upper->locked = 0; 2544 drop_node_buffer(upper);
2102 } 2545 else
2546 unlock_node_buffer(upper);
2547 if (err)
2548 break;
2103 } 2549 }
2550
2551 if (!err && node->pending) {
2552 drop_node_buffer(node);
2553 list_move_tail(&node->list, &rc->backref_cache.changed);
2554 node->pending = 0;
2555 }
2556
2104 path->lowest_level = 0; 2557 path->lowest_level = 0;
2558 BUG_ON(err == -ENOSPC);
2105 return err; 2559 return err;
2106} 2560}
2107 2561
2108static int link_to_upper(struct btrfs_trans_handle *trans, 2562static int link_to_upper(struct btrfs_trans_handle *trans,
2563 struct reloc_control *rc,
2109 struct backref_node *node, 2564 struct backref_node *node,
2110 struct btrfs_path *path) 2565 struct btrfs_path *path)
2111{ 2566{
2112 struct btrfs_key key; 2567 struct btrfs_key key;
2113 if (!node->eb || list_empty(&node->upper))
2114 return 0;
2115 2568
2116 btrfs_node_key_to_cpu(node->eb, &key, 0); 2569 btrfs_node_key_to_cpu(node->eb, &key, 0);
2117 return do_relocation(trans, node, &key, path, 0); 2570 return do_relocation(trans, rc, node, &key, path, 0);
2118} 2571}
2119 2572
2120static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2573static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2121 struct backref_cache *cache, 2574 struct reloc_control *rc,
2122 struct btrfs_path *path) 2575 struct btrfs_path *path, int err)
2123{ 2576{
2577 LIST_HEAD(list);
2578 struct backref_cache *cache = &rc->backref_cache;
2124 struct backref_node *node; 2579 struct backref_node *node;
2125 int level; 2580 int level;
2126 int ret; 2581 int ret;
2127 int err = 0;
2128 2582
2129 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2583 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2130 while (!list_empty(&cache->pending[level])) { 2584 while (!list_empty(&cache->pending[level])) {
2131 node = list_entry(cache->pending[level].next, 2585 node = list_entry(cache->pending[level].next,
2132 struct backref_node, lower); 2586 struct backref_node, list);
2133 BUG_ON(node->level != level); 2587 list_move_tail(&node->list, &list);
2588 BUG_ON(!node->pending);
2134 2589
2135 ret = link_to_upper(trans, node, path); 2590 if (!err) {
2136 if (ret < 0) 2591 ret = link_to_upper(trans, rc, node, path);
2137 err = ret; 2592 if (ret < 0)
2138 /* 2593 err = ret;
2139 * this remove the node from the pending list and 2594 }
2140 * may add some other nodes to the level + 1
2141 * pending list
2142 */
2143 remove_backref_node(cache, node);
2144 } 2595 }
2596 list_splice_init(&list, &cache->pending[level]);
2145 } 2597 }
2146 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2147 return err; 2598 return err;
2148} 2599}
2149 2600
2150static void mark_block_processed(struct reloc_control *rc, 2601static void mark_block_processed(struct reloc_control *rc,
2151 struct backref_node *node) 2602 u64 bytenr, u32 blocksize)
2603{
2604 set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2605 EXTENT_DIRTY, GFP_NOFS);
2606}
2607
2608static void __mark_block_processed(struct reloc_control *rc,
2609 struct backref_node *node)
2152{ 2610{
2153 u32 blocksize; 2611 u32 blocksize;
2154 if (node->level == 0 || 2612 if (node->level == 0 ||
2155 in_block_group(node->bytenr, rc->block_group)) { 2613 in_block_group(node->bytenr, rc->block_group)) {
2156 blocksize = btrfs_level_size(rc->extent_root, node->level); 2614 blocksize = btrfs_level_size(rc->extent_root, node->level);
2157 set_extent_bits(&rc->processed_blocks, node->bytenr, 2615 mark_block_processed(rc, node->bytenr, blocksize);
2158 node->bytenr + blocksize - 1, EXTENT_DIRTY,
2159 GFP_NOFS);
2160 } 2616 }
2161 node->processed = 1; 2617 node->processed = 1;
2162} 2618}
@@ -2179,7 +2635,7 @@ static void update_processed_blocks(struct reloc_control *rc,
2179 if (next->processed) 2635 if (next->processed)
2180 break; 2636 break;
2181 2637
2182 mark_block_processed(rc, next); 2638 __mark_block_processed(rc, next);
2183 2639
2184 if (list_empty(&next->upper)) 2640 if (list_empty(&next->upper))
2185 break; 2641 break;
@@ -2202,138 +2658,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
2202 return 0; 2658 return 0;
2203} 2659}
2204 2660
2205/*
2206 * check if there are any file extent pointers in the leaf point to
2207 * data require processing
2208 */
2209static int check_file_extents(struct reloc_control *rc,
2210 u64 bytenr, u32 blocksize, u64 ptr_gen)
2211{
2212 struct btrfs_key found_key;
2213 struct btrfs_file_extent_item *fi;
2214 struct extent_buffer *leaf;
2215 u32 nritems;
2216 int i;
2217 int ret = 0;
2218
2219 leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2220
2221 nritems = btrfs_header_nritems(leaf);
2222 for (i = 0; i < nritems; i++) {
2223 cond_resched();
2224 btrfs_item_key_to_cpu(leaf, &found_key, i);
2225 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2226 continue;
2227 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2228 if (btrfs_file_extent_type(leaf, fi) ==
2229 BTRFS_FILE_EXTENT_INLINE)
2230 continue;
2231 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2232 if (bytenr == 0)
2233 continue;
2234 if (in_block_group(bytenr, rc->block_group)) {
2235 ret = 1;
2236 break;
2237 }
2238 }
2239 free_extent_buffer(leaf);
2240 return ret;
2241}
2242
2243/*
2244 * scan child blocks of a given block to find blocks require processing
2245 */
2246static int add_child_blocks(struct btrfs_trans_handle *trans,
2247 struct reloc_control *rc,
2248 struct backref_node *node,
2249 struct rb_root *blocks)
2250{
2251 struct tree_block *block;
2252 struct rb_node *rb_node;
2253 u64 bytenr;
2254 u64 ptr_gen;
2255 u32 blocksize;
2256 u32 nritems;
2257 int i;
2258 int err = 0;
2259
2260 nritems = btrfs_header_nritems(node->eb);
2261 blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2262 for (i = 0; i < nritems; i++) {
2263 cond_resched();
2264 bytenr = btrfs_node_blockptr(node->eb, i);
2265 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2266 if (ptr_gen == trans->transid)
2267 continue;
2268 if (!in_block_group(bytenr, rc->block_group) &&
2269 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2270 continue;
2271 if (tree_block_processed(bytenr, blocksize, rc))
2272 continue;
2273
2274 readahead_tree_block(rc->extent_root,
2275 bytenr, blocksize, ptr_gen);
2276 }
2277
2278 for (i = 0; i < nritems; i++) {
2279 cond_resched();
2280 bytenr = btrfs_node_blockptr(node->eb, i);
2281 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2282 if (ptr_gen == trans->transid)
2283 continue;
2284 if (!in_block_group(bytenr, rc->block_group) &&
2285 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2286 continue;
2287 if (tree_block_processed(bytenr, blocksize, rc))
2288 continue;
2289 if (!in_block_group(bytenr, rc->block_group) &&
2290 !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2291 continue;
2292
2293 block = kmalloc(sizeof(*block), GFP_NOFS);
2294 if (!block) {
2295 err = -ENOMEM;
2296 break;
2297 }
2298 block->bytenr = bytenr;
2299 btrfs_node_key_to_cpu(node->eb, &block->key, i);
2300 block->level = node->level - 1;
2301 block->key_ready = 1;
2302 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2303 BUG_ON(rb_node);
2304 }
2305 if (err)
2306 free_block_list(blocks);
2307 return err;
2308}
2309
2310/*
2311 * find adjacent blocks require processing
2312 */
2313static noinline_for_stack
2314int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2315 struct reloc_control *rc,
2316 struct backref_cache *cache,
2317 struct rb_root *blocks, int level,
2318 struct backref_node **upper)
2319{
2320 struct backref_node *node;
2321 int ret = 0;
2322
2323 WARN_ON(!list_empty(&cache->pending[level]));
2324
2325 if (list_empty(&cache->pending[level + 1]))
2326 return 1;
2327
2328 node = list_entry(cache->pending[level + 1].next,
2329 struct backref_node, lower);
2330 if (node->eb)
2331 ret = add_child_blocks(trans, rc, node, blocks);
2332
2333 *upper = node;
2334 return ret;
2335}
2336
2337static int get_tree_block_key(struct reloc_control *rc, 2661static int get_tree_block_key(struct reloc_control *rc,
2338 struct tree_block *block) 2662 struct tree_block *block)
2339{ 2663{
@@ -2371,40 +2695,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
2371 struct btrfs_path *path) 2695 struct btrfs_path *path)
2372{ 2696{
2373 struct btrfs_root *root; 2697 struct btrfs_root *root;
2374 int ret; 2698 int release = 0;
2699 int ret = 0;
2375 2700
2701 if (!node)
2702 return 0;
2703
2704 BUG_ON(node->processed);
2376 root = select_one_root(trans, node); 2705 root = select_one_root(trans, node);
2377 if (unlikely(!root)) { 2706 if (root == ERR_PTR(-ENOENT)) {
2378 rc->found_old_snapshot = 1;
2379 update_processed_blocks(rc, node); 2707 update_processed_blocks(rc, node);
2380 return 0; 2708 goto out;
2381 } 2709 }
2382 2710
2383 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2711 if (!root || root->ref_cows) {
2384 ret = do_relocation(trans, node, key, path, 1); 2712 ret = reserve_metadata_space(trans, rc, node);
2385 if (ret < 0) 2713 if (ret)
2386 goto out;
2387 if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2388 ret = replace_file_extents(trans, rc, root,
2389 node->eb, NULL);
2390 if (ret < 0)
2391 goto out;
2392 }
2393 drop_node_buffer(node);
2394 } else if (!root->ref_cows) {
2395 path->lowest_level = node->level;
2396 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2397 btrfs_release_path(root, path);
2398 if (ret < 0)
2399 goto out; 2714 goto out;
2400 } else if (root != node->root) { 2715 release = 1;
2401 WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2402 } 2716 }
2403 2717
2404 update_processed_blocks(rc, node); 2718 if (root) {
2405 ret = 0; 2719 if (root->ref_cows) {
2720 BUG_ON(node->new_bytenr);
2721 BUG_ON(!list_empty(&node->list));
2722 btrfs_record_root_in_trans(trans, root);
2723 root = root->reloc_root;
2724 node->new_bytenr = root->node->start;
2725 node->root = root;
2726 list_add_tail(&node->list, &rc->backref_cache.changed);
2727 } else {
2728 path->lowest_level = node->level;
2729 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2730 btrfs_release_path(root, path);
2731 if (ret > 0)
2732 ret = 0;
2733 }
2734 if (!ret)
2735 update_processed_blocks(rc, node);
2736 } else {
2737 ret = do_relocation(trans, rc, node, key, path, 1);
2738 }
2406out: 2739out:
2407 drop_node_buffer(node); 2740 if (ret || node->level == 0 || node->cowonly) {
2741 if (release)
2742 release_metadata_space(rc, node);
2743 remove_backref_node(&rc->backref_cache, node);
2744 }
2408 return ret; 2745 return ret;
2409} 2746}
2410 2747
@@ -2415,12 +2752,10 @@ static noinline_for_stack
2415int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2752int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2416 struct reloc_control *rc, struct rb_root *blocks) 2753 struct reloc_control *rc, struct rb_root *blocks)
2417{ 2754{
2418 struct backref_cache *cache;
2419 struct backref_node *node; 2755 struct backref_node *node;
2420 struct btrfs_path *path; 2756 struct btrfs_path *path;
2421 struct tree_block *block; 2757 struct tree_block *block;
2422 struct rb_node *rb_node; 2758 struct rb_node *rb_node;
2423 int level = -1;
2424 int ret; 2759 int ret;
2425 int err = 0; 2760 int err = 0;
2426 2761
@@ -2428,21 +2763,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2428 if (!path) 2763 if (!path)
2429 return -ENOMEM; 2764 return -ENOMEM;
2430 2765
2431 cache = kmalloc(sizeof(*cache), GFP_NOFS);
2432 if (!cache) {
2433 btrfs_free_path(path);
2434 return -ENOMEM;
2435 }
2436
2437 backref_cache_init(cache);
2438
2439 rb_node = rb_first(blocks); 2766 rb_node = rb_first(blocks);
2440 while (rb_node) { 2767 while (rb_node) {
2441 block = rb_entry(rb_node, struct tree_block, rb_node); 2768 block = rb_entry(rb_node, struct tree_block, rb_node);
2442 if (level == -1)
2443 level = block->level;
2444 else
2445 BUG_ON(level != block->level);
2446 if (!block->key_ready) 2769 if (!block->key_ready)
2447 reada_tree_block(rc, block); 2770 reada_tree_block(rc, block);
2448 rb_node = rb_next(rb_node); 2771 rb_node = rb_next(rb_node);
@@ -2460,7 +2783,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2460 while (rb_node) { 2783 while (rb_node) {
2461 block = rb_entry(rb_node, struct tree_block, rb_node); 2784 block = rb_entry(rb_node, struct tree_block, rb_node);
2462 2785
2463 node = build_backref_tree(rc, cache, &block->key, 2786 node = build_backref_tree(rc, &block->key,
2464 block->level, block->bytenr); 2787 block->level, block->bytenr);
2465 if (IS_ERR(node)) { 2788 if (IS_ERR(node)) {
2466 err = PTR_ERR(node); 2789 err = PTR_ERR(node);
@@ -2470,79 +2793,62 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2470 ret = relocate_tree_block(trans, rc, node, &block->key, 2793 ret = relocate_tree_block(trans, rc, node, &block->key,
2471 path); 2794 path);
2472 if (ret < 0) { 2795 if (ret < 0) {
2473 err = ret; 2796 if (ret != -EAGAIN || rb_node == rb_first(blocks))
2797 err = ret;
2474 goto out; 2798 goto out;
2475 } 2799 }
2476 remove_backref_node(cache, node);
2477 rb_node = rb_next(rb_node); 2800 rb_node = rb_next(rb_node);
2478 } 2801 }
2479 2802out:
2480 if (level > 0)
2481 goto out;
2482
2483 free_block_list(blocks); 2803 free_block_list(blocks);
2804 err = finish_pending_nodes(trans, rc, path, err);
2484 2805
2485 /* 2806 btrfs_free_path(path);
2486 * now backrefs of some upper level tree blocks have been cached, 2807 return err;
2487 * try relocating blocks referenced by these upper level blocks. 2808}
2488 */
2489 while (1) {
2490 struct backref_node *upper = NULL;
2491 if (trans->transaction->in_commit ||
2492 trans->transaction->delayed_refs.flushing)
2493 break;
2494 2809
2495 ret = add_adjacent_blocks(trans, rc, cache, blocks, level, 2810static noinline_for_stack
2496 &upper); 2811int prealloc_file_extent_cluster(struct inode *inode,
2497 if (ret < 0) 2812 struct file_extent_cluster *cluster)
2498 err = ret; 2813{
2499 if (ret != 0) 2814 u64 alloc_hint = 0;
2500 break; 2815 u64 start;
2816 u64 end;
2817 u64 offset = BTRFS_I(inode)->index_cnt;
2818 u64 num_bytes;
2819 int nr = 0;
2820 int ret = 0;
2501 2821
2502 rb_node = rb_first(blocks); 2822 BUG_ON(cluster->start != cluster->boundary[0]);
2503 while (rb_node) { 2823 mutex_lock(&inode->i_mutex);
2504 block = rb_entry(rb_node, struct tree_block, rb_node);
2505 if (trans->transaction->in_commit ||
2506 trans->transaction->delayed_refs.flushing)
2507 goto out;
2508 BUG_ON(!block->key_ready);
2509 node = build_backref_tree(rc, cache, &block->key,
2510 level, block->bytenr);
2511 if (IS_ERR(node)) {
2512 err = PTR_ERR(node);
2513 goto out;
2514 }
2515 2824
2516 ret = relocate_tree_block(trans, rc, node, 2825 ret = btrfs_check_data_free_space(inode, cluster->end +
2517 &block->key, path); 2826 1 - cluster->start);
2518 if (ret < 0) { 2827 if (ret)
2519 err = ret; 2828 goto out;
2520 goto out;
2521 }
2522 remove_backref_node(cache, node);
2523 rb_node = rb_next(rb_node);
2524 }
2525 free_block_list(blocks);
2526 2829
2527 if (upper) { 2830 while (nr < cluster->nr) {
2528 ret = link_to_upper(trans, upper, path); 2831 start = cluster->boundary[nr] - offset;
2529 if (ret < 0) { 2832 if (nr + 1 < cluster->nr)
2530 err = ret; 2833 end = cluster->boundary[nr + 1] - 1 - offset;
2531 break; 2834 else
2532 } 2835 end = cluster->end - offset;
2533 remove_backref_node(cache, upper); 2836
2534 } 2837 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2838 num_bytes = end + 1 - start;
2839 ret = btrfs_prealloc_file_range(inode, 0, start,
2840 num_bytes, num_bytes,
2841 end + 1, &alloc_hint);
2842 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2843 if (ret)
2844 break;
2845 nr++;
2535 } 2846 }
2847 btrfs_free_reserved_data_space(inode, cluster->end +
2848 1 - cluster->start);
2536out: 2849out:
2537 free_block_list(blocks); 2850 mutex_unlock(&inode->i_mutex);
2538 2851 return ret;
2539 ret = finish_pending_nodes(trans, cache, path);
2540 if (ret < 0)
2541 err = ret;
2542
2543 kfree(cache);
2544 btrfs_free_path(path);
2545 return err;
2546} 2852}
2547 2853
2548static noinline_for_stack 2854static noinline_for_stack
@@ -2588,7 +2894,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
2588 u64 offset = BTRFS_I(inode)->index_cnt; 2894 u64 offset = BTRFS_I(inode)->index_cnt;
2589 unsigned long index; 2895 unsigned long index;
2590 unsigned long last_index; 2896 unsigned long last_index;
2591 unsigned int dirty_page = 0;
2592 struct page *page; 2897 struct page *page;
2593 struct file_ra_state *ra; 2898 struct file_ra_state *ra;
2594 int nr = 0; 2899 int nr = 0;
@@ -2601,21 +2906,24 @@ static int relocate_file_extent_cluster(struct inode *inode,
2601 if (!ra) 2906 if (!ra)
2602 return -ENOMEM; 2907 return -ENOMEM;
2603 2908
2604 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2909 ret = prealloc_file_extent_cluster(inode, cluster);
2605 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2910 if (ret)
2911 goto out;
2606 2912
2607 mutex_lock(&inode->i_mutex); 2913 file_ra_state_init(ra, inode->i_mapping);
2608 2914
2609 i_size_write(inode, cluster->end + 1 - offset);
2610 ret = setup_extent_mapping(inode, cluster->start - offset, 2915 ret = setup_extent_mapping(inode, cluster->start - offset,
2611 cluster->end - offset, cluster->start); 2916 cluster->end - offset, cluster->start);
2612 if (ret) 2917 if (ret)
2613 goto out_unlock; 2918 goto out;
2614
2615 file_ra_state_init(ra, inode->i_mapping);
2616 2919
2617 WARN_ON(cluster->start != cluster->boundary[0]); 2920 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2921 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2618 while (index <= last_index) { 2922 while (index <= last_index) {
2923 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2924 if (ret)
2925 goto out;
2926
2619 page = find_lock_page(inode->i_mapping, index); 2927 page = find_lock_page(inode->i_mapping, index);
2620 if (!page) { 2928 if (!page) {
2621 page_cache_sync_readahead(inode->i_mapping, 2929 page_cache_sync_readahead(inode->i_mapping,
@@ -2623,8 +2931,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
2623 last_index + 1 - index); 2931 last_index + 1 - index);
2624 page = grab_cache_page(inode->i_mapping, index); 2932 page = grab_cache_page(inode->i_mapping, index);
2625 if (!page) { 2933 if (!page) {
2934 btrfs_delalloc_release_metadata(inode,
2935 PAGE_CACHE_SIZE);
2626 ret = -ENOMEM; 2936 ret = -ENOMEM;
2627 goto out_unlock; 2937 goto out;
2628 } 2938 }
2629 } 2939 }
2630 2940
@@ -2640,8 +2950,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
2640 if (!PageUptodate(page)) { 2950 if (!PageUptodate(page)) {
2641 unlock_page(page); 2951 unlock_page(page);
2642 page_cache_release(page); 2952 page_cache_release(page);
2953 btrfs_delalloc_release_metadata(inode,
2954 PAGE_CACHE_SIZE);
2643 ret = -EIO; 2955 ret = -EIO;
2644 goto out_unlock; 2956 goto out;
2645 } 2957 }
2646 } 2958 }
2647 2959
@@ -2660,10 +2972,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
2660 EXTENT_BOUNDARY, GFP_NOFS); 2972 EXTENT_BOUNDARY, GFP_NOFS);
2661 nr++; 2973 nr++;
2662 } 2974 }
2663 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2664 2975
2976 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2665 set_page_dirty(page); 2977 set_page_dirty(page);
2666 dirty_page++;
2667 2978
2668 unlock_extent(&BTRFS_I(inode)->io_tree, 2979 unlock_extent(&BTRFS_I(inode)->io_tree,
2669 page_start, page_end, GFP_NOFS); 2980 page_start, page_end, GFP_NOFS);
@@ -2671,20 +2982,11 @@ static int relocate_file_extent_cluster(struct inode *inode,
2671 page_cache_release(page); 2982 page_cache_release(page);
2672 2983
2673 index++; 2984 index++;
2674 if (nr < cluster->nr && 2985 balance_dirty_pages_ratelimited(inode->i_mapping);
2675 page_end + 1 + offset == cluster->boundary[nr]) { 2986 btrfs_throttle(BTRFS_I(inode)->root);
2676 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2677 dirty_page);
2678 dirty_page = 0;
2679 }
2680 }
2681 if (dirty_page) {
2682 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2683 dirty_page);
2684 } 2987 }
2685 WARN_ON(nr != cluster->nr); 2988 WARN_ON(nr != cluster->nr);
2686out_unlock: 2989out:
2687 mutex_unlock(&inode->i_mutex);
2688 kfree(ra); 2990 kfree(ra);
2689 return ret; 2991 return ret;
2690} 2992}
@@ -2870,9 +3172,6 @@ out:
2870static int block_use_full_backref(struct reloc_control *rc, 3172static int block_use_full_backref(struct reloc_control *rc,
2871 struct extent_buffer *eb) 3173 struct extent_buffer *eb)
2872{ 3174{
2873 struct btrfs_path *path;
2874 struct btrfs_extent_item *ei;
2875 struct btrfs_key key;
2876 u64 flags; 3175 u64 flags;
2877 int ret; 3176 int ret;
2878 3177
@@ -2880,28 +3179,14 @@ static int block_use_full_backref(struct reloc_control *rc,
2880 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) 3179 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2881 return 1; 3180 return 1;
2882 3181
2883 path = btrfs_alloc_path(); 3182 ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
2884 BUG_ON(!path); 3183 eb->start, eb->len, NULL, &flags);
2885
2886 key.objectid = eb->start;
2887 key.type = BTRFS_EXTENT_ITEM_KEY;
2888 key.offset = eb->len;
2889
2890 path->search_commit_root = 1;
2891 path->skip_locking = 1;
2892 ret = btrfs_search_slot(NULL, rc->extent_root,
2893 &key, path, 0, 0);
2894 BUG_ON(ret); 3184 BUG_ON(ret);
2895 3185
2896 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2897 struct btrfs_extent_item);
2898 flags = btrfs_extent_flags(path->nodes[0], ei);
2899 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2900 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) 3186 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2901 ret = 1; 3187 ret = 1;
2902 else 3188 else
2903 ret = 0; 3189 ret = 0;
2904 btrfs_free_path(path);
2905 return ret; 3190 return ret;
2906} 3191}
2907 3192
@@ -3074,22 +3359,10 @@ int add_data_references(struct reloc_control *rc,
3074 struct btrfs_extent_inline_ref *iref; 3359 struct btrfs_extent_inline_ref *iref;
3075 unsigned long ptr; 3360 unsigned long ptr;
3076 unsigned long end; 3361 unsigned long end;
3077 u32 blocksize; 3362 u32 blocksize = btrfs_level_size(rc->extent_root, 0);
3078 int ret; 3363 int ret;
3079 int err = 0; 3364 int err = 0;
3080 3365
3081 ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3082 extent_key->offset);
3083 BUG_ON(ret < 0);
3084 if (ret > 0) {
3085 /* the relocated data is fragmented */
3086 rc->extents_skipped++;
3087 btrfs_release_path(rc->extent_root, path);
3088 return 0;
3089 }
3090
3091 blocksize = btrfs_level_size(rc->extent_root, 0);
3092
3093 eb = path->nodes[0]; 3366 eb = path->nodes[0];
3094 ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 3367 ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3095 end = ptr + btrfs_item_size_nr(eb, path->slots[0]); 3368 end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
@@ -3170,7 +3443,8 @@ int add_data_references(struct reloc_control *rc,
3170 */ 3443 */
3171static noinline_for_stack 3444static noinline_for_stack
3172int find_next_extent(struct btrfs_trans_handle *trans, 3445int find_next_extent(struct btrfs_trans_handle *trans,
3173 struct reloc_control *rc, struct btrfs_path *path) 3446 struct reloc_control *rc, struct btrfs_path *path,
3447 struct btrfs_key *extent_key)
3174{ 3448{
3175 struct btrfs_key key; 3449 struct btrfs_key key;
3176 struct extent_buffer *leaf; 3450 struct extent_buffer *leaf;
@@ -3225,6 +3499,7 @@ next:
3225 rc->search_start = end + 1; 3499 rc->search_start = end + 1;
3226 } else { 3500 } else {
3227 rc->search_start = key.objectid + key.offset; 3501 rc->search_start = key.objectid + key.offset;
3502 memcpy(extent_key, &key, sizeof(key));
3228 return 0; 3503 return 0;
3229 } 3504 }
3230 } 3505 }
@@ -3262,12 +3537,49 @@ static int check_extent_flags(u64 flags)
3262 return 0; 3537 return 0;
3263} 3538}
3264 3539
3540static noinline_for_stack
3541int prepare_to_relocate(struct reloc_control *rc)
3542{
3543 struct btrfs_trans_handle *trans;
3544 int ret;
3545
3546 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
3547 if (!rc->block_rsv)
3548 return -ENOMEM;
3549
3550 /*
3551 * reserve some space for creating reloc trees.
3552 * btrfs_init_reloc_root will use them when there
3553 * is no reservation in transaction handle.
3554 */
3555 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
3556 rc->extent_root->nodesize * 256,
3557 &rc->block_rsv_retries);
3558 if (ret)
3559 return ret;
3560
3561 rc->block_rsv->refill_used = 1;
3562 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3563
3564 memset(&rc->cluster, 0, sizeof(rc->cluster));
3565 rc->search_start = rc->block_group->key.objectid;
3566 rc->extents_found = 0;
3567 rc->nodes_relocated = 0;
3568 rc->merging_rsv_size = 0;
3569 rc->block_rsv_retries = 0;
3570
3571 rc->create_reloc_tree = 1;
3572 set_reloc_control(rc);
3573
3574 trans = btrfs_join_transaction(rc->extent_root, 1);
3575 btrfs_commit_transaction(trans, rc->extent_root);
3576 return 0;
3577}
3265 3578
3266static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3579static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3267{ 3580{
3268 struct rb_root blocks = RB_ROOT; 3581 struct rb_root blocks = RB_ROOT;
3269 struct btrfs_key key; 3582 struct btrfs_key key;
3270 struct file_extent_cluster *cluster;
3271 struct btrfs_trans_handle *trans = NULL; 3583 struct btrfs_trans_handle *trans = NULL;
3272 struct btrfs_path *path; 3584 struct btrfs_path *path;
3273 struct btrfs_extent_item *ei; 3585 struct btrfs_extent_item *ei;
@@ -3277,33 +3589,25 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3277 int ret; 3589 int ret;
3278 int err = 0; 3590 int err = 0;
3279 3591
3280 cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
3281 if (!cluster)
3282 return -ENOMEM;
3283
3284 path = btrfs_alloc_path(); 3592 path = btrfs_alloc_path();
3285 if (!path) { 3593 if (!path)
3286 kfree(cluster);
3287 return -ENOMEM; 3594 return -ENOMEM;
3288 }
3289
3290 rc->extents_found = 0;
3291 rc->extents_skipped = 0;
3292
3293 rc->search_start = rc->block_group->key.objectid;
3294 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3295 GFP_NOFS);
3296
3297 rc->create_reloc_root = 1;
3298 set_reloc_control(rc);
3299 3595
3300 trans = btrfs_start_transaction(rc->extent_root, 1); 3596 ret = prepare_to_relocate(rc);
3301 btrfs_commit_transaction(trans, rc->extent_root); 3597 if (ret) {
3598 err = ret;
3599 goto out_free;
3600 }
3302 3601
3303 while (1) { 3602 while (1) {
3304 trans = btrfs_start_transaction(rc->extent_root, 1); 3603 trans = btrfs_start_transaction(rc->extent_root, 0);
3604
3605 if (update_backref_cache(trans, &rc->backref_cache)) {
3606 btrfs_end_transaction(trans, rc->extent_root);
3607 continue;
3608 }
3305 3609
3306 ret = find_next_extent(trans, rc, path); 3610 ret = find_next_extent(trans, rc, path, &key);
3307 if (ret < 0) 3611 if (ret < 0)
3308 err = ret; 3612 err = ret;
3309 if (ret != 0) 3613 if (ret != 0)
@@ -3313,9 +3617,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3313 3617
3314 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3618 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3315 struct btrfs_extent_item); 3619 struct btrfs_extent_item);
3316 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 3620 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
3317 item_size = btrfs_item_size_nr(path->nodes[0],
3318 path->slots[0]);
3319 if (item_size >= sizeof(*ei)) { 3621 if (item_size >= sizeof(*ei)) {
3320 flags = btrfs_extent_flags(path->nodes[0], ei); 3622 flags = btrfs_extent_flags(path->nodes[0], ei);
3321 ret = check_extent_flags(flags); 3623 ret = check_extent_flags(flags);
@@ -3356,73 +3658,100 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3356 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3658 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3357 ret = add_tree_block(rc, &key, path, &blocks); 3659 ret = add_tree_block(rc, &key, path, &blocks);
3358 } else if (rc->stage == UPDATE_DATA_PTRS && 3660 } else if (rc->stage == UPDATE_DATA_PTRS &&
3359 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3661 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3360 ret = add_data_references(rc, &key, path, &blocks); 3662 ret = add_data_references(rc, &key, path, &blocks);
3361 } else { 3663 } else {
3362 btrfs_release_path(rc->extent_root, path); 3664 btrfs_release_path(rc->extent_root, path);
3363 ret = 0; 3665 ret = 0;
3364 } 3666 }
3365 if (ret < 0) { 3667 if (ret < 0) {
3366 err = 0; 3668 err = ret;
3367 break; 3669 break;
3368 } 3670 }
3369 3671
3370 if (!RB_EMPTY_ROOT(&blocks)) { 3672 if (!RB_EMPTY_ROOT(&blocks)) {
3371 ret = relocate_tree_blocks(trans, rc, &blocks); 3673 ret = relocate_tree_blocks(trans, rc, &blocks);
3372 if (ret < 0) { 3674 if (ret < 0) {
3675 if (ret != -EAGAIN) {
3676 err = ret;
3677 break;
3678 }
3679 rc->extents_found--;
3680 rc->search_start = key.objectid;
3681 }
3682 }
3683
3684 ret = btrfs_block_rsv_check(trans, rc->extent_root,
3685 rc->block_rsv, 0, 5);
3686 if (ret < 0) {
3687 if (ret != -EAGAIN) {
3373 err = ret; 3688 err = ret;
3689 WARN_ON(1);
3374 break; 3690 break;
3375 } 3691 }
3692 rc->commit_transaction = 1;
3376 } 3693 }
3377 3694
3378 nr = trans->blocks_used; 3695 if (rc->commit_transaction) {
3379 btrfs_end_transaction(trans, rc->extent_root); 3696 rc->commit_transaction = 0;
3697 ret = btrfs_commit_transaction(trans, rc->extent_root);
3698 BUG_ON(ret);
3699 } else {
3700 nr = trans->blocks_used;
3701 btrfs_end_transaction_throttle(trans, rc->extent_root);
3702 btrfs_btree_balance_dirty(rc->extent_root, nr);
3703 }
3380 trans = NULL; 3704 trans = NULL;
3381 btrfs_btree_balance_dirty(rc->extent_root, nr);
3382 3705
3383 if (rc->stage == MOVE_DATA_EXTENTS && 3706 if (rc->stage == MOVE_DATA_EXTENTS &&
3384 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3707 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3385 rc->found_file_extent = 1; 3708 rc->found_file_extent = 1;
3386 ret = relocate_data_extent(rc->data_inode, 3709 ret = relocate_data_extent(rc->data_inode,
3387 &key, cluster); 3710 &key, &rc->cluster);
3388 if (ret < 0) { 3711 if (ret < 0) {
3389 err = ret; 3712 err = ret;
3390 break; 3713 break;
3391 } 3714 }
3392 } 3715 }
3393 } 3716 }
3394 btrfs_free_path(path); 3717
3718 btrfs_release_path(rc->extent_root, path);
3719 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3720 GFP_NOFS);
3395 3721
3396 if (trans) { 3722 if (trans) {
3397 nr = trans->blocks_used; 3723 nr = trans->blocks_used;
3398 btrfs_end_transaction(trans, rc->extent_root); 3724 btrfs_end_transaction_throttle(trans, rc->extent_root);
3399 btrfs_btree_balance_dirty(rc->extent_root, nr); 3725 btrfs_btree_balance_dirty(rc->extent_root, nr);
3400 } 3726 }
3401 3727
3402 if (!err) { 3728 if (!err) {
3403 ret = relocate_file_extent_cluster(rc->data_inode, cluster); 3729 ret = relocate_file_extent_cluster(rc->data_inode,
3730 &rc->cluster);
3404 if (ret < 0) 3731 if (ret < 0)
3405 err = ret; 3732 err = ret;
3406 } 3733 }
3407 3734
3408 kfree(cluster); 3735 rc->create_reloc_tree = 0;
3736 set_reloc_control(rc);
3409 3737
3410 rc->create_reloc_root = 0; 3738 backref_cache_cleanup(&rc->backref_cache);
3411 smp_mb(); 3739 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3412 3740
3413 if (rc->extents_found > 0) { 3741 err = prepare_to_merge(rc, err);
3414 trans = btrfs_start_transaction(rc->extent_root, 1);
3415 btrfs_commit_transaction(trans, rc->extent_root);
3416 }
3417 3742
3418 merge_reloc_roots(rc); 3743 merge_reloc_roots(rc);
3419 3744
3745 rc->merge_reloc_tree = 0;
3420 unset_reloc_control(rc); 3746 unset_reloc_control(rc);
3747 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3421 3748
3422 /* get rid of pinned extents */ 3749 /* get rid of pinned extents */
3423 trans = btrfs_start_transaction(rc->extent_root, 1); 3750 trans = btrfs_join_transaction(rc->extent_root, 1);
3424 btrfs_commit_transaction(trans, rc->extent_root); 3751 btrfs_commit_transaction(trans, rc->extent_root);
3425 3752out_free:
3753 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3754 btrfs_free_path(path);
3426 return err; 3755 return err;
3427} 3756}
3428 3757
@@ -3448,7 +3777,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3448 btrfs_set_inode_generation(leaf, item, 1); 3777 btrfs_set_inode_generation(leaf, item, 1);
3449 btrfs_set_inode_size(leaf, item, 0); 3778 btrfs_set_inode_size(leaf, item, 0);
3450 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3779 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3451 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); 3780 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3781 BTRFS_INODE_PREALLOC);
3452 btrfs_mark_buffer_dirty(leaf); 3782 btrfs_mark_buffer_dirty(leaf);
3453 btrfs_release_path(root, path); 3783 btrfs_release_path(root, path);
3454out: 3784out:
@@ -3460,8 +3790,9 @@ out:
3460 * helper to create inode for data relocation. 3790 * helper to create inode for data relocation.
3461 * the inode is in data relocation tree and its link count is 0 3791 * the inode is in data relocation tree and its link count is 0
3462 */ 3792 */
3463static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 3793static noinline_for_stack
3464 struct btrfs_block_group_cache *group) 3794struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3795 struct btrfs_block_group_cache *group)
3465{ 3796{
3466 struct inode *inode = NULL; 3797 struct inode *inode = NULL;
3467 struct btrfs_trans_handle *trans; 3798 struct btrfs_trans_handle *trans;
@@ -3475,8 +3806,9 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3475 if (IS_ERR(root)) 3806 if (IS_ERR(root))
3476 return ERR_CAST(root); 3807 return ERR_CAST(root);
3477 3808
3478 trans = btrfs_start_transaction(root, 1); 3809 trans = btrfs_start_transaction(root, 6);
3479 BUG_ON(!trans); 3810 if (IS_ERR(trans))
3811 return ERR_CAST(trans);
3480 3812
3481 err = btrfs_find_free_objectid(trans, root, objectid, &objectid); 3813 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3482 if (err) 3814 if (err)
@@ -3496,7 +3828,6 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3496out: 3828out:
3497 nr = trans->blocks_used; 3829 nr = trans->blocks_used;
3498 btrfs_end_transaction(trans, root); 3830 btrfs_end_transaction(trans, root);
3499
3500 btrfs_btree_balance_dirty(root, nr); 3831 btrfs_btree_balance_dirty(root, nr);
3501 if (err) { 3832 if (err) {
3502 if (inode) 3833 if (inode)
@@ -3506,6 +3837,21 @@ out:
3506 return inode; 3837 return inode;
3507} 3838}
3508 3839
3840static struct reloc_control *alloc_reloc_control(void)
3841{
3842 struct reloc_control *rc;
3843
3844 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3845 if (!rc)
3846 return NULL;
3847
3848 INIT_LIST_HEAD(&rc->reloc_roots);
3849 backref_cache_init(&rc->backref_cache);
3850 mapping_tree_init(&rc->reloc_root_tree);
3851 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3852 return rc;
3853}
3854
3509/* 3855/*
3510 * function to relocate all extents in a block group. 3856 * function to relocate all extents in a block group.
3511 */ 3857 */
@@ -3514,24 +3860,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3514 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3860 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3515 struct reloc_control *rc; 3861 struct reloc_control *rc;
3516 int ret; 3862 int ret;
3863 int rw = 0;
3517 int err = 0; 3864 int err = 0;
3518 3865
3519 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3866 rc = alloc_reloc_control();
3520 if (!rc) 3867 if (!rc)
3521 return -ENOMEM; 3868 return -ENOMEM;
3522 3869
3523 mapping_tree_init(&rc->reloc_root_tree); 3870 rc->extent_root = extent_root;
3524 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3525 INIT_LIST_HEAD(&rc->reloc_roots);
3526 3871
3527 rc->block_group = btrfs_lookup_block_group(fs_info, group_start); 3872 rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3528 BUG_ON(!rc->block_group); 3873 BUG_ON(!rc->block_group);
3529 3874
3530 btrfs_init_workers(&rc->workers, "relocate", 3875 if (!rc->block_group->ro) {
3531 fs_info->thread_pool_size, NULL); 3876 ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
3532 3877 if (ret) {
3533 rc->extent_root = extent_root; 3878 err = ret;
3534 btrfs_prepare_block_group_relocation(extent_root, rc->block_group); 3879 goto out;
3880 }
3881 rw = 1;
3882 }
3535 3883
3536 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 3884 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3537 if (IS_ERR(rc->data_inode)) { 3885 if (IS_ERR(rc->data_inode)) {
@@ -3548,9 +3896,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3548 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); 3896 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
3549 3897
3550 while (1) { 3898 while (1) {
3551 rc->extents_found = 0;
3552 rc->extents_skipped = 0;
3553
3554 mutex_lock(&fs_info->cleaner_mutex); 3899 mutex_lock(&fs_info->cleaner_mutex);
3555 3900
3556 btrfs_clean_old_snapshots(fs_info->tree_root); 3901 btrfs_clean_old_snapshots(fs_info->tree_root);
@@ -3559,7 +3904,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3559 mutex_unlock(&fs_info->cleaner_mutex); 3904 mutex_unlock(&fs_info->cleaner_mutex);
3560 if (ret < 0) { 3905 if (ret < 0) {
3561 err = ret; 3906 err = ret;
3562 break; 3907 goto out;
3563 } 3908 }
3564 3909
3565 if (rc->extents_found == 0) 3910 if (rc->extents_found == 0)
@@ -3573,18 +3918,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3573 invalidate_mapping_pages(rc->data_inode->i_mapping, 3918 invalidate_mapping_pages(rc->data_inode->i_mapping,
3574 0, -1); 3919 0, -1);
3575 rc->stage = UPDATE_DATA_PTRS; 3920 rc->stage = UPDATE_DATA_PTRS;
3576 } else if (rc->stage == UPDATE_DATA_PTRS &&
3577 rc->extents_skipped >= rc->extents_found) {
3578 iput(rc->data_inode);
3579 rc->data_inode = create_reloc_inode(fs_info,
3580 rc->block_group);
3581 if (IS_ERR(rc->data_inode)) {
3582 err = PTR_ERR(rc->data_inode);
3583 rc->data_inode = NULL;
3584 break;
3585 }
3586 rc->stage = MOVE_DATA_EXTENTS;
3587 rc->found_file_extent = 0;
3588 } 3921 }
3589 } 3922 }
3590 3923
@@ -3597,8 +3930,9 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3597 WARN_ON(rc->block_group->reserved > 0); 3930 WARN_ON(rc->block_group->reserved > 0);
3598 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); 3931 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3599out: 3932out:
3933 if (err && rw)
3934 btrfs_set_block_group_rw(extent_root, rc->block_group);
3600 iput(rc->data_inode); 3935 iput(rc->data_inode);
3601 btrfs_stop_workers(&rc->workers);
3602 btrfs_put_block_group(rc->block_group); 3936 btrfs_put_block_group(rc->block_group);
3603 kfree(rc); 3937 kfree(rc);
3604 return err; 3938 return err;
@@ -3609,7 +3943,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3609 struct btrfs_trans_handle *trans; 3943 struct btrfs_trans_handle *trans;
3610 int ret; 3944 int ret;
3611 3945
3612 trans = btrfs_start_transaction(root->fs_info->tree_root, 1); 3946 trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
3613 3947
3614 memset(&root->root_item.drop_progress, 0, 3948 memset(&root->root_item.drop_progress, 0,
3615 sizeof(root->root_item.drop_progress)); 3949 sizeof(root->root_item.drop_progress));
@@ -3702,20 +4036,20 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3702 if (list_empty(&reloc_roots)) 4036 if (list_empty(&reloc_roots))
3703 goto out; 4037 goto out;
3704 4038
3705 rc = kzalloc(sizeof(*rc), GFP_NOFS); 4039 rc = alloc_reloc_control();
3706 if (!rc) { 4040 if (!rc) {
3707 err = -ENOMEM; 4041 err = -ENOMEM;
3708 goto out; 4042 goto out;
3709 } 4043 }
3710 4044
3711 mapping_tree_init(&rc->reloc_root_tree);
3712 INIT_LIST_HEAD(&rc->reloc_roots);
3713 btrfs_init_workers(&rc->workers, "relocate",
3714 root->fs_info->thread_pool_size, NULL);
3715 rc->extent_root = root->fs_info->extent_root; 4045 rc->extent_root = root->fs_info->extent_root;
3716 4046
3717 set_reloc_control(rc); 4047 set_reloc_control(rc);
3718 4048
4049 trans = btrfs_join_transaction(rc->extent_root, 1);
4050
4051 rc->merge_reloc_tree = 1;
4052
3719 while (!list_empty(&reloc_roots)) { 4053 while (!list_empty(&reloc_roots)) {
3720 reloc_root = list_entry(reloc_roots.next, 4054 reloc_root = list_entry(reloc_roots.next,
3721 struct btrfs_root, root_list); 4055 struct btrfs_root, root_list);
@@ -3735,20 +4069,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3735 fs_root->reloc_root = reloc_root; 4069 fs_root->reloc_root = reloc_root;
3736 } 4070 }
3737 4071
3738 trans = btrfs_start_transaction(rc->extent_root, 1);
3739 btrfs_commit_transaction(trans, rc->extent_root); 4072 btrfs_commit_transaction(trans, rc->extent_root);
3740 4073
3741 merge_reloc_roots(rc); 4074 merge_reloc_roots(rc);
3742 4075
3743 unset_reloc_control(rc); 4076 unset_reloc_control(rc);
3744 4077
3745 trans = btrfs_start_transaction(rc->extent_root, 1); 4078 trans = btrfs_join_transaction(rc->extent_root, 1);
3746 btrfs_commit_transaction(trans, rc->extent_root); 4079 btrfs_commit_transaction(trans, rc->extent_root);
3747out: 4080out:
3748 if (rc) { 4081 kfree(rc);
3749 btrfs_stop_workers(&rc->workers);
3750 kfree(rc);
3751 }
3752 while (!list_empty(&reloc_roots)) { 4082 while (!list_empty(&reloc_roots)) {
3753 reloc_root = list_entry(reloc_roots.next, 4083 reloc_root = list_entry(reloc_roots.next,
3754 struct btrfs_root, root_list); 4084 struct btrfs_root, root_list);
@@ -3814,3 +4144,130 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3814 btrfs_put_ordered_extent(ordered); 4144 btrfs_put_ordered_extent(ordered);
3815 return 0; 4145 return 0;
3816} 4146}
4147
4148void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4149 struct btrfs_root *root, struct extent_buffer *buf,
4150 struct extent_buffer *cow)
4151{
4152 struct reloc_control *rc;
4153 struct backref_node *node;
4154 int first_cow = 0;
4155 int level;
4156 int ret;
4157
4158 rc = root->fs_info->reloc_ctl;
4159 if (!rc)
4160 return;
4161
4162 BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4163 root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4164
4165 level = btrfs_header_level(buf);
4166 if (btrfs_header_generation(buf) <=
4167 btrfs_root_last_snapshot(&root->root_item))
4168 first_cow = 1;
4169
4170 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4171 rc->create_reloc_tree) {
4172 WARN_ON(!first_cow && level == 0);
4173
4174 node = rc->backref_cache.path[level];
4175 BUG_ON(node->bytenr != buf->start &&
4176 node->new_bytenr != buf->start);
4177
4178 drop_node_buffer(node);
4179 extent_buffer_get(cow);
4180 node->eb = cow;
4181 node->new_bytenr = cow->start;
4182
4183 if (!node->pending) {
4184 list_move_tail(&node->list,
4185 &rc->backref_cache.pending[level]);
4186 node->pending = 1;
4187 }
4188
4189 if (first_cow)
4190 __mark_block_processed(rc, node);
4191
4192 if (first_cow && level > 0)
4193 rc->nodes_relocated += buf->len;
4194 }
4195
4196 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
4197 ret = replace_file_extents(trans, rc, root, cow);
4198 BUG_ON(ret);
4199 }
4200}
4201
4202/*
4203 * called before creating snapshot. it calculates metadata reservation
4204 * requried for relocating tree blocks in the snapshot
4205 */
4206void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
4207 struct btrfs_pending_snapshot *pending,
4208 u64 *bytes_to_reserve)
4209{
4210 struct btrfs_root *root;
4211 struct reloc_control *rc;
4212
4213 root = pending->root;
4214 if (!root->reloc_root)
4215 return;
4216
4217 rc = root->fs_info->reloc_ctl;
4218 if (!rc->merge_reloc_tree)
4219 return;
4220
4221 root = root->reloc_root;
4222 BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4223 /*
4224 * relocation is in the stage of merging trees. the space
4225 * used by merging a reloc tree is twice the size of
4226 * relocated tree nodes in the worst case. half for cowing
4227 * the reloc tree, half for cowing the fs tree. the space
4228 * used by cowing the reloc tree will be freed after the
4229 * tree is dropped. if we create snapshot, cowing the fs
4230 * tree may use more space than it frees. so we need
4231 * reserve extra space.
4232 */
4233 *bytes_to_reserve += rc->nodes_relocated;
4234}
4235
4236/*
4237 * called after snapshot is created. migrate block reservation
4238 * and create reloc root for the newly created snapshot
4239 */
4240void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4241 struct btrfs_pending_snapshot *pending)
4242{
4243 struct btrfs_root *root = pending->root;
4244 struct btrfs_root *reloc_root;
4245 struct btrfs_root *new_root;
4246 struct reloc_control *rc;
4247 int ret;
4248
4249 if (!root->reloc_root)
4250 return;
4251
4252 rc = root->fs_info->reloc_ctl;
4253 rc->merging_rsv_size += rc->nodes_relocated;
4254
4255 if (rc->merge_reloc_tree) {
4256 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4257 rc->block_rsv,
4258 rc->nodes_relocated);
4259 BUG_ON(ret);
4260 }
4261
4262 new_root = pending->snap;
4263 reloc_root = create_reloc_root(trans, root->reloc_root,
4264 new_root->root_key.objectid);
4265
4266 __add_reloc_root(reloc_root);
4267 new_root->reloc_root = reloc_root;
4268
4269 if (rc->create_reloc_tree) {
4270 ret = clone_backref_node(trans, rc, root, reloc_root);
4271 BUG_ON(ret);
4272 }
4273}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 67fa2d29d663..b91ccd972644 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -259,6 +259,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
259 struct extent_buffer *leaf; 259 struct extent_buffer *leaf;
260 struct btrfs_path *path; 260 struct btrfs_path *path;
261 struct btrfs_key key; 261 struct btrfs_key key;
262 struct btrfs_key root_key;
263 struct btrfs_root *root;
262 int err = 0; 264 int err = 0;
263 int ret; 265 int ret;
264 266
@@ -270,6 +272,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
270 key.type = BTRFS_ORPHAN_ITEM_KEY; 272 key.type = BTRFS_ORPHAN_ITEM_KEY;
271 key.offset = 0; 273 key.offset = 0;
272 274
275 root_key.type = BTRFS_ROOT_ITEM_KEY;
276 root_key.offset = (u64)-1;
277
273 while (1) { 278 while (1) {
274 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); 279 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
275 if (ret < 0) { 280 if (ret < 0) {
@@ -294,13 +299,25 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
294 key.type != BTRFS_ORPHAN_ITEM_KEY) 299 key.type != BTRFS_ORPHAN_ITEM_KEY)
295 break; 300 break;
296 301
297 ret = btrfs_find_dead_roots(tree_root, key.offset); 302 root_key.objectid = key.offset;
298 if (ret) { 303 key.offset++;
304
305 root = btrfs_read_fs_root_no_name(tree_root->fs_info,
306 &root_key);
307 if (!IS_ERR(root))
308 continue;
309
310 ret = PTR_ERR(root);
311 if (ret != -ENOENT) {
299 err = ret; 312 err = ret;
300 break; 313 break;
301 } 314 }
302 315
303 key.offset++; 316 ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
317 if (ret) {
318 err = ret;
319 break;
320 }
304 } 321 }
305 322
306 btrfs_free_path(path); 323 btrfs_free_path(path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1866dff0538e..d34b2dfc9628 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -498,7 +498,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
498 btrfs_start_delalloc_inodes(root, 0); 498 btrfs_start_delalloc_inodes(root, 0);
499 btrfs_wait_ordered_extents(root, 0, 0); 499 btrfs_wait_ordered_extents(root, 0, 0);
500 500
501 trans = btrfs_start_transaction(root, 1); 501 trans = btrfs_start_transaction(root, 0);
502 ret = btrfs_commit_transaction(trans, root); 502 ret = btrfs_commit_transaction(trans, root);
503 return ret; 503 return ret;
504} 504}
@@ -694,11 +694,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
694 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 694 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
695 return -EINVAL; 695 return -EINVAL;
696 696
697 /* recover relocation */ 697 ret = btrfs_cleanup_fs_roots(root->fs_info);
698 ret = btrfs_recover_relocation(root);
699 WARN_ON(ret); 698 WARN_ON(ret);
700 699
701 ret = btrfs_cleanup_fs_roots(root->fs_info); 700 /* recover relocation */
701 ret = btrfs_recover_relocation(root);
702 WARN_ON(ret); 702 WARN_ON(ret);
703 703
704 sb->s_flags &= ~MS_RDONLY; 704 sb->s_flags &= ~MS_RDONLY;
@@ -714,34 +714,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
714 struct list_head *head = &root->fs_info->space_info; 714 struct list_head *head = &root->fs_info->space_info;
715 struct btrfs_space_info *found; 715 struct btrfs_space_info *found;
716 u64 total_used = 0; 716 u64 total_used = 0;
717 u64 data_used = 0;
718 int bits = dentry->d_sb->s_blocksize_bits; 717 int bits = dentry->d_sb->s_blocksize_bits;
719 __be32 *fsid = (__be32 *)root->fs_info->fsid; 718 __be32 *fsid = (__be32 *)root->fs_info->fsid;
720 719
721 rcu_read_lock(); 720 rcu_read_lock();
722 list_for_each_entry_rcu(found, head, list) { 721 list_for_each_entry_rcu(found, head, list)
723 if (found->flags & (BTRFS_BLOCK_GROUP_DUP| 722 total_used += found->disk_used;
724 BTRFS_BLOCK_GROUP_RAID10|
725 BTRFS_BLOCK_GROUP_RAID1)) {
726 total_used += found->bytes_used;
727 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
728 data_used += found->bytes_used;
729 else
730 data_used += found->total_bytes;
731 }
732
733 total_used += found->bytes_used;
734 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
735 data_used += found->bytes_used;
736 else
737 data_used += found->total_bytes;
738 }
739 rcu_read_unlock(); 723 rcu_read_unlock();
740 724
741 buf->f_namelen = BTRFS_NAME_LEN; 725 buf->f_namelen = BTRFS_NAME_LEN;
742 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 726 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
743 buf->f_bfree = buf->f_blocks - (total_used >> bits); 727 buf->f_bfree = buf->f_blocks - (total_used >> bits);
744 buf->f_bavail = buf->f_blocks - (data_used >> bits); 728 buf->f_bavail = buf->f_bfree;
745 buf->f_bsize = dentry->d_sb->s_blocksize; 729 buf->f_bsize = dentry->d_sb->s_blocksize;
746 buf->f_type = BTRFS_SUPER_MAGIC; 730 buf->f_type = BTRFS_SUPER_MAGIC;
747 731
@@ -832,11 +816,14 @@ static const struct file_operations btrfs_ctl_fops = {
832}; 816};
833 817
834static struct miscdevice btrfs_misc = { 818static struct miscdevice btrfs_misc = {
835 .minor = MISC_DYNAMIC_MINOR, 819 .minor = BTRFS_MINOR,
836 .name = "btrfs-control", 820 .name = "btrfs-control",
837 .fops = &btrfs_ctl_fops 821 .fops = &btrfs_ctl_fops
838}; 822};
839 823
824MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
825MODULE_ALIAS("devname:btrfs-control");
826
840static int btrfs_interface_init(void) 827static int btrfs_interface_init(void)
841{ 828{
842 return misc_register(&btrfs_misc); 829 return misc_register(&btrfs_misc);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2cb116099b90..66e4c66cc63b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -165,54 +165,89 @@ enum btrfs_trans_type {
165 TRANS_USERSPACE, 165 TRANS_USERSPACE,
166}; 166};
167 167
168static int may_wait_transaction(struct btrfs_root *root, int type)
169{
170 if (!root->fs_info->log_root_recovering &&
171 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
172 type == TRANS_USERSPACE))
173 return 1;
174 return 0;
175}
176
168static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 177static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
169 int num_blocks, int type) 178 u64 num_items, int type)
170{ 179{
171 struct btrfs_trans_handle *h = 180 struct btrfs_trans_handle *h;
172 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 181 struct btrfs_transaction *cur_trans;
182 int retries = 0;
173 int ret; 183 int ret;
184again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h)
187 return ERR_PTR(-ENOMEM);
174 188
175 mutex_lock(&root->fs_info->trans_mutex); 189 mutex_lock(&root->fs_info->trans_mutex);
176 if (!root->fs_info->log_root_recovering && 190 if (may_wait_transaction(root, type))
177 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
178 type == TRANS_USERSPACE))
179 wait_current_trans(root); 191 wait_current_trans(root);
192
180 ret = join_transaction(root); 193 ret = join_transaction(root);
181 BUG_ON(ret); 194 BUG_ON(ret);
182 195
183 h->transid = root->fs_info->running_transaction->transid; 196 cur_trans = root->fs_info->running_transaction;
184 h->transaction = root->fs_info->running_transaction; 197 cur_trans->use_count++;
185 h->blocks_reserved = num_blocks; 198 mutex_unlock(&root->fs_info->trans_mutex);
199
200 h->transid = cur_trans->transid;
201 h->transaction = cur_trans;
186 h->blocks_used = 0; 202 h->blocks_used = 0;
187 h->block_group = 0; 203 h->block_group = 0;
188 h->alloc_exclude_nr = 0; 204 h->bytes_reserved = 0;
189 h->alloc_exclude_start = 0;
190 h->delayed_ref_updates = 0; 205 h->delayed_ref_updates = 0;
206 h->block_rsv = NULL;
191 207
192 if (!current->journal_info && type != TRANS_USERSPACE) 208 smp_mb();
193 current->journal_info = h; 209 if (cur_trans->blocked && may_wait_transaction(root, type)) {
210 btrfs_commit_transaction(h, root);
211 goto again;
212 }
213
214 if (num_items > 0) {
215 ret = btrfs_trans_reserve_metadata(h, root, num_items,
216 &retries);
217 if (ret == -EAGAIN) {
218 btrfs_commit_transaction(h, root);
219 goto again;
220 }
221 if (ret < 0) {
222 btrfs_end_transaction(h, root);
223 return ERR_PTR(ret);
224 }
225 }
194 226
195 root->fs_info->running_transaction->use_count++; 227 mutex_lock(&root->fs_info->trans_mutex);
196 record_root_in_trans(h, root); 228 record_root_in_trans(h, root);
197 mutex_unlock(&root->fs_info->trans_mutex); 229 mutex_unlock(&root->fs_info->trans_mutex);
230
231 if (!current->journal_info && type != TRANS_USERSPACE)
232 current->journal_info = h;
198 return h; 233 return h;
199} 234}
200 235
201struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 236struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
202 int num_blocks) 237 int num_items)
203{ 238{
204 return start_transaction(root, num_blocks, TRANS_START); 239 return start_transaction(root, num_items, TRANS_START);
205} 240}
206struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 241struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
207 int num_blocks) 242 int num_blocks)
208{ 243{
209 return start_transaction(root, num_blocks, TRANS_JOIN); 244 return start_transaction(root, 0, TRANS_JOIN);
210} 245}
211 246
212struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 247struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
213 int num_blocks) 248 int num_blocks)
214{ 249{
215 return start_transaction(r, num_blocks, TRANS_USERSPACE); 250 return start_transaction(r, 0, TRANS_USERSPACE);
216} 251}
217 252
218/* wait for a transaction commit to be fully complete */ 253/* wait for a transaction commit to be fully complete */
@@ -286,10 +321,36 @@ void btrfs_throttle(struct btrfs_root *root)
286 mutex_unlock(&root->fs_info->trans_mutex); 321 mutex_unlock(&root->fs_info->trans_mutex);
287} 322}
288 323
324static int should_end_transaction(struct btrfs_trans_handle *trans,
325 struct btrfs_root *root)
326{
327 int ret;
328 ret = btrfs_block_rsv_check(trans, root,
329 &root->fs_info->global_block_rsv, 0, 5);
330 return ret ? 1 : 0;
331}
332
333int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
334 struct btrfs_root *root)
335{
336 struct btrfs_transaction *cur_trans = trans->transaction;
337 int updates;
338
339 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
340 return 1;
341
342 updates = trans->delayed_ref_updates;
343 trans->delayed_ref_updates = 0;
344 if (updates)
345 btrfs_run_delayed_refs(trans, root, updates);
346
347 return should_end_transaction(trans, root);
348}
349
289static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 350static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
290 struct btrfs_root *root, int throttle) 351 struct btrfs_root *root, int throttle)
291{ 352{
292 struct btrfs_transaction *cur_trans; 353 struct btrfs_transaction *cur_trans = trans->transaction;
293 struct btrfs_fs_info *info = root->fs_info; 354 struct btrfs_fs_info *info = root->fs_info;
294 int count = 0; 355 int count = 0;
295 356
@@ -313,9 +374,21 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
313 count++; 374 count++;
314 } 375 }
315 376
377 btrfs_trans_release_metadata(trans, root);
378
379 if (!root->fs_info->open_ioctl_trans &&
380 should_end_transaction(trans, root))
381 trans->transaction->blocked = 1;
382
383 if (cur_trans->blocked && !cur_trans->in_commit) {
384 if (throttle)
385 return btrfs_commit_transaction(trans, root);
386 else
387 wake_up_process(info->transaction_kthread);
388 }
389
316 mutex_lock(&info->trans_mutex); 390 mutex_lock(&info->trans_mutex);
317 cur_trans = info->running_transaction; 391 WARN_ON(cur_trans != info->running_transaction);
318 WARN_ON(cur_trans != trans->transaction);
319 WARN_ON(cur_trans->num_writers < 1); 392 WARN_ON(cur_trans->num_writers < 1);
320 cur_trans->num_writers--; 393 cur_trans->num_writers--;
321 394
@@ -603,6 +676,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
603 676
604 btrfs_free_log(trans, root); 677 btrfs_free_log(trans, root);
605 btrfs_update_reloc_root(trans, root); 678 btrfs_update_reloc_root(trans, root);
679 btrfs_orphan_commit_root(trans, root);
606 680
607 if (root->commit_root != root->node) { 681 if (root->commit_root != root->node) {
608 switch_commit_root(root); 682 switch_commit_root(root);
@@ -627,30 +701,30 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
627int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) 701int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
628{ 702{
629 struct btrfs_fs_info *info = root->fs_info; 703 struct btrfs_fs_info *info = root->fs_info;
630 int ret;
631 struct btrfs_trans_handle *trans; 704 struct btrfs_trans_handle *trans;
705 int ret;
632 unsigned long nr; 706 unsigned long nr;
633 707
634 smp_mb(); 708 if (xchg(&root->defrag_running, 1))
635 if (root->defrag_running)
636 return 0; 709 return 0;
637 trans = btrfs_start_transaction(root, 1); 710
638 while (1) { 711 while (1) {
639 root->defrag_running = 1; 712 trans = btrfs_start_transaction(root, 0);
713 if (IS_ERR(trans))
714 return PTR_ERR(trans);
715
640 ret = btrfs_defrag_leaves(trans, root, cacheonly); 716 ret = btrfs_defrag_leaves(trans, root, cacheonly);
717
641 nr = trans->blocks_used; 718 nr = trans->blocks_used;
642 btrfs_end_transaction(trans, root); 719 btrfs_end_transaction(trans, root);
643 btrfs_btree_balance_dirty(info->tree_root, nr); 720 btrfs_btree_balance_dirty(info->tree_root, nr);
644 cond_resched(); 721 cond_resched();
645 722
646 trans = btrfs_start_transaction(root, 1);
647 if (root->fs_info->closing || ret != -EAGAIN) 723 if (root->fs_info->closing || ret != -EAGAIN)
648 break; 724 break;
649 } 725 }
650 root->defrag_running = 0; 726 root->defrag_running = 0;
651 smp_mb(); 727 return ret;
652 btrfs_end_transaction(trans, root);
653 return 0;
654} 728}
655 729
656#if 0 730#if 0
@@ -758,47 +832,63 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
758 struct btrfs_root *root = pending->root; 832 struct btrfs_root *root = pending->root;
759 struct btrfs_root *parent_root; 833 struct btrfs_root *parent_root;
760 struct inode *parent_inode; 834 struct inode *parent_inode;
835 struct dentry *dentry;
761 struct extent_buffer *tmp; 836 struct extent_buffer *tmp;
762 struct extent_buffer *old; 837 struct extent_buffer *old;
763 int ret; 838 int ret;
764 u64 objectid; 839 int retries = 0;
765 int namelen; 840 u64 to_reserve = 0;
766 u64 index = 0; 841 u64 index = 0;
767 842 u64 objectid;
768 parent_inode = pending->dentry->d_parent->d_inode;
769 parent_root = BTRFS_I(parent_inode)->root;
770 843
771 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 844 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
772 if (!new_root_item) { 845 if (!new_root_item) {
773 ret = -ENOMEM; 846 pending->error = -ENOMEM;
774 goto fail; 847 goto fail;
775 } 848 }
849
776 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); 850 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
777 if (ret) 851 if (ret) {
852 pending->error = ret;
778 goto fail; 853 goto fail;
854 }
855
856 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
857 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
858
859 if (to_reserve > 0) {
860 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
861 to_reserve, &retries);
862 if (ret) {
863 pending->error = ret;
864 goto fail;
865 }
866 }
779 867
780 key.objectid = objectid; 868 key.objectid = objectid;
781 /* record when the snapshot was created in key.offset */ 869 key.offset = (u64)-1;
782 key.offset = trans->transid; 870 key.type = BTRFS_ROOT_ITEM_KEY;
783 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
784 871
785 memcpy(&pending->root_key, &key, sizeof(key)); 872 trans->block_rsv = &pending->block_rsv;
786 pending->root_key.offset = (u64)-1;
787 873
874 dentry = pending->dentry;
875 parent_inode = dentry->d_parent->d_inode;
876 parent_root = BTRFS_I(parent_inode)->root;
788 record_root_in_trans(trans, parent_root); 877 record_root_in_trans(trans, parent_root);
878
789 /* 879 /*
790 * insert the directory item 880 * insert the directory item
791 */ 881 */
792 namelen = strlen(pending->name);
793 ret = btrfs_set_inode_index(parent_inode, &index); 882 ret = btrfs_set_inode_index(parent_inode, &index);
794 BUG_ON(ret); 883 BUG_ON(ret);
795 ret = btrfs_insert_dir_item(trans, parent_root, 884 ret = btrfs_insert_dir_item(trans, parent_root,
796 pending->name, namelen, 885 dentry->d_name.name, dentry->d_name.len,
797 parent_inode->i_ino, 886 parent_inode->i_ino, &key,
798 &pending->root_key, BTRFS_FT_DIR, index); 887 BTRFS_FT_DIR, index);
799 BUG_ON(ret); 888 BUG_ON(ret);
800 889
801 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); 890 btrfs_i_size_write(parent_inode, parent_inode->i_size +
891 dentry->d_name.len * 2);
802 ret = btrfs_update_inode(trans, parent_root, parent_inode); 892 ret = btrfs_update_inode(trans, parent_root, parent_inode);
803 BUG_ON(ret); 893 BUG_ON(ret);
804 894
@@ -815,22 +905,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
815 free_extent_buffer(old); 905 free_extent_buffer(old);
816 906
817 btrfs_set_root_node(new_root_item, tmp); 907 btrfs_set_root_node(new_root_item, tmp);
818 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 908 /* record when the snapshot was created in key.offset */
819 new_root_item); 909 key.offset = trans->transid;
820 BUG_ON(ret); 910 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
821 btrfs_tree_unlock(tmp); 911 btrfs_tree_unlock(tmp);
822 free_extent_buffer(tmp); 912 free_extent_buffer(tmp);
913 BUG_ON(ret);
823 914
824 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, 915 /*
825 pending->root_key.objectid, 916 * insert root back/forward references
917 */
918 ret = btrfs_add_root_ref(trans, tree_root, objectid,
826 parent_root->root_key.objectid, 919 parent_root->root_key.objectid,
827 parent_inode->i_ino, index, pending->name, 920 parent_inode->i_ino, index,
828 namelen); 921 dentry->d_name.name, dentry->d_name.len);
829 BUG_ON(ret); 922 BUG_ON(ret);
830 923
924 key.offset = (u64)-1;
925 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
926 BUG_ON(IS_ERR(pending->snap));
927
928 btrfs_reloc_post_snapshot(trans, pending);
929 btrfs_orphan_post_snapshot(trans, pending);
831fail: 930fail:
832 kfree(new_root_item); 931 kfree(new_root_item);
833 return ret; 932 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
933 return 0;
834} 934}
835 935
836/* 936/*
@@ -878,6 +978,16 @@ int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
878 return ret; 978 return ret;
879} 979}
880 980
981int btrfs_transaction_blocked(struct btrfs_fs_info *info)
982{
983 int ret = 0;
984 spin_lock(&info->new_trans_lock);
985 if (info->running_transaction)
986 ret = info->running_transaction->blocked;
987 spin_unlock(&info->new_trans_lock);
988 return ret;
989}
990
881int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 991int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
882 struct btrfs_root *root) 992 struct btrfs_root *root)
883{ 993{
@@ -899,6 +1009,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
899 ret = btrfs_run_delayed_refs(trans, root, 0); 1009 ret = btrfs_run_delayed_refs(trans, root, 0);
900 BUG_ON(ret); 1010 BUG_ON(ret);
901 1011
1012 btrfs_trans_release_metadata(trans, root);
1013
902 cur_trans = trans->transaction; 1014 cur_trans = trans->transaction;
903 /* 1015 /*
904 * set the flushing flag so procs in this transaction have to 1016 * set the flushing flag so procs in this transaction have to
@@ -951,9 +1063,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
951 snap_pending = 1; 1063 snap_pending = 1;
952 1064
953 WARN_ON(cur_trans != trans->transaction); 1065 WARN_ON(cur_trans != trans->transaction);
954 prepare_to_wait(&cur_trans->writer_wait, &wait,
955 TASK_UNINTERRUPTIBLE);
956
957 if (cur_trans->num_writers > 1) 1066 if (cur_trans->num_writers > 1)
958 timeout = MAX_SCHEDULE_TIMEOUT; 1067 timeout = MAX_SCHEDULE_TIMEOUT;
959 else if (should_grow) 1068 else if (should_grow)
@@ -976,6 +1085,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
976 */ 1085 */
977 btrfs_run_ordered_operations(root, 1); 1086 btrfs_run_ordered_operations(root, 1);
978 1087
1088 prepare_to_wait(&cur_trans->writer_wait, &wait,
1089 TASK_UNINTERRUPTIBLE);
1090
979 smp_mb(); 1091 smp_mb();
980 if (cur_trans->num_writers > 1 || should_grow) 1092 if (cur_trans->num_writers > 1 || should_grow)
981 schedule_timeout(timeout); 1093 schedule_timeout(timeout);
@@ -1103,9 +1215,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1103 1215
1104 if (btrfs_header_backref_rev(root->node) < 1216 if (btrfs_header_backref_rev(root->node) <
1105 BTRFS_MIXED_BACKREF_REV) 1217 BTRFS_MIXED_BACKREF_REV)
1106 btrfs_drop_snapshot(root, 0); 1218 btrfs_drop_snapshot(root, NULL, 0);
1107 else 1219 else
1108 btrfs_drop_snapshot(root, 1); 1220 btrfs_drop_snapshot(root, NULL, 1);
1109 } 1221 }
1110 return 0; 1222 return 0;
1111} 1223}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 93c7ccb33118..e104986d0bfd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -45,20 +45,23 @@ struct btrfs_transaction {
45 45
46struct btrfs_trans_handle { 46struct btrfs_trans_handle {
47 u64 transid; 47 u64 transid;
48 u64 block_group;
49 u64 bytes_reserved;
48 unsigned long blocks_reserved; 50 unsigned long blocks_reserved;
49 unsigned long blocks_used; 51 unsigned long blocks_used;
50 struct btrfs_transaction *transaction;
51 u64 block_group;
52 u64 alloc_exclude_start;
53 u64 alloc_exclude_nr;
54 unsigned long delayed_ref_updates; 52 unsigned long delayed_ref_updates;
53 struct btrfs_transaction *transaction;
54 struct btrfs_block_rsv *block_rsv;
55}; 55};
56 56
57struct btrfs_pending_snapshot { 57struct btrfs_pending_snapshot {
58 struct dentry *dentry; 58 struct dentry *dentry;
59 struct btrfs_root *root; 59 struct btrfs_root *root;
60 char *name; 60 struct btrfs_root *snap;
61 struct btrfs_key root_key; 61 /* block reservation for the operation */
62 struct btrfs_block_rsv block_rsv;
63 /* extra metadata reseration for relocation */
64 int error;
62 struct list_head list; 65 struct list_head list;
63}; 66};
64 67
@@ -85,11 +88,11 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
85int btrfs_end_transaction(struct btrfs_trans_handle *trans, 88int btrfs_end_transaction(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root); 89 struct btrfs_root *root);
87struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 90struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
88 int num_blocks); 91 int num_items);
89struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 92struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
90 int num_blocks); 93 int num_blocks);
91struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 94struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
92 int num_blocks); 95 int num_blocks);
93int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 96int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
94 struct btrfs_root *root); 97 struct btrfs_root *root);
95int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 98int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -103,6 +106,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root); 106 struct btrfs_root *root);
104int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 107int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
105 struct btrfs_root *root); 108 struct btrfs_root *root);
109int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
110 struct btrfs_root *root);
106void btrfs_throttle(struct btrfs_root *root); 111void btrfs_throttle(struct btrfs_root *root);
107int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 112int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
108 struct btrfs_root *root); 113 struct btrfs_root *root);
@@ -112,5 +117,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
112 struct extent_io_tree *dirty_pages, int mark); 117 struct extent_io_tree *dirty_pages, int mark);
113int btrfs_wait_marked_extents(struct btrfs_root *root, 118int btrfs_wait_marked_extents(struct btrfs_root *root,
114 struct extent_io_tree *dirty_pages, int mark); 119 struct extent_io_tree *dirty_pages, int mark);
120int btrfs_transaction_blocked(struct btrfs_fs_info *info);
115int btrfs_transaction_in_commit(struct btrfs_fs_info *info); 121int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
116#endif 122#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b10eacdb1620..f7ac8e013ed7 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -117,13 +117,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
117 path->nodes[1], 0, 117 path->nodes[1], 0,
118 cache_only, &last_ret, 118 cache_only, &last_ret,
119 &root->defrag_progress); 119 &root->defrag_progress);
120 WARN_ON(ret && ret != -EAGAIN); 120 if (ret) {
121 WARN_ON(ret == -EAGAIN);
122 goto out;
123 }
121 if (next_key_ret == 0) { 124 if (next_key_ret == 0) {
122 memcpy(&root->defrag_progress, &key, sizeof(key)); 125 memcpy(&root->defrag_progress, &key, sizeof(key));
123 ret = -EAGAIN; 126 ret = -EAGAIN;
124 } 127 }
125
126 btrfs_release_path(root, path);
127out: 128out:
128 if (path) 129 if (path)
129 btrfs_free_path(path); 130 btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index af57dd2b43d4..fb102a9aee9c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -135,6 +135,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
135 struct btrfs_root *root) 135 struct btrfs_root *root)
136{ 136{
137 int ret; 137 int ret;
138 int err = 0;
138 139
139 mutex_lock(&root->log_mutex); 140 mutex_lock(&root->log_mutex);
140 if (root->log_root) { 141 if (root->log_root) {
@@ -155,17 +156,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
155 mutex_lock(&root->fs_info->tree_log_mutex); 156 mutex_lock(&root->fs_info->tree_log_mutex);
156 if (!root->fs_info->log_root_tree) { 157 if (!root->fs_info->log_root_tree) {
157 ret = btrfs_init_log_root_tree(trans, root->fs_info); 158 ret = btrfs_init_log_root_tree(trans, root->fs_info);
158 BUG_ON(ret); 159 if (ret)
160 err = ret;
159 } 161 }
160 if (!root->log_root) { 162 if (err == 0 && !root->log_root) {
161 ret = btrfs_add_log_tree(trans, root); 163 ret = btrfs_add_log_tree(trans, root);
162 BUG_ON(ret); 164 if (ret)
165 err = ret;
163 } 166 }
164 mutex_unlock(&root->fs_info->tree_log_mutex); 167 mutex_unlock(&root->fs_info->tree_log_mutex);
165 root->log_batch++; 168 root->log_batch++;
166 atomic_inc(&root->log_writers); 169 atomic_inc(&root->log_writers);
167 mutex_unlock(&root->log_mutex); 170 mutex_unlock(&root->log_mutex);
168 return 0; 171 return err;
169} 172}
170 173
171/* 174/*
@@ -376,7 +379,7 @@ insert:
376 BUG_ON(ret); 379 BUG_ON(ret);
377 } 380 }
378 } else if (ret) { 381 } else if (ret) {
379 BUG(); 382 return ret;
380 } 383 }
381 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 384 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
382 path->slots[0]); 385 path->slots[0]);
@@ -1699,9 +1702,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1699 1702
1700 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1703 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1701 1704
1702 wc->process_func(root, next, wc, ptr_gen);
1703
1704 if (*level == 1) { 1705 if (*level == 1) {
1706 wc->process_func(root, next, wc, ptr_gen);
1707
1705 path->slots[*level]++; 1708 path->slots[*level]++;
1706 if (wc->free) { 1709 if (wc->free) {
1707 btrfs_read_buffer(next, ptr_gen); 1710 btrfs_read_buffer(next, ptr_gen);
@@ -1734,35 +1737,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1734 WARN_ON(*level < 0); 1737 WARN_ON(*level < 0);
1735 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1738 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1736 1739
1737 if (path->nodes[*level] == root->node) 1740 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
1738 parent = path->nodes[*level];
1739 else
1740 parent = path->nodes[*level + 1];
1741
1742 bytenr = path->nodes[*level]->start;
1743
1744 blocksize = btrfs_level_size(root, *level);
1745 root_owner = btrfs_header_owner(parent);
1746 root_gen = btrfs_header_generation(parent);
1747
1748 wc->process_func(root, path->nodes[*level], wc,
1749 btrfs_header_generation(path->nodes[*level]));
1750
1751 if (wc->free) {
1752 next = path->nodes[*level];
1753 btrfs_tree_lock(next);
1754 clean_tree_block(trans, root, next);
1755 btrfs_set_lock_blocking(next);
1756 btrfs_wait_tree_block_writeback(next);
1757 btrfs_tree_unlock(next);
1758
1759 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1760 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1761 BUG_ON(ret);
1762 }
1763 free_extent_buffer(path->nodes[*level]);
1764 path->nodes[*level] = NULL;
1765 *level += 1;
1766 1741
1767 cond_resched(); 1742 cond_resched();
1768 return 0; 1743 return 0;
@@ -1781,7 +1756,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1781 1756
1782 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1757 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1783 slot = path->slots[i]; 1758 slot = path->slots[i];
1784 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 1759 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
1785 struct extent_buffer *node; 1760 struct extent_buffer *node;
1786 node = path->nodes[i]; 1761 node = path->nodes[i];
1787 path->slots[i]++; 1762 path->slots[i]++;
@@ -2047,7 +2022,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2047 mutex_unlock(&log_root_tree->log_mutex); 2022 mutex_unlock(&log_root_tree->log_mutex);
2048 2023
2049 ret = update_log_root(trans, log); 2024 ret = update_log_root(trans, log);
2050 BUG_ON(ret);
2051 2025
2052 mutex_lock(&log_root_tree->log_mutex); 2026 mutex_lock(&log_root_tree->log_mutex);
2053 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2027 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
@@ -2056,6 +2030,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2056 wake_up(&log_root_tree->log_writer_wait); 2030 wake_up(&log_root_tree->log_writer_wait);
2057 } 2031 }
2058 2032
2033 if (ret) {
2034 BUG_ON(ret != -ENOSPC);
2035 root->fs_info->last_trans_log_full_commit = trans->transid;
2036 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2037 mutex_unlock(&log_root_tree->log_mutex);
2038 ret = -EAGAIN;
2039 goto out;
2040 }
2041
2059 index2 = log_root_tree->log_transid % 2; 2042 index2 = log_root_tree->log_transid % 2;
2060 if (atomic_read(&log_root_tree->log_commit[index2])) { 2043 if (atomic_read(&log_root_tree->log_commit[index2])) {
2061 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2044 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
@@ -2129,15 +2112,10 @@ out:
2129 return 0; 2112 return 0;
2130} 2113}
2131 2114
2132/* 2115static void free_log_tree(struct btrfs_trans_handle *trans,
2133 * free all the extents used by the tree log. This should be called 2116 struct btrfs_root *log)
2134 * at commit time of the full transaction
2135 */
2136int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2137{ 2117{
2138 int ret; 2118 int ret;
2139 struct btrfs_root *log;
2140 struct key;
2141 u64 start; 2119 u64 start;
2142 u64 end; 2120 u64 end;
2143 struct walk_control wc = { 2121 struct walk_control wc = {
@@ -2145,10 +2123,6 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2145 .process_func = process_one_buffer 2123 .process_func = process_one_buffer
2146 }; 2124 };
2147 2125
2148 if (!root->log_root || root->fs_info->log_root_recovering)
2149 return 0;
2150
2151 log = root->log_root;
2152 ret = walk_log_tree(trans, log, &wc); 2126 ret = walk_log_tree(trans, log, &wc);
2153 BUG_ON(ret); 2127 BUG_ON(ret);
2154 2128
@@ -2162,14 +2136,30 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2162 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2136 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2163 } 2137 }
2164 2138
2165 if (log->log_transid > 0) {
2166 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2167 &log->root_key);
2168 BUG_ON(ret);
2169 }
2170 root->log_root = NULL;
2171 free_extent_buffer(log->node); 2139 free_extent_buffer(log->node);
2172 kfree(log); 2140 kfree(log);
2141}
2142
2143/*
2144 * free all the extents used by the tree log. This should be called
2145 * at commit time of the full transaction
2146 */
2147int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2148{
2149 if (root->log_root) {
2150 free_log_tree(trans, root->log_root);
2151 root->log_root = NULL;
2152 }
2153 return 0;
2154}
2155
2156int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2157 struct btrfs_fs_info *fs_info)
2158{
2159 if (fs_info->log_root_tree) {
2160 free_log_tree(trans, fs_info->log_root_tree);
2161 fs_info->log_root_tree = NULL;
2162 }
2173 return 0; 2163 return 0;
2174} 2164}
2175 2165
@@ -2203,6 +2193,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2203 struct btrfs_dir_item *di; 2193 struct btrfs_dir_item *di;
2204 struct btrfs_path *path; 2194 struct btrfs_path *path;
2205 int ret; 2195 int ret;
2196 int err = 0;
2206 int bytes_del = 0; 2197 int bytes_del = 0;
2207 2198
2208 if (BTRFS_I(dir)->logged_trans < trans->transid) 2199 if (BTRFS_I(dir)->logged_trans < trans->transid)
@@ -2218,7 +2209,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2218 path = btrfs_alloc_path(); 2209 path = btrfs_alloc_path();
2219 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2210 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2220 name, name_len, -1); 2211 name, name_len, -1);
2221 if (di && !IS_ERR(di)) { 2212 if (IS_ERR(di)) {
2213 err = PTR_ERR(di);
2214 goto fail;
2215 }
2216 if (di) {
2222 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2217 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2223 bytes_del += name_len; 2218 bytes_del += name_len;
2224 BUG_ON(ret); 2219 BUG_ON(ret);
@@ -2226,7 +2221,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2226 btrfs_release_path(log, path); 2221 btrfs_release_path(log, path);
2227 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2222 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2228 index, name, name_len, -1); 2223 index, name, name_len, -1);
2229 if (di && !IS_ERR(di)) { 2224 if (IS_ERR(di)) {
2225 err = PTR_ERR(di);
2226 goto fail;
2227 }
2228 if (di) {
2230 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2229 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2231 bytes_del += name_len; 2230 bytes_del += name_len;
2232 BUG_ON(ret); 2231 BUG_ON(ret);
@@ -2244,6 +2243,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2244 btrfs_release_path(log, path); 2243 btrfs_release_path(log, path);
2245 2244
2246 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2245 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2246 if (ret < 0) {
2247 err = ret;
2248 goto fail;
2249 }
2247 if (ret == 0) { 2250 if (ret == 0) {
2248 struct btrfs_inode_item *item; 2251 struct btrfs_inode_item *item;
2249 u64 i_size; 2252 u64 i_size;
@@ -2261,9 +2264,13 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2261 ret = 0; 2264 ret = 0;
2262 btrfs_release_path(log, path); 2265 btrfs_release_path(log, path);
2263 } 2266 }
2264 2267fail:
2265 btrfs_free_path(path); 2268 btrfs_free_path(path);
2266 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2269 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2270 if (ret == -ENOSPC) {
2271 root->fs_info->last_trans_log_full_commit = trans->transid;
2272 ret = 0;
2273 }
2267 btrfs_end_log_trans(root); 2274 btrfs_end_log_trans(root);
2268 2275
2269 return 0; 2276 return 0;
@@ -2291,6 +2298,10 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2291 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2298 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2292 dirid, &index); 2299 dirid, &index);
2293 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2300 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2301 if (ret == -ENOSPC) {
2302 root->fs_info->last_trans_log_full_commit = trans->transid;
2303 ret = 0;
2304 }
2294 btrfs_end_log_trans(root); 2305 btrfs_end_log_trans(root);
2295 2306
2296 return ret; 2307 return ret;
@@ -2318,7 +2329,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2318 else 2329 else
2319 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2330 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2320 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2331 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2321 BUG_ON(ret); 2332 if (ret)
2333 return ret;
2322 2334
2323 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2335 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2324 struct btrfs_dir_log_item); 2336 struct btrfs_dir_log_item);
@@ -2343,6 +2355,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2343 struct btrfs_key max_key; 2355 struct btrfs_key max_key;
2344 struct btrfs_root *log = root->log_root; 2356 struct btrfs_root *log = root->log_root;
2345 struct extent_buffer *src; 2357 struct extent_buffer *src;
2358 int err = 0;
2346 int ret; 2359 int ret;
2347 int i; 2360 int i;
2348 int nritems; 2361 int nritems;
@@ -2405,6 +2418,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2405 ret = overwrite_item(trans, log, dst_path, 2418 ret = overwrite_item(trans, log, dst_path,
2406 path->nodes[0], path->slots[0], 2419 path->nodes[0], path->slots[0],
2407 &tmp); 2420 &tmp);
2421 if (ret) {
2422 err = ret;
2423 goto done;
2424 }
2408 } 2425 }
2409 } 2426 }
2410 btrfs_release_path(root, path); 2427 btrfs_release_path(root, path);
@@ -2432,7 +2449,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2432 goto done; 2449 goto done;
2433 ret = overwrite_item(trans, log, dst_path, src, i, 2450 ret = overwrite_item(trans, log, dst_path, src, i,
2434 &min_key); 2451 &min_key);
2435 BUG_ON(ret); 2452 if (ret) {
2453 err = ret;
2454 goto done;
2455 }
2436 } 2456 }
2437 path->slots[0] = nritems; 2457 path->slots[0] = nritems;
2438 2458
@@ -2454,22 +2474,30 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2454 ret = overwrite_item(trans, log, dst_path, 2474 ret = overwrite_item(trans, log, dst_path,
2455 path->nodes[0], path->slots[0], 2475 path->nodes[0], path->slots[0],
2456 &tmp); 2476 &tmp);
2457 2477 if (ret)
2458 BUG_ON(ret); 2478 err = ret;
2459 last_offset = tmp.offset; 2479 else
2480 last_offset = tmp.offset;
2460 goto done; 2481 goto done;
2461 } 2482 }
2462 } 2483 }
2463done: 2484done:
2464 *last_offset_ret = last_offset;
2465 btrfs_release_path(root, path); 2485 btrfs_release_path(root, path);
2466 btrfs_release_path(log, dst_path); 2486 btrfs_release_path(log, dst_path);
2467 2487
2468 /* insert the log range keys to indicate where the log is valid */ 2488 if (err == 0) {
2469 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, 2489 *last_offset_ret = last_offset;
2470 first_offset, last_offset); 2490 /*
2471 BUG_ON(ret); 2491 * insert the log range keys to indicate where the log
2472 return 0; 2492 * is valid
2493 */
2494 ret = insert_dir_log_key(trans, log, path, key_type,
2495 inode->i_ino, first_offset,
2496 last_offset);
2497 if (ret)
2498 err = ret;
2499 }
2500 return err;
2473} 2501}
2474 2502
2475/* 2503/*
@@ -2501,7 +2529,8 @@ again:
2501 ret = log_dir_items(trans, root, inode, path, 2529 ret = log_dir_items(trans, root, inode, path,
2502 dst_path, key_type, min_key, 2530 dst_path, key_type, min_key,
2503 &max_key); 2531 &max_key);
2504 BUG_ON(ret); 2532 if (ret)
2533 return ret;
2505 if (max_key == (u64)-1) 2534 if (max_key == (u64)-1)
2506 break; 2535 break;
2507 min_key = max_key + 1; 2536 min_key = max_key + 1;
@@ -2535,8 +2564,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2535 2564
2536 while (1) { 2565 while (1) {
2537 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2566 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2538 2567 BUG_ON(ret == 0);
2539 if (ret != 1) 2568 if (ret < 0)
2540 break; 2569 break;
2541 2570
2542 if (path->slots[0] == 0) 2571 if (path->slots[0] == 0)
@@ -2554,7 +2583,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2554 btrfs_release_path(log, path); 2583 btrfs_release_path(log, path);
2555 } 2584 }
2556 btrfs_release_path(log, path); 2585 btrfs_release_path(log, path);
2557 return 0; 2586 return ret;
2558} 2587}
2559 2588
2560static noinline int copy_items(struct btrfs_trans_handle *trans, 2589static noinline int copy_items(struct btrfs_trans_handle *trans,
@@ -2587,7 +2616,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2587 } 2616 }
2588 ret = btrfs_insert_empty_items(trans, log, dst_path, 2617 ret = btrfs_insert_empty_items(trans, log, dst_path,
2589 ins_keys, ins_sizes, nr); 2618 ins_keys, ins_sizes, nr);
2590 BUG_ON(ret); 2619 if (ret) {
2620 kfree(ins_data);
2621 return ret;
2622 }
2591 2623
2592 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 2624 for (i = 0; i < nr; i++, dst_path->slots[0]++) {
2593 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2625 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
@@ -2660,16 +2692,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2660 * we have to do this after the loop above to avoid changing the 2692 * we have to do this after the loop above to avoid changing the
2661 * log tree while trying to change the log tree. 2693 * log tree while trying to change the log tree.
2662 */ 2694 */
2695 ret = 0;
2663 while (!list_empty(&ordered_sums)) { 2696 while (!list_empty(&ordered_sums)) {
2664 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2697 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2665 struct btrfs_ordered_sum, 2698 struct btrfs_ordered_sum,
2666 list); 2699 list);
2667 ret = btrfs_csum_file_blocks(trans, log, sums); 2700 if (!ret)
2668 BUG_ON(ret); 2701 ret = btrfs_csum_file_blocks(trans, log, sums);
2669 list_del(&sums->list); 2702 list_del(&sums->list);
2670 kfree(sums); 2703 kfree(sums);
2671 } 2704 }
2672 return 0; 2705 return ret;
2673} 2706}
2674 2707
2675/* log a single inode in the tree log. 2708/* log a single inode in the tree log.
@@ -2697,6 +2730,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2697 struct btrfs_root *log = root->log_root; 2730 struct btrfs_root *log = root->log_root;
2698 struct extent_buffer *src = NULL; 2731 struct extent_buffer *src = NULL;
2699 u32 size; 2732 u32 size;
2733 int err = 0;
2700 int ret; 2734 int ret;
2701 int nritems; 2735 int nritems;
2702 int ins_start_slot = 0; 2736 int ins_start_slot = 0;
@@ -2739,7 +2773,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2739 } else { 2773 } else {
2740 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2774 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2741 } 2775 }
2742 BUG_ON(ret); 2776 if (ret) {
2777 err = ret;
2778 goto out_unlock;
2779 }
2743 path->keep_locks = 1; 2780 path->keep_locks = 1;
2744 2781
2745 while (1) { 2782 while (1) {
@@ -2768,7 +2805,10 @@ again:
2768 2805
2769 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2806 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2770 ins_nr, inode_only); 2807 ins_nr, inode_only);
2771 BUG_ON(ret); 2808 if (ret) {
2809 err = ret;
2810 goto out_unlock;
2811 }
2772 ins_nr = 1; 2812 ins_nr = 1;
2773 ins_start_slot = path->slots[0]; 2813 ins_start_slot = path->slots[0];
2774next_slot: 2814next_slot:
@@ -2784,7 +2824,10 @@ next_slot:
2784 ret = copy_items(trans, log, dst_path, src, 2824 ret = copy_items(trans, log, dst_path, src,
2785 ins_start_slot, 2825 ins_start_slot,
2786 ins_nr, inode_only); 2826 ins_nr, inode_only);
2787 BUG_ON(ret); 2827 if (ret) {
2828 err = ret;
2829 goto out_unlock;
2830 }
2788 ins_nr = 0; 2831 ins_nr = 0;
2789 } 2832 }
2790 btrfs_release_path(root, path); 2833 btrfs_release_path(root, path);
@@ -2802,7 +2845,10 @@ next_slot:
2802 ret = copy_items(trans, log, dst_path, src, 2845 ret = copy_items(trans, log, dst_path, src,
2803 ins_start_slot, 2846 ins_start_slot,
2804 ins_nr, inode_only); 2847 ins_nr, inode_only);
2805 BUG_ON(ret); 2848 if (ret) {
2849 err = ret;
2850 goto out_unlock;
2851 }
2806 ins_nr = 0; 2852 ins_nr = 0;
2807 } 2853 }
2808 WARN_ON(ins_nr); 2854 WARN_ON(ins_nr);
@@ -2810,14 +2856,18 @@ next_slot:
2810 btrfs_release_path(root, path); 2856 btrfs_release_path(root, path);
2811 btrfs_release_path(log, dst_path); 2857 btrfs_release_path(log, dst_path);
2812 ret = log_directory_changes(trans, root, inode, path, dst_path); 2858 ret = log_directory_changes(trans, root, inode, path, dst_path);
2813 BUG_ON(ret); 2859 if (ret) {
2860 err = ret;
2861 goto out_unlock;
2862 }
2814 } 2863 }
2815 BTRFS_I(inode)->logged_trans = trans->transid; 2864 BTRFS_I(inode)->logged_trans = trans->transid;
2865out_unlock:
2816 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2866 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2817 2867
2818 btrfs_free_path(path); 2868 btrfs_free_path(path);
2819 btrfs_free_path(dst_path); 2869 btrfs_free_path(dst_path);
2820 return 0; 2870 return err;
2821} 2871}
2822 2872
2823/* 2873/*
@@ -2942,10 +2992,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2942 goto end_no_trans; 2992 goto end_no_trans;
2943 } 2993 }
2944 2994
2945 start_log_trans(trans, root); 2995 ret = start_log_trans(trans, root);
2996 if (ret)
2997 goto end_trans;
2946 2998
2947 ret = btrfs_log_inode(trans, root, inode, inode_only); 2999 ret = btrfs_log_inode(trans, root, inode, inode_only);
2948 BUG_ON(ret); 3000 if (ret)
3001 goto end_trans;
2949 3002
2950 /* 3003 /*
2951 * for regular files, if its inode is already on disk, we don't 3004 * for regular files, if its inode is already on disk, we don't
@@ -2955,8 +3008,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2955 */ 3008 */
2956 if (S_ISREG(inode->i_mode) && 3009 if (S_ISREG(inode->i_mode) &&
2957 BTRFS_I(inode)->generation <= last_committed && 3010 BTRFS_I(inode)->generation <= last_committed &&
2958 BTRFS_I(inode)->last_unlink_trans <= last_committed) 3011 BTRFS_I(inode)->last_unlink_trans <= last_committed) {
2959 goto no_parent; 3012 ret = 0;
3013 goto end_trans;
3014 }
2960 3015
2961 inode_only = LOG_INODE_EXISTS; 3016 inode_only = LOG_INODE_EXISTS;
2962 while (1) { 3017 while (1) {
@@ -2970,15 +3025,21 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2970 if (BTRFS_I(inode)->generation > 3025 if (BTRFS_I(inode)->generation >
2971 root->fs_info->last_trans_committed) { 3026 root->fs_info->last_trans_committed) {
2972 ret = btrfs_log_inode(trans, root, inode, inode_only); 3027 ret = btrfs_log_inode(trans, root, inode, inode_only);
2973 BUG_ON(ret); 3028 if (ret)
3029 goto end_trans;
2974 } 3030 }
2975 if (IS_ROOT(parent)) 3031 if (IS_ROOT(parent))
2976 break; 3032 break;
2977 3033
2978 parent = parent->d_parent; 3034 parent = parent->d_parent;
2979 } 3035 }
2980no_parent:
2981 ret = 0; 3036 ret = 0;
3037end_trans:
3038 if (ret < 0) {
3039 BUG_ON(ret != -ENOSPC);
3040 root->fs_info->last_trans_log_full_commit = trans->transid;
3041 ret = 1;
3042 }
2982 btrfs_end_log_trans(root); 3043 btrfs_end_log_trans(root);
2983end_no_trans: 3044end_no_trans:
2984 return ret; 3045 return ret;
@@ -3020,7 +3081,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3020 path = btrfs_alloc_path(); 3081 path = btrfs_alloc_path();
3021 BUG_ON(!path); 3082 BUG_ON(!path);
3022 3083
3023 trans = btrfs_start_transaction(fs_info->tree_root, 1); 3084 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3024 3085
3025 wc.trans = trans; 3086 wc.trans = trans;
3026 wc.pin = 1; 3087 wc.pin = 1;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 0776eacb5083..3dfae84c8cc8 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -25,6 +25,8 @@
25int btrfs_sync_log(struct btrfs_trans_handle *trans, 25int btrfs_sync_log(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root); 26 struct btrfs_root *root);
27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
28int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
29 struct btrfs_fs_info *fs_info);
28int btrfs_recover_log_trees(struct btrfs_root *tree_root); 30int btrfs_recover_log_trees(struct btrfs_root *tree_root);
29int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 31int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
30 struct btrfs_root *root, struct dentry *dentry); 32 struct btrfs_root *root, struct dentry *dentry);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8db7b14bbae8..d6e3af8be95b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1097,7 +1097,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1097 if (!path) 1097 if (!path)
1098 return -ENOMEM; 1098 return -ENOMEM;
1099 1099
1100 trans = btrfs_start_transaction(root, 1); 1100 trans = btrfs_start_transaction(root, 0);
1101 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1101 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1102 key.type = BTRFS_DEV_ITEM_KEY; 1102 key.type = BTRFS_DEV_ITEM_KEY;
1103 key.offset = device->devid; 1103 key.offset = device->devid;
@@ -1486,7 +1486,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1486 goto error; 1486 goto error;
1487 } 1487 }
1488 1488
1489 trans = btrfs_start_transaction(root, 1); 1489 trans = btrfs_start_transaction(root, 0);
1490 lock_chunks(root); 1490 lock_chunks(root);
1491 1491
1492 device->barriers = 1; 1492 device->barriers = 1;
@@ -1751,9 +1751,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1751 1751
1752 /* step one, relocate all the extents inside this chunk */ 1752 /* step one, relocate all the extents inside this chunk */
1753 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1753 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1754 BUG_ON(ret); 1754 if (ret)
1755 return ret;
1755 1756
1756 trans = btrfs_start_transaction(root, 1); 1757 trans = btrfs_start_transaction(root, 0);
1757 BUG_ON(!trans); 1758 BUG_ON(!trans);
1758 1759
1759 lock_chunks(root); 1760 lock_chunks(root);
@@ -1925,7 +1926,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1925 break; 1926 break;
1926 BUG_ON(ret); 1927 BUG_ON(ret);
1927 1928
1928 trans = btrfs_start_transaction(dev_root, 1); 1929 trans = btrfs_start_transaction(dev_root, 0);
1929 BUG_ON(!trans); 1930 BUG_ON(!trans);
1930 1931
1931 ret = btrfs_grow_device(trans, device, old_size); 1932 ret = btrfs_grow_device(trans, device, old_size);
@@ -2094,11 +2095,7 @@ again:
2094 } 2095 }
2095 2096
2096 /* Shrinking succeeded, else we would be at "done". */ 2097 /* Shrinking succeeded, else we would be at "done". */
2097 trans = btrfs_start_transaction(root, 1); 2098 trans = btrfs_start_transaction(root, 0);
2098 if (!trans) {
2099 ret = -ENOMEM;
2100 goto done;
2101 }
2102 lock_chunks(root); 2099 lock_chunks(root);
2103 2100
2104 device->disk_total_bytes = new_size; 2101 device->disk_total_bytes = new_size;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 59acd3eb288a..88ecbb215878 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -154,15 +154,10 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
154 if (trans) 154 if (trans)
155 return do_setxattr(trans, inode, name, value, size, flags); 155 return do_setxattr(trans, inode, name, value, size, flags);
156 156
157 ret = btrfs_reserve_metadata_space(root, 2); 157 trans = btrfs_start_transaction(root, 2);
158 if (ret) 158 if (IS_ERR(trans))
159 return ret; 159 return PTR_ERR(trans);
160 160
161 trans = btrfs_start_transaction(root, 1);
162 if (!trans) {
163 ret = -ENOMEM;
164 goto out;
165 }
166 btrfs_set_trans_block_group(trans, inode); 161 btrfs_set_trans_block_group(trans, inode);
167 162
168 ret = do_setxattr(trans, inode, name, value, size, flags); 163 ret = do_setxattr(trans, inode, name, value, size, flags);
@@ -174,7 +169,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
174 BUG_ON(ret); 169 BUG_ON(ret);
175out: 170out:
176 btrfs_end_transaction_throttle(trans, root); 171 btrfs_end_transaction_throttle(trans, root);
177 btrfs_unreserve_metadata_space(root, 2);
178 return ret; 172 return ret;
179} 173}
180 174
diff --git a/fs/buffer.c b/fs/buffer.c
index e8aa7081d25c..d54812b198e9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1949,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
1949} 1949}
1950 1950
1951/* 1951/*
1952 * block_write_begin takes care of the basic task of block allocation and 1952 * Filesystems implementing the new truncate sequence should use the
1953 * bringing partial write blocks uptodate first. 1953 * _newtrunc postfix variant which won't incorrectly call vmtruncate.
1954 * 1954 * The filesystem needs to handle block truncation upon failure.
1955 * If *pagep is not NULL, then block_write_begin uses the locked page
1956 * at *pagep rather than allocating its own. In this case, the page will
1957 * not be unlocked or deallocated on failure.
1958 */ 1955 */
1959int block_write_begin(struct file *file, struct address_space *mapping, 1956int block_write_begin_newtrunc(struct file *file, struct address_space *mapping,
1960 loff_t pos, unsigned len, unsigned flags, 1957 loff_t pos, unsigned len, unsigned flags,
1961 struct page **pagep, void **fsdata, 1958 struct page **pagep, void **fsdata,
1962 get_block_t *get_block) 1959 get_block_t *get_block)
@@ -1992,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping,
1992 unlock_page(page); 1989 unlock_page(page);
1993 page_cache_release(page); 1990 page_cache_release(page);
1994 *pagep = NULL; 1991 *pagep = NULL;
1995
1996 /*
1997 * prepare_write() may have instantiated a few blocks
1998 * outside i_size. Trim these off again. Don't need
1999 * i_size_read because we hold i_mutex.
2000 */
2001 if (pos + len > inode->i_size)
2002 vmtruncate(inode, inode->i_size);
2003 } 1992 }
2004 } 1993 }
2005 1994
2006out: 1995out:
2007 return status; 1996 return status;
2008} 1997}
1998EXPORT_SYMBOL(block_write_begin_newtrunc);
1999
2000/*
2001 * block_write_begin takes care of the basic task of block allocation and
2002 * bringing partial write blocks uptodate first.
2003 *
2004 * If *pagep is not NULL, then block_write_begin uses the locked page
2005 * at *pagep rather than allocating its own. In this case, the page will
2006 * not be unlocked or deallocated on failure.
2007 */
2008int block_write_begin(struct file *file, struct address_space *mapping,
2009 loff_t pos, unsigned len, unsigned flags,
2010 struct page **pagep, void **fsdata,
2011 get_block_t *get_block)
2012{
2013 int ret;
2014
2015 ret = block_write_begin_newtrunc(file, mapping, pos, len, flags,
2016 pagep, fsdata, get_block);
2017
2018 /*
2019 * prepare_write() may have instantiated a few blocks
2020 * outside i_size. Trim these off again. Don't need
2021 * i_size_read because we hold i_mutex.
2022 *
2023 * Filesystems which pass down their own page also cannot
2024 * call into vmtruncate here because it would lead to lock
2025 * inversion problems (*pagep is locked). This is a further
2026 * example of where the old truncate sequence is inadequate.
2027 */
2028 if (unlikely(ret) && *pagep == NULL) {
2029 loff_t isize = mapping->host->i_size;
2030 if (pos + len > isize)
2031 vmtruncate(mapping->host, isize);
2032 }
2033
2034 return ret;
2035}
2009EXPORT_SYMBOL(block_write_begin); 2036EXPORT_SYMBOL(block_write_begin);
2010 2037
2011int block_write_end(struct file *file, struct address_space *mapping, 2038int block_write_end(struct file *file, struct address_space *mapping,
@@ -2324,7 +2351,7 @@ out:
2324 * For moronic filesystems that do not allow holes in file. 2351 * For moronic filesystems that do not allow holes in file.
2325 * We may have to extend the file. 2352 * We may have to extend the file.
2326 */ 2353 */
2327int cont_write_begin(struct file *file, struct address_space *mapping, 2354int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping,
2328 loff_t pos, unsigned len, unsigned flags, 2355 loff_t pos, unsigned len, unsigned flags,
2329 struct page **pagep, void **fsdata, 2356 struct page **pagep, void **fsdata,
2330 get_block_t *get_block, loff_t *bytes) 2357 get_block_t *get_block, loff_t *bytes)
@@ -2345,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
2345 } 2372 }
2346 2373
2347 *pagep = NULL; 2374 *pagep = NULL;
2348 err = block_write_begin(file, mapping, pos, len, 2375 err = block_write_begin_newtrunc(file, mapping, pos, len,
2349 flags, pagep, fsdata, get_block); 2376 flags, pagep, fsdata, get_block);
2350out: 2377out:
2351 return err; 2378 return err;
2352} 2379}
2380EXPORT_SYMBOL(cont_write_begin_newtrunc);
2381
2382int cont_write_begin(struct file *file, struct address_space *mapping,
2383 loff_t pos, unsigned len, unsigned flags,
2384 struct page **pagep, void **fsdata,
2385 get_block_t *get_block, loff_t *bytes)
2386{
2387 int ret;
2388
2389 ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
2390 pagep, fsdata, get_block, bytes);
2391 if (unlikely(ret)) {
2392 loff_t isize = mapping->host->i_size;
2393 if (pos + len > isize)
2394 vmtruncate(mapping->host, isize);
2395 }
2396
2397 return ret;
2398}
2353EXPORT_SYMBOL(cont_write_begin); 2399EXPORT_SYMBOL(cont_write_begin);
2354 2400
2355int block_prepare_write(struct page *page, unsigned from, unsigned to, 2401int block_prepare_write(struct page *page, unsigned from, unsigned to,
@@ -2381,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write);
2381 * 2427 *
2382 * We are not allowed to take the i_mutex here so we have to play games to 2428 * We are not allowed to take the i_mutex here so we have to play games to
2383 * protect against truncate races as the page could now be beyond EOF. Because 2429 * protect against truncate races as the page could now be beyond EOF. Because
2384 * vmtruncate() writes the inode size before removing pages, once we have the 2430 * truncate writes the inode size before removing pages, once we have the
2385 * page lock we can determine safely if the page is beyond EOF. If it is not 2431 * page lock we can determine safely if the page is beyond EOF. If it is not
2386 * beyond EOF, then the page is guaranteed safe against truncation until we 2432 * beyond EOF, then the page is guaranteed safe against truncation until we
2387 * unlock the page. 2433 * unlock the page.
@@ -2464,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2464} 2510}
2465 2511
2466/* 2512/*
2467 * On entry, the page is fully not uptodate. 2513 * Filesystems implementing the new truncate sequence should use the
2468 * On exit the page is fully uptodate in the areas outside (from,to) 2514 * _newtrunc postfix variant which won't incorrectly call vmtruncate.
2515 * The filesystem needs to handle block truncation upon failure.
2469 */ 2516 */
2470int nobh_write_begin(struct file *file, struct address_space *mapping, 2517int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping,
2471 loff_t pos, unsigned len, unsigned flags, 2518 loff_t pos, unsigned len, unsigned flags,
2472 struct page **pagep, void **fsdata, 2519 struct page **pagep, void **fsdata,
2473 get_block_t *get_block) 2520 get_block_t *get_block)
@@ -2500,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
2500 unlock_page(page); 2547 unlock_page(page);
2501 page_cache_release(page); 2548 page_cache_release(page);
2502 *pagep = NULL; 2549 *pagep = NULL;
2503 return block_write_begin(file, mapping, pos, len, flags, pagep, 2550 return block_write_begin_newtrunc(file, mapping, pos, len,
2504 fsdata, get_block); 2551 flags, pagep, fsdata, get_block);
2505 } 2552 }
2506 2553
2507 if (PageMappedToDisk(page)) 2554 if (PageMappedToDisk(page))
@@ -2605,8 +2652,34 @@ out_release:
2605 page_cache_release(page); 2652 page_cache_release(page);
2606 *pagep = NULL; 2653 *pagep = NULL;
2607 2654
2608 if (pos + len > inode->i_size) 2655 return ret;
2609 vmtruncate(inode, inode->i_size); 2656}
2657EXPORT_SYMBOL(nobh_write_begin_newtrunc);
2658
2659/*
2660 * On entry, the page is fully not uptodate.
2661 * On exit the page is fully uptodate in the areas outside (from,to)
2662 */
2663int nobh_write_begin(struct file *file, struct address_space *mapping,
2664 loff_t pos, unsigned len, unsigned flags,
2665 struct page **pagep, void **fsdata,
2666 get_block_t *get_block)
2667{
2668 int ret;
2669
2670 ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags,
2671 pagep, fsdata, get_block);
2672
2673 /*
2674 * prepare_write() may have instantiated a few blocks
2675 * outside i_size. Trim these off again. Don't need
2676 * i_size_read because we hold i_mutex.
2677 */
2678 if (unlikely(ret)) {
2679 loff_t isize = mapping->host->i_size;
2680 if (pos + len > isize)
2681 vmtruncate(mapping->host, isize);
2682 }
2610 2683
2611 return ret; 2684 return ret;
2612} 2685}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a9005d862ed4..d9c60b84949a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -274,7 +274,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
275 int rc = 0; 275 int rc = 0;
276 struct page **pages; 276 struct page **pages;
277 struct pagevec pvec;
278 loff_t offset; 277 loff_t offset;
279 u64 len; 278 u64 len;
280 279
@@ -297,8 +296,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
297 if (rc < 0) 296 if (rc < 0)
298 goto out; 297 goto out;
299 298
300 /* set uptodate and add to lru in pagevec-sized chunks */
301 pagevec_init(&pvec, 0);
302 for (; !list_empty(page_list) && len > 0; 299 for (; !list_empty(page_list) && len > 0;
303 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { 300 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
304 struct page *page = 301 struct page *page =
@@ -312,7 +309,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
312 zero_user_segment(page, s, PAGE_CACHE_SIZE); 309 zero_user_segment(page, s, PAGE_CACHE_SIZE);
313 } 310 }
314 311
315 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) { 312 if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) {
316 page_cache_release(page); 313 page_cache_release(page);
317 dout("readpages %p add_to_page_cache failed %p\n", 314 dout("readpages %p add_to_page_cache failed %p\n",
318 inode, page); 315 inode, page);
@@ -323,10 +320,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
323 flush_dcache_page(page); 320 flush_dcache_page(page);
324 SetPageUptodate(page); 321 SetPageUptodate(page);
325 unlock_page(page); 322 unlock_page(page);
326 if (pagevec_add(&pvec, page) == 0) 323 page_cache_release(page);
327 pagevec_lru_add_file(&pvec); /* add to lru */
328 } 324 }
329 pagevec_lru_add_file(&pvec);
330 rc = 0; 325 rc = 0;
331 326
332out: 327out:
@@ -568,7 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req,
568 ceph_release_pages(req->r_pages, req->r_num_pages); 563 ceph_release_pages(req->r_pages, req->r_num_pages);
569 if (req->r_pages_from_pool) 564 if (req->r_pages_from_pool)
570 mempool_free(req->r_pages, 565 mempool_free(req->r_pages,
571 ceph_client(inode->i_sb)->wb_pagevec_pool); 566 ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
572 else 567 else
573 kfree(req->r_pages); 568 kfree(req->r_pages);
574 ceph_osdc_put_request(req); 569 ceph_osdc_put_request(req);
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index 818afe72e6c7..89490beaf537 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -1,7 +1,6 @@
1#include "ceph_debug.h" 1#include "ceph_debug.h"
2 2
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/slab.h>
5#include <linux/err.h> 4#include <linux/err.h>
6#include <linux/slab.h> 5#include <linux/slab.h>
7 6
@@ -150,7 +149,8 @@ int ceph_build_auth_request(struct ceph_auth_client *ac,
150 149
151 ret = ac->ops->build_request(ac, p + sizeof(u32), end); 150 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
152 if (ret < 0) { 151 if (ret < 0) {
153 pr_err("error %d building request\n", ret); 152 pr_err("error %d building auth method %s request\n", ret,
153 ac->ops->name);
154 return ret; 154 return ret;
155 } 155 }
156 dout(" built request %d bytes\n", ret); 156 dout(" built request %d bytes\n", ret);
@@ -229,7 +229,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
229 if (ret == -EAGAIN) { 229 if (ret == -EAGAIN) {
230 return ceph_build_auth_request(ac, reply_buf, reply_len); 230 return ceph_build_auth_request(ac, reply_buf, reply_len);
231 } else if (ret) { 231 } else if (ret) {
232 pr_err("authentication error %d\n", ret); 232 pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
233 return ret; 233 return ret;
234 } 234 }
235 return 0; 235 return 0;
@@ -246,7 +246,7 @@ int ceph_build_auth(struct ceph_auth_client *ac,
246 if (!ac->protocol) 246 if (!ac->protocol)
247 return ceph_auth_build_hello(ac, msg_buf, msg_len); 247 return ceph_auth_build_hello(ac, msg_buf, msg_len);
248 BUG_ON(!ac->ops); 248 BUG_ON(!ac->ops);
249 if (!ac->ops->is_authenticated(ac)) 249 if (ac->ops->should_authenticate(ac))
250 return ceph_build_auth_request(ac, msg_buf, msg_len); 250 return ceph_build_auth_request(ac, msg_buf, msg_len);
251 return 0; 251 return 0;
252} 252}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
index ca4f57cfb267..d38a2fb4a137 100644
--- a/fs/ceph/auth.h
+++ b/fs/ceph/auth.h
@@ -15,6 +15,8 @@ struct ceph_auth_client;
15struct ceph_authorizer; 15struct ceph_authorizer;
16 16
17struct ceph_auth_client_ops { 17struct ceph_auth_client_ops {
18 const char *name;
19
18 /* 20 /*
19 * true if we are authenticated and can connect to 21 * true if we are authenticated and can connect to
20 * services. 22 * services.
@@ -22,6 +24,12 @@ struct ceph_auth_client_ops {
22 int (*is_authenticated)(struct ceph_auth_client *ac); 24 int (*is_authenticated)(struct ceph_auth_client *ac);
23 25
24 /* 26 /*
27 * true if we should (re)authenticate, e.g., when our tickets
28 * are getting old and crusty.
29 */
30 int (*should_authenticate)(struct ceph_auth_client *ac);
31
32 /*
25 * build requests and process replies during monitor 33 * build requests and process replies during monitor
26 * handshake. if handle_reply returns -EAGAIN, we build 34 * handshake. if handle_reply returns -EAGAIN, we build
27 * another request. 35 * another request.
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
index 8cd9e3af07f7..ad1dc21286c7 100644
--- a/fs/ceph/auth_none.c
+++ b/fs/ceph/auth_none.c
@@ -31,6 +31,13 @@ static int is_authenticated(struct ceph_auth_client *ac)
31 return !xi->starting; 31 return !xi->starting;
32} 32}
33 33
34static int should_authenticate(struct ceph_auth_client *ac)
35{
36 struct ceph_auth_none_info *xi = ac->private;
37
38 return xi->starting;
39}
40
34/* 41/*
35 * the generic auth code decode the global_id, and we carry no actual 42 * the generic auth code decode the global_id, and we carry no actual
36 * authenticate state, so nothing happens here. 43 * authenticate state, so nothing happens here.
@@ -94,9 +101,11 @@ static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
94} 101}
95 102
96static const struct ceph_auth_client_ops ceph_auth_none_ops = { 103static const struct ceph_auth_client_ops ceph_auth_none_ops = {
104 .name = "none",
97 .reset = reset, 105 .reset = reset,
98 .destroy = destroy, 106 .destroy = destroy,
99 .is_authenticated = is_authenticated, 107 .is_authenticated = is_authenticated,
108 .should_authenticate = should_authenticate,
100 .handle_reply = handle_reply, 109 .handle_reply = handle_reply,
101 .create_authorizer = ceph_auth_none_create_authorizer, 110 .create_authorizer = ceph_auth_none_create_authorizer,
102 .destroy_authorizer = ceph_auth_none_destroy_authorizer, 111 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index fee5a08da881..83d4d2785ffe 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -27,6 +27,17 @@ static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
27 return (ac->want_keys & xi->have_keys) == ac->want_keys; 27 return (ac->want_keys & xi->have_keys) == ac->want_keys;
28} 28}
29 29
30static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
31{
32 struct ceph_x_info *xi = ac->private;
33 int need;
34
35 ceph_x_validate_tickets(ac, &need);
36 dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
37 ac->want_keys, need, xi->have_keys);
38 return need != 0;
39}
40
30static int ceph_x_encrypt_buflen(int ilen) 41static int ceph_x_encrypt_buflen(int ilen)
31{ 42{
32 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 + 43 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
@@ -127,7 +138,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
127 int ret; 138 int ret;
128 char *dbuf; 139 char *dbuf;
129 char *ticket_buf; 140 char *ticket_buf;
130 u8 struct_v; 141 u8 reply_struct_v;
131 142
132 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); 143 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
133 if (!dbuf) 144 if (!dbuf)
@@ -139,14 +150,14 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
139 goto out_dbuf; 150 goto out_dbuf;
140 151
141 ceph_decode_need(&p, end, 1 + sizeof(u32), bad); 152 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
142 struct_v = ceph_decode_8(&p); 153 reply_struct_v = ceph_decode_8(&p);
143 if (struct_v != 1) 154 if (reply_struct_v != 1)
144 goto bad; 155 goto bad;
145 num = ceph_decode_32(&p); 156 num = ceph_decode_32(&p);
146 dout("%d tickets\n", num); 157 dout("%d tickets\n", num);
147 while (num--) { 158 while (num--) {
148 int type; 159 int type;
149 u8 struct_v; 160 u8 tkt_struct_v, blob_struct_v;
150 struct ceph_x_ticket_handler *th; 161 struct ceph_x_ticket_handler *th;
151 void *dp, *dend; 162 void *dp, *dend;
152 int dlen; 163 int dlen;
@@ -165,8 +176,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
165 type = ceph_decode_32(&p); 176 type = ceph_decode_32(&p);
166 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); 177 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
167 178
168 struct_v = ceph_decode_8(&p); 179 tkt_struct_v = ceph_decode_8(&p);
169 if (struct_v != 1) 180 if (tkt_struct_v != 1)
170 goto bad; 181 goto bad;
171 182
172 th = get_ticket_handler(ac, type); 183 th = get_ticket_handler(ac, type);
@@ -186,8 +197,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
186 dend = dbuf + dlen; 197 dend = dbuf + dlen;
187 dp = dbuf; 198 dp = dbuf;
188 199
189 struct_v = ceph_decode_8(&dp); 200 tkt_struct_v = ceph_decode_8(&dp);
190 if (struct_v != 1) 201 if (tkt_struct_v != 1)
191 goto bad; 202 goto bad;
192 203
193 memcpy(&old_key, &th->session_key, sizeof(old_key)); 204 memcpy(&old_key, &th->session_key, sizeof(old_key));
@@ -224,7 +235,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
224 tpend = tp + dlen; 235 tpend = tp + dlen;
225 dout(" ticket blob is %d bytes\n", dlen); 236 dout(" ticket blob is %d bytes\n", dlen);
226 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); 237 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
227 struct_v = ceph_decode_8(&tp); 238 blob_struct_v = ceph_decode_8(&tp);
228 new_secret_id = ceph_decode_64(&tp); 239 new_secret_id = ceph_decode_64(&tp);
229 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); 240 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
230 if (ret) 241 if (ret)
@@ -618,7 +629,9 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
618 629
619 630
620static const struct ceph_auth_client_ops ceph_x_ops = { 631static const struct ceph_auth_client_ops ceph_x_ops = {
632 .name = "x",
621 .is_authenticated = ceph_x_is_authenticated, 633 .is_authenticated = ceph_x_is_authenticated,
634 .should_authenticate = ceph_x_should_authenticate,
622 .build_request = ceph_x_build_request, 635 .build_request = ceph_x_build_request,
623 .handle_reply = ceph_x_handle_reply, 636 .handle_reply = ceph_x_handle_reply,
624 .create_authorizer = ceph_x_create_authorizer, 637 .create_authorizer = ceph_x_create_authorizer,
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d9400534b279..ae3e3a306445 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -867,7 +867,8 @@ void __ceph_remove_cap(struct ceph_cap *cap)
867{ 867{
868 struct ceph_mds_session *session = cap->session; 868 struct ceph_mds_session *session = cap->session;
869 struct ceph_inode_info *ci = cap->ci; 869 struct ceph_inode_info *ci = cap->ci;
870 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; 870 struct ceph_mds_client *mdsc =
871 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
871 int removed = 0; 872 int removed = 0;
872 873
873 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); 874 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -937,9 +938,9 @@ static int send_cap_msg(struct ceph_mds_session *session,
937 seq, issue_seq, mseq, follows, size, max_size, 938 seq, issue_seq, mseq, follows, size, max_size,
938 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); 939 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
939 940
940 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL); 941 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
941 if (IS_ERR(msg)) 942 if (!msg)
942 return PTR_ERR(msg); 943 return -ENOMEM;
943 944
944 msg->hdr.tid = cpu_to_le64(flush_tid); 945 msg->hdr.tid = cpu_to_le64(flush_tid);
945 946
@@ -1298,7 +1299,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
1298 */ 1299 */
1299void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) 1300void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1300{ 1301{
1301 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; 1302 struct ceph_mds_client *mdsc =
1303 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
1302 struct inode *inode = &ci->vfs_inode; 1304 struct inode *inode = &ci->vfs_inode;
1303 int was = ci->i_dirty_caps; 1305 int was = ci->i_dirty_caps;
1304 int dirty = 0; 1306 int dirty = 0;
@@ -1336,7 +1338,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1336static int __mark_caps_flushing(struct inode *inode, 1338static int __mark_caps_flushing(struct inode *inode,
1337 struct ceph_mds_session *session) 1339 struct ceph_mds_session *session)
1338{ 1340{
1339 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1341 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
1340 struct ceph_inode_info *ci = ceph_inode(inode); 1342 struct ceph_inode_info *ci = ceph_inode(inode);
1341 int flushing; 1343 int flushing;
1342 1344
@@ -1663,7 +1665,7 @@ ack:
1663static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, 1665static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1664 unsigned *flush_tid) 1666 unsigned *flush_tid)
1665{ 1667{
1666 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1668 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
1667 struct ceph_inode_info *ci = ceph_inode(inode); 1669 struct ceph_inode_info *ci = ceph_inode(inode);
1668 int unlock_session = session ? 0 : 1; 1670 int unlock_session = session ? 0 : 1;
1669 int flushing = 0; 1671 int flushing = 0;
@@ -1716,10 +1718,9 @@ out_unlocked:
1716static int caps_are_flushed(struct inode *inode, unsigned tid) 1718static int caps_are_flushed(struct inode *inode, unsigned tid)
1717{ 1719{
1718 struct ceph_inode_info *ci = ceph_inode(inode); 1720 struct ceph_inode_info *ci = ceph_inode(inode);
1719 int dirty, i, ret = 1; 1721 int i, ret = 1;
1720 1722
1721 spin_lock(&inode->i_lock); 1723 spin_lock(&inode->i_lock);
1722 dirty = __ceph_caps_dirty(ci);
1723 for (i = 0; i < CEPH_CAP_BITS; i++) 1724 for (i = 0; i < CEPH_CAP_BITS; i++)
1724 if ((ci->i_flushing_caps & (1 << i)) && 1725 if ((ci->i_flushing_caps & (1 << i)) &&
1725 ci->i_cap_flush_tid[i] <= tid) { 1726 ci->i_cap_flush_tid[i] <= tid) {
@@ -1775,9 +1776,9 @@ out:
1775 spin_unlock(&ci->i_unsafe_lock); 1776 spin_unlock(&ci->i_unsafe_lock);
1776} 1777}
1777 1778
1778int ceph_fsync(struct file *file, struct dentry *dentry, int datasync) 1779int ceph_fsync(struct file *file, int datasync)
1779{ 1780{
1780 struct inode *inode = dentry->d_inode; 1781 struct inode *inode = file->f_mapping->host;
1781 struct ceph_inode_info *ci = ceph_inode(inode); 1782 struct ceph_inode_info *ci = ceph_inode(inode);
1782 unsigned flush_tid; 1783 unsigned flush_tid;
1783 int ret; 1784 int ret;
@@ -1829,7 +1830,8 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1829 err = wait_event_interruptible(ci->i_cap_wq, 1830 err = wait_event_interruptible(ci->i_cap_wq,
1830 caps_are_flushed(inode, flush_tid)); 1831 caps_are_flushed(inode, flush_tid));
1831 } else { 1832 } else {
1832 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1833 struct ceph_mds_client *mdsc =
1834 &ceph_sb_to_client(inode->i_sb)->mdsc;
1833 1835
1834 spin_lock(&inode->i_lock); 1836 spin_lock(&inode->i_lock);
1835 if (__ceph_caps_dirty(ci)) 1837 if (__ceph_caps_dirty(ci))
@@ -2411,7 +2413,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2411 __releases(inode->i_lock) 2413 __releases(inode->i_lock)
2412{ 2414{
2413 struct ceph_inode_info *ci = ceph_inode(inode); 2415 struct ceph_inode_info *ci = ceph_inode(inode);
2414 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 2416 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
2415 unsigned seq = le32_to_cpu(m->seq); 2417 unsigned seq = le32_to_cpu(m->seq);
2416 int dirty = le32_to_cpu(m->dirty); 2418 int dirty = le32_to_cpu(m->dirty);
2417 int cleaned = 0; 2419 int cleaned = 0;
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 0c2241ef3653..2fa992eaf7da 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -19,7 +19,7 @@
19 * Ceph release version 19 * Ceph release version
20 */ 20 */
21#define CEPH_VERSION_MAJOR 0 21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 19 22#define CEPH_VERSION_MINOR 20
23#define CEPH_VERSION_PATCH 0 23#define CEPH_VERSION_PATCH 0
24 24
25#define _CEPH_STRINGIFY(x) #x 25#define _CEPH_STRINGIFY(x) #x
@@ -36,7 +36,7 @@
36 * client-facing protocol. 36 * client-facing protocol.
37 */ 37 */
38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */ 38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 9 /* cluster internal */ 39#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
40#define CEPH_MON_PROTOCOL 5 /* cluster internal */ 40#define CEPH_MON_PROTOCOL 5 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 24 /* server/client */ 41#define CEPH_OSDC_PROTOCOL 24 /* server/client */
42#define CEPH_MDSC_PROTOCOL 32 /* server/client */ 42#define CEPH_MDSC_PROTOCOL 32 /* server/client */
@@ -53,8 +53,18 @@
53/* 53/*
54 * feature bits 54 * feature bits
55 */ 55 */
56#define CEPH_FEATURE_SUPPORTED 0 56#define CEPH_FEATURE_UID 1
57#define CEPH_FEATURE_REQUIRED 0 57#define CEPH_FEATURE_NOSRCADDR 2
58#define CEPH_FEATURE_FLOCK 4
59
60#define CEPH_FEATURE_SUPPORTED_MON CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
61#define CEPH_FEATURE_REQUIRED_MON CEPH_FEATURE_UID
62#define CEPH_FEATURE_SUPPORTED_MDS CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK
63#define CEPH_FEATURE_REQUIRED_MDS CEPH_FEATURE_UID
64#define CEPH_FEATURE_SUPPORTED_OSD CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
65#define CEPH_FEATURE_REQUIRED_OSD CEPH_FEATURE_UID
66#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR
67#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR
58 68
59 69
60/* 70/*
@@ -91,6 +101,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
91#define CEPH_AUTH_NONE 0x1 101#define CEPH_AUTH_NONE 0x1
92#define CEPH_AUTH_CEPHX 0x2 102#define CEPH_AUTH_CEPHX 0x2
93 103
104#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
105
94 106
95/********************************************* 107/*********************************************
96 * message layer 108 * message layer
@@ -128,11 +140,27 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
128#define CEPH_MSG_CLIENT_SNAP 0x312 140#define CEPH_MSG_CLIENT_SNAP 0x312
129#define CEPH_MSG_CLIENT_CAPRELEASE 0x313 141#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
130 142
143/* pool ops */
144#define CEPH_MSG_POOLOP_REPLY 48
145#define CEPH_MSG_POOLOP 49
146
147
131/* osd */ 148/* osd */
132#define CEPH_MSG_OSD_MAP 41 149#define CEPH_MSG_OSD_MAP 41
133#define CEPH_MSG_OSD_OP 42 150#define CEPH_MSG_OSD_OP 42
134#define CEPH_MSG_OSD_OPREPLY 43 151#define CEPH_MSG_OSD_OPREPLY 43
135 152
153/* pool operations */
154enum {
155 POOL_OP_CREATE = 0x01,
156 POOL_OP_DELETE = 0x02,
157 POOL_OP_AUID_CHANGE = 0x03,
158 POOL_OP_CREATE_SNAP = 0x11,
159 POOL_OP_DELETE_SNAP = 0x12,
160 POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
161 POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
162};
163
136struct ceph_mon_request_header { 164struct ceph_mon_request_header {
137 __le64 have_version; 165 __le64 have_version;
138 __le16 session_mon; 166 __le16 session_mon;
@@ -155,6 +183,31 @@ struct ceph_mon_statfs_reply {
155 struct ceph_statfs st; 183 struct ceph_statfs st;
156} __attribute__ ((packed)); 184} __attribute__ ((packed));
157 185
186const char *ceph_pool_op_name(int op);
187
188struct ceph_mon_poolop {
189 struct ceph_mon_request_header monhdr;
190 struct ceph_fsid fsid;
191 __le32 pool;
192 __le32 op;
193 __le64 auid;
194 __le64 snapid;
195 __le32 name_len;
196} __attribute__ ((packed));
197
198struct ceph_mon_poolop_reply {
199 struct ceph_mon_request_header monhdr;
200 struct ceph_fsid fsid;
201 __le32 reply_code;
202 __le32 epoch;
203 char has_data;
204 char data[0];
205} __attribute__ ((packed));
206
207struct ceph_mon_unmanaged_snap {
208 __le64 snapid;
209} __attribute__ ((packed));
210
158struct ceph_osd_getmap { 211struct ceph_osd_getmap {
159 struct ceph_mon_request_header monhdr; 212 struct ceph_mon_request_header monhdr;
160 struct ceph_fsid fsid; 213 struct ceph_fsid fsid;
@@ -212,16 +265,17 @@ extern const char *ceph_mds_state_name(int s);
212 * - they also define the lock ordering by the MDS 265 * - they also define the lock ordering by the MDS
213 * - a few of these are internal to the mds 266 * - a few of these are internal to the mds
214 */ 267 */
215#define CEPH_LOCK_DN 1 268#define CEPH_LOCK_DVERSION 1
216#define CEPH_LOCK_ISNAP 2 269#define CEPH_LOCK_DN 2
217#define CEPH_LOCK_IVERSION 4 /* mds internal */ 270#define CEPH_LOCK_ISNAP 16
218#define CEPH_LOCK_IFILE 8 /* mds internal */ 271#define CEPH_LOCK_IVERSION 32 /* mds internal */
219#define CEPH_LOCK_IAUTH 32 272#define CEPH_LOCK_IFILE 64
220#define CEPH_LOCK_ILINK 64 273#define CEPH_LOCK_IAUTH 128
221#define CEPH_LOCK_IDFT 128 /* dir frag tree */ 274#define CEPH_LOCK_ILINK 256
222#define CEPH_LOCK_INEST 256 /* mds internal */ 275#define CEPH_LOCK_IDFT 512 /* dir frag tree */
223#define CEPH_LOCK_IXATTR 512 276#define CEPH_LOCK_INEST 1024 /* mds internal */
224#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */ 277#define CEPH_LOCK_IXATTR 2048
278#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
225 279
226/* client_session ops */ 280/* client_session ops */
227enum { 281enum {
@@ -308,6 +362,7 @@ union ceph_mds_request_args {
308 struct { 362 struct {
309 __le32 frag; /* which dir fragment */ 363 __le32 frag; /* which dir fragment */
310 __le32 max_entries; /* how many dentries to grab */ 364 __le32 max_entries; /* how many dentries to grab */
365 __le32 max_bytes;
311 } __attribute__ ((packed)) readdir; 366 } __attribute__ ((packed)) readdir;
312 struct { 367 struct {
313 __le32 mode; 368 __le32 mode;
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
index 8e4be6a80c62..7503aee828ce 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/ceph_strings.c
@@ -10,7 +10,6 @@ const char *ceph_entity_type_name(int type)
10 case CEPH_ENTITY_TYPE_OSD: return "osd"; 10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon"; 11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client"; 12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth"; 13 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown"; 14 default: return "unknown";
16 } 15 }
@@ -45,6 +44,7 @@ const char *ceph_osd_op_name(int op)
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; 44 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; 45 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr"; 46 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
47 case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
48 48
49 case CEPH_OSD_OP_PULL: return "pull"; 49 case CEPH_OSD_OP_PULL: return "pull";
50 case CEPH_OSD_OP_PUSH: return "push"; 50 case CEPH_OSD_OP_PUSH: return "push";
@@ -174,3 +174,17 @@ const char *ceph_snap_op_name(int o)
174 } 174 }
175 return "???"; 175 return "???";
176} 176}
177
178const char *ceph_pool_op_name(int op)
179{
180 switch (op) {
181 case POOL_OP_CREATE: return "create";
182 case POOL_OP_DELETE: return "delete";
183 case POOL_OP_AUID_CHANGE: return "auid change";
184 case POOL_OP_CREATE_SNAP: return "create snap";
185 case POOL_OP_DELETE_SNAP: return "delete snap";
186 case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
187 case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
188 }
189 return "???";
190}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index f7048da92acc..3be33fb066cc 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -113,7 +113,7 @@ static int osdmap_show(struct seq_file *s, void *p)
113static int monc_show(struct seq_file *s, void *p) 113static int monc_show(struct seq_file *s, void *p)
114{ 114{
115 struct ceph_client *client = s->private; 115 struct ceph_client *client = s->private;
116 struct ceph_mon_statfs_request *req; 116 struct ceph_mon_generic_request *req;
117 struct ceph_mon_client *monc = &client->monc; 117 struct ceph_mon_client *monc = &client->monc;
118 struct rb_node *rp; 118 struct rb_node *rp;
119 119
@@ -126,9 +126,14 @@ static int monc_show(struct seq_file *s, void *p)
126 if (monc->want_next_osdmap) 126 if (monc->want_next_osdmap)
127 seq_printf(s, "want next osdmap\n"); 127 seq_printf(s, "want next osdmap\n");
128 128
129 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) { 129 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
130 req = rb_entry(rp, struct ceph_mon_statfs_request, node); 130 __u16 op;
131 seq_printf(s, "%lld statfs\n", req->tid); 131 req = rb_entry(rp, struct ceph_mon_generic_request, node);
132 op = le16_to_cpu(req->request->hdr.type);
133 if (op == CEPH_MSG_STATFS)
134 seq_printf(s, "%lld statfs\n", req->tid);
135 else
136 seq_printf(s, "%lld unknown\n", req->tid);
132 } 137 }
133 138
134 mutex_unlock(&monc->mutex); 139 mutex_unlock(&monc->mutex);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 650d2db5ed26..f85719310db2 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -51,8 +51,11 @@ int ceph_init_dentry(struct dentry *dentry)
51 return -ENOMEM; /* oh well */ 51 return -ENOMEM; /* oh well */
52 52
53 spin_lock(&dentry->d_lock); 53 spin_lock(&dentry->d_lock);
54 if (dentry->d_fsdata) /* lost a race */ 54 if (dentry->d_fsdata) {
55 /* lost a race */
56 kmem_cache_free(ceph_dentry_cachep, di);
55 goto out_unlock; 57 goto out_unlock;
58 }
56 di->dentry = dentry; 59 di->dentry = dentry;
57 di->lease_session = NULL; 60 di->lease_session = NULL;
58 dentry->d_fsdata = di; 61 dentry->d_fsdata = di;
@@ -125,7 +128,8 @@ more:
125 dentry = list_entry(p, struct dentry, d_u.d_child); 128 dentry = list_entry(p, struct dentry, d_u.d_child);
126 di = ceph_dentry(dentry); 129 di = ceph_dentry(dentry);
127 while (1) { 130 while (1) {
128 dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next, 131 dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
132 d_unhashed(dentry) ? "!hashed" : "hashed",
129 parent->d_subdirs.prev, parent->d_subdirs.next); 133 parent->d_subdirs.prev, parent->d_subdirs.next);
130 if (p == &parent->d_subdirs) { 134 if (p == &parent->d_subdirs) {
131 fi->at_end = 1; 135 fi->at_end = 1;
@@ -229,6 +233,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
229 u32 ftype; 233 u32 ftype;
230 struct ceph_mds_reply_info_parsed *rinfo; 234 struct ceph_mds_reply_info_parsed *rinfo;
231 const int max_entries = client->mount_args->max_readdir; 235 const int max_entries = client->mount_args->max_readdir;
236 const int max_bytes = client->mount_args->max_readdir_bytes;
232 237
233 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); 238 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
234 if (fi->at_end) 239 if (fi->at_end)
@@ -312,6 +317,7 @@ more:
312 req->r_readdir_offset = fi->next_offset; 317 req->r_readdir_offset = fi->next_offset;
313 req->r_args.readdir.frag = cpu_to_le32(frag); 318 req->r_args.readdir.frag = cpu_to_le32(frag);
314 req->r_args.readdir.max_entries = cpu_to_le32(max_entries); 319 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
320 req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
315 req->r_num_caps = max_entries + 1; 321 req->r_num_caps = max_entries + 1;
316 err = ceph_mdsc_do_request(mdsc, NULL, req); 322 err = ceph_mdsc_do_request(mdsc, NULL, req);
317 if (err < 0) { 323 if (err < 0) {
@@ -335,7 +341,7 @@ more:
335 if (req->r_reply_info.dir_end) { 341 if (req->r_reply_info.dir_end) {
336 kfree(fi->last_name); 342 kfree(fi->last_name);
337 fi->last_name = NULL; 343 fi->last_name = NULL;
338 fi->next_offset = 0; 344 fi->next_offset = 2;
339 } else { 345 } else {
340 rinfo = &req->r_reply_info; 346 rinfo = &req->r_reply_info;
341 err = note_last_dentry(fi, 347 err = note_last_dentry(fi,
@@ -478,7 +484,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
478struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, 484struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
479 struct dentry *dentry, int err) 485 struct dentry *dentry, int err)
480{ 486{
481 struct ceph_client *client = ceph_client(dentry->d_sb); 487 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
482 struct inode *parent = dentry->d_parent->d_inode; 488 struct inode *parent = dentry->d_parent->d_inode;
483 489
484 /* .snap dir? */ 490 /* .snap dir? */
@@ -568,7 +574,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
568 !is_root_ceph_dentry(dir, dentry) && 574 !is_root_ceph_dentry(dir, dentry) &&
569 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 575 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
570 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { 576 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
571 di->offset = ci->i_max_offset++;
572 spin_unlock(&dir->i_lock); 577 spin_unlock(&dir->i_lock);
573 dout(" dir %p complete, -ENOENT\n", dir); 578 dout(" dir %p complete, -ENOENT\n", dir);
574 d_add(dentry, NULL); 579 d_add(dentry, NULL);
@@ -582,7 +587,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
582 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; 587 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
583 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); 588 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
584 if (IS_ERR(req)) 589 if (IS_ERR(req))
585 return ERR_PTR(PTR_ERR(req)); 590 return ERR_CAST(req);
586 req->r_dentry = dget(dentry); 591 req->r_dentry = dget(dentry);
587 req->r_num_caps = 2; 592 req->r_num_caps = 2;
588 /* we only need inode linkage */ 593 /* we only need inode linkage */
@@ -888,13 +893,22 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
888 893
889 /* ensure target dentry is invalidated, despite 894 /* ensure target dentry is invalidated, despite
890 rehashing bug in vfs_rename_dir */ 895 rehashing bug in vfs_rename_dir */
891 new_dentry->d_time = jiffies; 896 ceph_invalidate_dentry_lease(new_dentry);
892 ceph_dentry(new_dentry)->lease_shared_gen = 0;
893 } 897 }
894 ceph_mdsc_put_request(req); 898 ceph_mdsc_put_request(req);
895 return err; 899 return err;
896} 900}
897 901
902/*
903 * Ensure a dentry lease will no longer revalidate.
904 */
905void ceph_invalidate_dentry_lease(struct dentry *dentry)
906{
907 spin_lock(&dentry->d_lock);
908 dentry->d_time = jiffies;
909 ceph_dentry(dentry)->lease_shared_gen = 0;
910 spin_unlock(&dentry->d_lock);
911}
898 912
899/* 913/*
900 * Check if dentry lease is valid. If not, delete the lease. Try to 914 * Check if dentry lease is valid. If not, delete the lease. Try to
@@ -972,8 +986,9 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
972{ 986{
973 struct inode *dir = dentry->d_parent->d_inode; 987 struct inode *dir = dentry->d_parent->d_inode;
974 988
975 dout("d_revalidate %p '%.*s' inode %p\n", dentry, 989 dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
976 dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 990 dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
991 ceph_dentry(dentry)->offset);
977 992
978 /* always trust cached snapped dentries, snapdir dentry */ 993 /* always trust cached snapped dentries, snapdir dentry */
979 if (ceph_snap(dir) != CEPH_NOSNAP) { 994 if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -1050,7 +1065,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1050 struct ceph_inode_info *ci = ceph_inode(inode); 1065 struct ceph_inode_info *ci = ceph_inode(inode);
1051 int left; 1066 int left;
1052 1067
1053 if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT)) 1068 if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
1054 return -EISDIR; 1069 return -EISDIR;
1055 1070
1056 if (!cf->dir_info) { 1071 if (!cf->dir_info) {
@@ -1092,10 +1107,9 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1092 * an fsync() on a dir will wait for any uncommitted directory 1107 * an fsync() on a dir will wait for any uncommitted directory
1093 * operations to commit. 1108 * operations to commit.
1094 */ 1109 */
1095static int ceph_dir_fsync(struct file *file, struct dentry *dentry, 1110static int ceph_dir_fsync(struct file *file, int datasync)
1096 int datasync)
1097{ 1111{
1098 struct inode *inode = dentry->d_inode; 1112 struct inode *inode = file->f_path.dentry->d_inode;
1099 struct ceph_inode_info *ci = ceph_inode(inode); 1113 struct ceph_inode_info *ci = ceph_inode(inode);
1100 struct list_head *head = &ci->i_unsafe_dirops; 1114 struct list_head *head = &ci->i_unsafe_dirops;
1101 struct ceph_mds_request *req; 1115 struct ceph_mds_request *req;
@@ -1152,7 +1166,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
1152 dout("dentry_lru_add %p %p '%.*s'\n", di, dn, 1166 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1153 dn->d_name.len, dn->d_name.name); 1167 dn->d_name.len, dn->d_name.name);
1154 if (di) { 1168 if (di) {
1155 mdsc = &ceph_client(dn->d_sb)->mdsc; 1169 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1156 spin_lock(&mdsc->dentry_lru_lock); 1170 spin_lock(&mdsc->dentry_lru_lock);
1157 list_add_tail(&di->lru, &mdsc->dentry_lru); 1171 list_add_tail(&di->lru, &mdsc->dentry_lru);
1158 mdsc->num_dentry++; 1172 mdsc->num_dentry++;
@@ -1165,10 +1179,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
1165 struct ceph_dentry_info *di = ceph_dentry(dn); 1179 struct ceph_dentry_info *di = ceph_dentry(dn);
1166 struct ceph_mds_client *mdsc; 1180 struct ceph_mds_client *mdsc;
1167 1181
1168 dout("dentry_lru_touch %p %p '%.*s'\n", di, dn, 1182 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
1169 dn->d_name.len, dn->d_name.name); 1183 dn->d_name.len, dn->d_name.name, di->offset);
1170 if (di) { 1184 if (di) {
1171 mdsc = &ceph_client(dn->d_sb)->mdsc; 1185 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1172 spin_lock(&mdsc->dentry_lru_lock); 1186 spin_lock(&mdsc->dentry_lru_lock);
1173 list_move_tail(&di->lru, &mdsc->dentry_lru); 1187 list_move_tail(&di->lru, &mdsc->dentry_lru);
1174 spin_unlock(&mdsc->dentry_lru_lock); 1188 spin_unlock(&mdsc->dentry_lru_lock);
@@ -1183,7 +1197,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
1183 dout("dentry_lru_del %p %p '%.*s'\n", di, dn, 1197 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1184 dn->d_name.len, dn->d_name.name); 1198 dn->d_name.len, dn->d_name.name);
1185 if (di) { 1199 if (di) {
1186 mdsc = &ceph_client(dn->d_sb)->mdsc; 1200 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1187 spin_lock(&mdsc->dentry_lru_lock); 1201 spin_lock(&mdsc->dentry_lru_lock);
1188 list_del_init(&di->lru); 1202 list_del_init(&di->lru);
1189 mdsc->num_dentry--; 1203 mdsc->num_dentry--;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9d67572fb328..4480cb1c63e7 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -93,11 +93,11 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
93 return ERR_PTR(-ESTALE); 93 return ERR_PTR(-ESTALE);
94 94
95 dentry = d_obtain_alias(inode); 95 dentry = d_obtain_alias(inode);
96 if (!dentry) { 96 if (IS_ERR(dentry)) {
97 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", 97 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
98 fh->ino, inode); 98 fh->ino, inode);
99 iput(inode); 99 iput(inode);
100 return ERR_PTR(-ENOMEM); 100 return dentry;
101 } 101 }
102 err = ceph_init_dentry(dentry); 102 err = ceph_init_dentry(dentry);
103 103
@@ -115,7 +115,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
115static struct dentry *__cfh_to_dentry(struct super_block *sb, 115static struct dentry *__cfh_to_dentry(struct super_block *sb,
116 struct ceph_nfs_confh *cfh) 116 struct ceph_nfs_confh *cfh)
117{ 117{
118 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc; 118 struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
119 struct inode *inode; 119 struct inode *inode;
120 struct dentry *dentry; 120 struct dentry *dentry;
121 struct ceph_vino vino; 121 struct ceph_vino vino;
@@ -133,7 +133,7 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH, 133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
134 USE_ANY_MDS); 134 USE_ANY_MDS);
135 if (IS_ERR(req)) 135 if (IS_ERR(req))
136 return ERR_PTR(PTR_ERR(req)); 136 return ERR_CAST(req);
137 137
138 req->r_ino1 = vino; 138 req->r_ino1 = vino;
139 req->r_ino2.ino = cfh->parent_ino; 139 req->r_ino2.ino = cfh->parent_ino;
@@ -149,11 +149,11 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
149 } 149 }
150 150
151 dentry = d_obtain_alias(inode); 151 dentry = d_obtain_alias(inode);
152 if (!dentry) { 152 if (IS_ERR(dentry)) {
153 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", 153 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
154 cfh->ino, inode); 154 cfh->ino, inode);
155 iput(inode); 155 iput(inode);
156 return ERR_PTR(-ENOMEM); 156 return dentry;
157 } 157 }
158 err = ceph_init_dentry(dentry); 158 err = ceph_init_dentry(dentry);
159 if (err < 0) { 159 if (err < 0) {
@@ -202,11 +202,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
202 return ERR_PTR(-ESTALE); 202 return ERR_PTR(-ESTALE);
203 203
204 dentry = d_obtain_alias(inode); 204 dentry = d_obtain_alias(inode);
205 if (!dentry) { 205 if (IS_ERR(dentry)) {
206 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", 206 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
207 cfh->ino, inode); 207 cfh->ino, inode);
208 iput(inode); 208 iput(inode);
209 return ERR_PTR(-ENOMEM); 209 return dentry;
210 } 210 }
211 err = ceph_init_dentry(dentry); 211 err = ceph_init_dentry(dentry);
212 if (err < 0) { 212 if (err < 0) {
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 7d634938edc9..6251a1574b94 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -230,7 +230,7 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
230 /* do the open */ 230 /* do the open */
231 req = prepare_open_request(dir->i_sb, flags, mode); 231 req = prepare_open_request(dir->i_sb, flags, mode);
232 if (IS_ERR(req)) 232 if (IS_ERR(req))
233 return ERR_PTR(PTR_ERR(req)); 233 return ERR_CAST(req);
234 req->r_dentry = dget(dentry); 234 req->r_dentry = dget(dentry);
235 req->r_num_caps = 2; 235 req->r_num_caps = 2;
236 if (flags & O_CREAT) { 236 if (flags & O_CREAT) {
@@ -317,16 +317,16 @@ void ceph_release_page_vector(struct page **pages, int num_pages)
317/* 317/*
318 * allocate a vector new pages 318 * allocate a vector new pages
319 */ 319 */
320static struct page **alloc_page_vector(int num_pages) 320struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
321{ 321{
322 struct page **pages; 322 struct page **pages;
323 int i; 323 int i;
324 324
325 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); 325 pages = kmalloc(sizeof(*pages) * num_pages, flags);
326 if (!pages) 326 if (!pages)
327 return ERR_PTR(-ENOMEM); 327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) { 328 for (i = 0; i < num_pages; i++) {
329 pages[i] = alloc_page(GFP_NOFS); 329 pages[i] = __page_cache_alloc(flags);
330 if (pages[i] == NULL) { 330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i); 331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM); 332 return ERR_PTR(-ENOMEM);
@@ -540,7 +540,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
540 * in sequence. 540 * in sequence.
541 */ 541 */
542 } else { 542 } else {
543 pages = alloc_page_vector(num_pages); 543 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
544 } 544 }
545 if (IS_ERR(pages)) 545 if (IS_ERR(pages))
546 return PTR_ERR(pages); 546 return PTR_ERR(pages);
@@ -649,8 +649,8 @@ more:
649 do_sync, 649 do_sync,
650 ci->i_truncate_seq, ci->i_truncate_size, 650 ci->i_truncate_seq, ci->i_truncate_size,
651 &mtime, false, 2); 651 &mtime, false, 2);
652 if (IS_ERR(req)) 652 if (!req)
653 return PTR_ERR(req); 653 return -ENOMEM;
654 654
655 num_pages = calc_pages_for(pos, len); 655 num_pages = calc_pages_for(pos, len);
656 656
@@ -668,7 +668,7 @@ more:
668 truncate_inode_pages_range(inode->i_mapping, pos, 668 truncate_inode_pages_range(inode->i_mapping, pos,
669 (pos+len) | (PAGE_CACHE_SIZE-1)); 669 (pos+len) | (PAGE_CACHE_SIZE-1));
670 } else { 670 } else {
671 pages = alloc_page_vector(num_pages); 671 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
672 if (IS_ERR(pages)) { 672 if (IS_ERR(pages)) {
673 ret = PTR_ERR(pages); 673 ret = PTR_ERR(pages);
674 goto out; 674 goto out;
@@ -809,7 +809,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
809 struct file *file = iocb->ki_filp; 809 struct file *file = iocb->ki_filp;
810 struct inode *inode = file->f_dentry->d_inode; 810 struct inode *inode = file->f_dentry->d_inode;
811 struct ceph_inode_info *ci = ceph_inode(inode); 811 struct ceph_inode_info *ci = ceph_inode(inode);
812 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; 812 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
813 loff_t endoff = pos + iov->iov_len; 813 loff_t endoff = pos + iov->iov_len;
814 int got = 0; 814 int got = 0;
815 int ret, err; 815 int ret, err;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 85b4d2ffdeba..226f5a50d362 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -69,7 +69,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
69 69
70 BUG_ON(!S_ISDIR(parent->i_mode)); 70 BUG_ON(!S_ISDIR(parent->i_mode));
71 if (IS_ERR(inode)) 71 if (IS_ERR(inode))
72 return ERR_PTR(PTR_ERR(inode)); 72 return inode;
73 inode->i_mode = parent->i_mode; 73 inode->i_mode = parent->i_mode;
74 inode->i_uid = parent->i_uid; 74 inode->i_uid = parent->i_uid;
75 inode->i_gid = parent->i_gid; 75 inode->i_gid = parent->i_gid;
@@ -384,7 +384,7 @@ void ceph_destroy_inode(struct inode *inode)
384 */ 384 */
385 if (ci->i_snap_realm) { 385 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc = 386 struct ceph_mds_client *mdsc =
387 &ceph_client(ci->vfs_inode.i_sb)->mdsc; 387 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm; 388 struct ceph_snap_realm *realm = ci->i_snap_realm;
389 389
390 dout(" dropping residual ref to snap realm %p\n", realm); 390 dout(" dropping residual ref to snap realm %p\n", realm);
@@ -619,11 +619,12 @@ static int fill_inode(struct inode *inode,
619 memcpy(ci->i_xattrs.blob->vec.iov_base, 619 memcpy(ci->i_xattrs.blob->vec.iov_base,
620 iinfo->xattr_data, iinfo->xattr_len); 620 iinfo->xattr_data, iinfo->xattr_len);
621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version); 621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
622 xattr_blob = NULL;
622 } 623 }
623 624
624 inode->i_mapping->a_ops = &ceph_aops; 625 inode->i_mapping->a_ops = &ceph_aops;
625 inode->i_mapping->backing_dev_info = 626 inode->i_mapping->backing_dev_info =
626 &ceph_client(inode->i_sb)->backing_dev_info; 627 &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
627 628
628 switch (inode->i_mode & S_IFMT) { 629 switch (inode->i_mode & S_IFMT) {
629 case S_IFIFO: 630 case S_IFIFO:
@@ -674,14 +675,15 @@ static int fill_inode(struct inode *inode,
674 /* set dir completion flag? */ 675 /* set dir completion flag? */
675 if (ci->i_files == 0 && ci->i_subdirs == 0 && 676 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
676 ceph_snap(inode) == CEPH_NOSNAP && 677 ceph_snap(inode) == CEPH_NOSNAP &&
677 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) { 678 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
679 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
678 dout(" marking %p complete (empty)\n", inode); 680 dout(" marking %p complete (empty)\n", inode);
679 ci->i_ceph_flags |= CEPH_I_COMPLETE; 681 ci->i_ceph_flags |= CEPH_I_COMPLETE;
680 ci->i_max_offset = 2; 682 ci->i_max_offset = 2;
681 } 683 }
682 684
683 /* it may be better to set st_size in getattr instead? */ 685 /* it may be better to set st_size in getattr instead? */
684 if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES)) 686 if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
685 inode->i_size = ci->i_rbytes; 687 inode->i_size = ci->i_rbytes;
686 break; 688 break;
687 default: 689 default:
@@ -802,6 +804,37 @@ out_unlock:
802} 804}
803 805
804/* 806/*
807 * Set dentry's directory position based on the current dir's max, and
808 * order it in d_subdirs, so that dcache_readdir behaves.
809 */
810static void ceph_set_dentry_offset(struct dentry *dn)
811{
812 struct dentry *dir = dn->d_parent;
813 struct inode *inode = dn->d_parent->d_inode;
814 struct ceph_dentry_info *di;
815
816 BUG_ON(!inode);
817
818 di = ceph_dentry(dn);
819
820 spin_lock(&inode->i_lock);
821 if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
822 spin_unlock(&inode->i_lock);
823 return;
824 }
825 di->offset = ceph_inode(inode)->i_max_offset++;
826 spin_unlock(&inode->i_lock);
827
828 spin_lock(&dcache_lock);
829 spin_lock(&dn->d_lock);
830 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
831 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
832 dn->d_u.d_child.prev, dn->d_u.d_child.next);
833 spin_unlock(&dn->d_lock);
834 spin_unlock(&dcache_lock);
835}
836
837/*
805 * splice a dentry to an inode. 838 * splice a dentry to an inode.
806 * caller must hold directory i_mutex for this to be safe. 839 * caller must hold directory i_mutex for this to be safe.
807 * 840 *
@@ -814,6 +847,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
814{ 847{
815 struct dentry *realdn; 848 struct dentry *realdn;
816 849
850 BUG_ON(dn->d_inode);
851
817 /* dn must be unhashed */ 852 /* dn must be unhashed */
818 if (!d_unhashed(dn)) 853 if (!d_unhashed(dn))
819 d_drop(dn); 854 d_drop(dn);
@@ -835,44 +870,17 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
835 dn = realdn; 870 dn = realdn;
836 } else { 871 } else {
837 BUG_ON(!ceph_dentry(dn)); 872 BUG_ON(!ceph_dentry(dn));
838
839 dout("dn %p attached to %p ino %llx.%llx\n", 873 dout("dn %p attached to %p ino %llx.%llx\n",
840 dn, dn->d_inode, ceph_vinop(dn->d_inode)); 874 dn, dn->d_inode, ceph_vinop(dn->d_inode));
841 } 875 }
842 if ((!prehash || *prehash) && d_unhashed(dn)) 876 if ((!prehash || *prehash) && d_unhashed(dn))
843 d_rehash(dn); 877 d_rehash(dn);
878 ceph_set_dentry_offset(dn);
844out: 879out:
845 return dn; 880 return dn;
846} 881}
847 882
848/* 883/*
849 * Set dentry's directory position based on the current dir's max, and
850 * order it in d_subdirs, so that dcache_readdir behaves.
851 */
852static void ceph_set_dentry_offset(struct dentry *dn)
853{
854 struct dentry *dir = dn->d_parent;
855 struct inode *inode = dn->d_parent->d_inode;
856 struct ceph_dentry_info *di;
857
858 BUG_ON(!inode);
859
860 di = ceph_dentry(dn);
861
862 spin_lock(&inode->i_lock);
863 di->offset = ceph_inode(inode)->i_max_offset++;
864 spin_unlock(&inode->i_lock);
865
866 spin_lock(&dcache_lock);
867 spin_lock(&dn->d_lock);
868 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
869 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
870 dn->d_u.d_child.prev, dn->d_u.d_child.next);
871 spin_unlock(&dn->d_lock);
872 spin_unlock(&dcache_lock);
873}
874
875/*
876 * Incorporate results into the local cache. This is either just 884 * Incorporate results into the local cache. This is either just
877 * one inode, or a directory, dentry, and possibly linked-to inode (e.g., 885 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
878 * after a lookup). 886 * after a lookup).
@@ -933,14 +941,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
933 941
934 if (!rinfo->head->is_target && !rinfo->head->is_dentry) { 942 if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
935 dout("fill_trace reply is empty!\n"); 943 dout("fill_trace reply is empty!\n");
936 if (rinfo->head->result == 0 && req->r_locked_dir) { 944 if (rinfo->head->result == 0 && req->r_locked_dir)
937 struct ceph_inode_info *ci = 945 ceph_invalidate_dir_request(req);
938 ceph_inode(req->r_locked_dir);
939 dout(" clearing %p complete (empty trace)\n",
940 req->r_locked_dir);
941 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
942 ci->i_release_count++;
943 }
944 return 0; 946 return 0;
945 } 947 }
946 948
@@ -1011,13 +1013,18 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1011 req->r_old_dentry->d_name.len, 1013 req->r_old_dentry->d_name.len,
1012 req->r_old_dentry->d_name.name, 1014 req->r_old_dentry->d_name.name,
1013 dn, dn->d_name.len, dn->d_name.name); 1015 dn, dn->d_name.len, dn->d_name.name);
1016
1014 /* ensure target dentry is invalidated, despite 1017 /* ensure target dentry is invalidated, despite
1015 rehashing bug in vfs_rename_dir */ 1018 rehashing bug in vfs_rename_dir */
1016 dn->d_time = jiffies; 1019 ceph_invalidate_dentry_lease(dn);
1017 ceph_dentry(dn)->lease_shared_gen = 0; 1020
1018 /* take overwritten dentry's readdir offset */ 1021 /* take overwritten dentry's readdir offset */
1022 dout("dn %p gets %p offset %lld (old offset %lld)\n",
1023 req->r_old_dentry, dn, ceph_dentry(dn)->offset,
1024 ceph_dentry(req->r_old_dentry)->offset);
1019 ceph_dentry(req->r_old_dentry)->offset = 1025 ceph_dentry(req->r_old_dentry)->offset =
1020 ceph_dentry(dn)->offset; 1026 ceph_dentry(dn)->offset;
1027
1021 dn = req->r_old_dentry; /* use old_dentry */ 1028 dn = req->r_old_dentry; /* use old_dentry */
1022 in = dn->d_inode; 1029 in = dn->d_inode;
1023 } 1030 }
@@ -1059,7 +1066,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1059 goto done; 1066 goto done;
1060 } 1067 }
1061 req->r_dentry = dn; /* may have spliced */ 1068 req->r_dentry = dn; /* may have spliced */
1062 ceph_set_dentry_offset(dn);
1063 igrab(in); 1069 igrab(in);
1064 } else if (ceph_ino(in) == vino.ino && 1070 } else if (ceph_ino(in) == vino.ino &&
1065 ceph_snap(in) == vino.snap) { 1071 ceph_snap(in) == vino.snap) {
@@ -1102,7 +1108,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1102 err = PTR_ERR(dn); 1108 err = PTR_ERR(dn);
1103 goto done; 1109 goto done;
1104 } 1110 }
1105 ceph_set_dentry_offset(dn);
1106 req->r_dentry = dn; /* may have spliced */ 1111 req->r_dentry = dn; /* may have spliced */
1107 igrab(in); 1112 igrab(in);
1108 rinfo->head->is_dentry = 1; /* fool notrace handlers */ 1113 rinfo->head->is_dentry = 1; /* fool notrace handlers */
@@ -1429,7 +1434,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
1429{ 1434{
1430 struct ceph_inode_info *ci = ceph_inode(inode); 1435 struct ceph_inode_info *ci = ceph_inode(inode);
1431 1436
1432 if (queue_work(ceph_client(inode->i_sb)->trunc_wq, 1437 if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
1433 &ci->i_vmtruncate_work)) { 1438 &ci->i_vmtruncate_work)) {
1434 dout("ceph_queue_vmtruncate %p\n", inode); 1439 dout("ceph_queue_vmtruncate %p\n", inode);
1435 igrab(inode); 1440 igrab(inode);
@@ -1518,7 +1523,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1518 struct inode *parent_inode = dentry->d_parent->d_inode; 1523 struct inode *parent_inode = dentry->d_parent->d_inode;
1519 const unsigned int ia_valid = attr->ia_valid; 1524 const unsigned int ia_valid = attr->ia_valid;
1520 struct ceph_mds_request *req; 1525 struct ceph_mds_request *req;
1521 struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc; 1526 struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
1522 int issued; 1527 int issued;
1523 int release = 0, dirtied = 0; 1528 int release = 0, dirtied = 0;
1524 int mask = 0; 1529 int mask = 0;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 8a5bcae62846..d085f07756b4 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -98,7 +98,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
98 struct ceph_ioctl_dataloc dl; 98 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode; 99 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode); 100 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; 101 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
102 u64 len = 1, olen; 102 u64 len = 1, olen;
103 u64 tmp; 103 u64 tmp;
104 struct ceph_object_layout ol; 104 struct ceph_object_layout ol;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 24561a557e01..b49f12822cbc 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -40,7 +40,7 @@
40static void __wake_requests(struct ceph_mds_client *mdsc, 40static void __wake_requests(struct ceph_mds_client *mdsc,
41 struct list_head *head); 41 struct list_head *head);
42 42
43const static struct ceph_connection_operations mds_con_ops; 43static const struct ceph_connection_operations mds_con_ops;
44 44
45 45
46/* 46/*
@@ -665,10 +665,10 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
665 struct ceph_msg *msg; 665 struct ceph_msg *msg;
666 struct ceph_mds_session_head *h; 666 struct ceph_mds_session_head *h;
667 667
668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL); 668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS);
669 if (IS_ERR(msg)) { 669 if (!msg) {
670 pr_err("create_session_msg ENOMEM creating msg\n"); 670 pr_err("create_session_msg ENOMEM creating msg\n");
671 return ERR_PTR(PTR_ERR(msg)); 671 return NULL;
672 } 672 }
673 h = msg->front.iov_base; 673 h = msg->front.iov_base;
674 h->op = cpu_to_le32(op); 674 h->op = cpu_to_le32(op);
@@ -687,7 +687,6 @@ static int __open_session(struct ceph_mds_client *mdsc,
687 struct ceph_msg *msg; 687 struct ceph_msg *msg;
688 int mstate; 688 int mstate;
689 int mds = session->s_mds; 689 int mds = session->s_mds;
690 int err = 0;
691 690
692 /* wait for mds to go active? */ 691 /* wait for mds to go active? */
693 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); 692 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
@@ -698,13 +697,9 @@ static int __open_session(struct ceph_mds_client *mdsc,
698 697
699 /* send connect message */ 698 /* send connect message */
700 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); 699 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
701 if (IS_ERR(msg)) { 700 if (!msg)
702 err = PTR_ERR(msg); 701 return -ENOMEM;
703 goto out;
704 }
705 ceph_con_send(&session->s_con, msg); 702 ceph_con_send(&session->s_con, msg);
706
707out:
708 return 0; 703 return 0;
709} 704}
710 705
@@ -804,12 +799,49 @@ out:
804} 799}
805 800
806static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, 801static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
807 void *arg) 802 void *arg)
808{ 803{
809 struct ceph_inode_info *ci = ceph_inode(inode); 804 struct ceph_inode_info *ci = ceph_inode(inode);
805 int drop = 0;
806
810 dout("removing cap %p, ci is %p, inode is %p\n", 807 dout("removing cap %p, ci is %p, inode is %p\n",
811 cap, ci, &ci->vfs_inode); 808 cap, ci, &ci->vfs_inode);
812 ceph_remove_cap(cap); 809 spin_lock(&inode->i_lock);
810 __ceph_remove_cap(cap);
811 if (!__ceph_is_any_real_caps(ci)) {
812 struct ceph_mds_client *mdsc =
813 &ceph_sb_to_client(inode->i_sb)->mdsc;
814
815 spin_lock(&mdsc->cap_dirty_lock);
816 if (!list_empty(&ci->i_dirty_item)) {
817 pr_info(" dropping dirty %s state for %p %lld\n",
818 ceph_cap_string(ci->i_dirty_caps),
819 inode, ceph_ino(inode));
820 ci->i_dirty_caps = 0;
821 list_del_init(&ci->i_dirty_item);
822 drop = 1;
823 }
824 if (!list_empty(&ci->i_flushing_item)) {
825 pr_info(" dropping dirty+flushing %s state for %p %lld\n",
826 ceph_cap_string(ci->i_flushing_caps),
827 inode, ceph_ino(inode));
828 ci->i_flushing_caps = 0;
829 list_del_init(&ci->i_flushing_item);
830 mdsc->num_cap_flushing--;
831 drop = 1;
832 }
833 if (drop && ci->i_wrbuffer_ref) {
834 pr_info(" dropping dirty data for %p %lld\n",
835 inode, ceph_ino(inode));
836 ci->i_wrbuffer_ref = 0;
837 ci->i_wrbuffer_ref_head = 0;
838 drop++;
839 }
840 spin_unlock(&mdsc->cap_dirty_lock);
841 }
842 spin_unlock(&inode->i_lock);
843 while (drop--)
844 iput(inode);
813 return 0; 845 return 0;
814} 846}
815 847
@@ -821,6 +853,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
821 dout("remove_session_caps on %p\n", session); 853 dout("remove_session_caps on %p\n", session);
822 iterate_session_caps(session, remove_session_caps_cb, NULL); 854 iterate_session_caps(session, remove_session_caps_cb, NULL);
823 BUG_ON(session->s_nr_caps > 0); 855 BUG_ON(session->s_nr_caps > 0);
856 BUG_ON(!list_empty(&session->s_cap_flushing));
824 cleanup_cap_releases(session); 857 cleanup_cap_releases(session);
825} 858}
826 859
@@ -883,8 +916,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
883 ceph_mds_state_name(state)); 916 ceph_mds_state_name(state));
884 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, 917 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
885 ++session->s_renew_seq); 918 ++session->s_renew_seq);
886 if (IS_ERR(msg)) 919 if (!msg)
887 return PTR_ERR(msg); 920 return -ENOMEM;
888 ceph_con_send(&session->s_con, msg); 921 ceph_con_send(&session->s_con, msg);
889 return 0; 922 return 0;
890} 923}
@@ -931,17 +964,15 @@ static int request_close_session(struct ceph_mds_client *mdsc,
931 struct ceph_mds_session *session) 964 struct ceph_mds_session *session)
932{ 965{
933 struct ceph_msg *msg; 966 struct ceph_msg *msg;
934 int err = 0;
935 967
936 dout("request_close_session mds%d state %s seq %lld\n", 968 dout("request_close_session mds%d state %s seq %lld\n",
937 session->s_mds, session_state_name(session->s_state), 969 session->s_mds, session_state_name(session->s_state),
938 session->s_seq); 970 session->s_seq);
939 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); 971 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
940 if (IS_ERR(msg)) 972 if (!msg)
941 err = PTR_ERR(msg); 973 return -ENOMEM;
942 else 974 ceph_con_send(&session->s_con, msg);
943 ceph_con_send(&session->s_con, msg); 975 return 0;
944 return err;
945} 976}
946 977
947/* 978/*
@@ -1059,7 +1090,7 @@ static int add_cap_releases(struct ceph_mds_client *mdsc,
1059 while (session->s_num_cap_releases < session->s_nr_caps + extra) { 1090 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1060 spin_unlock(&session->s_cap_lock); 1091 spin_unlock(&session->s_cap_lock);
1061 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, 1092 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1062 0, 0, NULL); 1093 GFP_NOFS);
1063 if (!msg) 1094 if (!msg)
1064 goto out_unlocked; 1095 goto out_unlocked;
1065 dout("add_cap_releases %p msg %p now %d\n", session, msg, 1096 dout("add_cap_releases %p msg %p now %d\n", session, msg,
@@ -1151,10 +1182,8 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
1151 struct ceph_msg *msg; 1182 struct ceph_msg *msg;
1152 1183
1153 dout("send_cap_releases mds%d\n", session->s_mds); 1184 dout("send_cap_releases mds%d\n", session->s_mds);
1154 while (1) { 1185 spin_lock(&session->s_cap_lock);
1155 spin_lock(&session->s_cap_lock); 1186 while (!list_empty(&session->s_cap_releases_done)) {
1156 if (list_empty(&session->s_cap_releases_done))
1157 break;
1158 msg = list_first_entry(&session->s_cap_releases_done, 1187 msg = list_first_entry(&session->s_cap_releases_done,
1159 struct ceph_msg, list_head); 1188 struct ceph_msg, list_head);
1160 list_del_init(&msg->list_head); 1189 list_del_init(&msg->list_head);
@@ -1162,10 +1191,49 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
1162 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 1191 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1163 dout("send_cap_releases mds%d %p\n", session->s_mds, msg); 1192 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1164 ceph_con_send(&session->s_con, msg); 1193 ceph_con_send(&session->s_con, msg);
1194 spin_lock(&session->s_cap_lock);
1165 } 1195 }
1166 spin_unlock(&session->s_cap_lock); 1196 spin_unlock(&session->s_cap_lock);
1167} 1197}
1168 1198
1199static void discard_cap_releases(struct ceph_mds_client *mdsc,
1200 struct ceph_mds_session *session)
1201{
1202 struct ceph_msg *msg;
1203 struct ceph_mds_cap_release *head;
1204 unsigned num;
1205
1206 dout("discard_cap_releases mds%d\n", session->s_mds);
1207 spin_lock(&session->s_cap_lock);
1208
1209 /* zero out the in-progress message */
1210 msg = list_first_entry(&session->s_cap_releases,
1211 struct ceph_msg, list_head);
1212 head = msg->front.iov_base;
1213 num = le32_to_cpu(head->num);
1214 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
1215 head->num = cpu_to_le32(0);
1216 session->s_num_cap_releases += num;
1217
1218 /* requeue completed messages */
1219 while (!list_empty(&session->s_cap_releases_done)) {
1220 msg = list_first_entry(&session->s_cap_releases_done,
1221 struct ceph_msg, list_head);
1222 list_del_init(&msg->list_head);
1223
1224 head = msg->front.iov_base;
1225 num = le32_to_cpu(head->num);
1226 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
1227 num);
1228 session->s_num_cap_releases += num;
1229 head->num = cpu_to_le32(0);
1230 msg->front.iov_len = sizeof(*head);
1231 list_add(&msg->list_head, &session->s_cap_releases);
1232 }
1233
1234 spin_unlock(&session->s_cap_lock);
1235}
1236
1169/* 1237/*
1170 * requests 1238 * requests
1171 */ 1239 */
@@ -1181,6 +1249,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1181 if (!req) 1249 if (!req)
1182 return ERR_PTR(-ENOMEM); 1250 return ERR_PTR(-ENOMEM);
1183 1251
1252 mutex_init(&req->r_fill_mutex);
1184 req->r_started = jiffies; 1253 req->r_started = jiffies;
1185 req->r_resend_mds = -1; 1254 req->r_resend_mds = -1;
1186 INIT_LIST_HEAD(&req->r_unsafe_dir_item); 1255 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
@@ -1251,7 +1320,7 @@ retry:
1251 len += 1 + temp->d_name.len; 1320 len += 1 + temp->d_name.len;
1252 temp = temp->d_parent; 1321 temp = temp->d_parent;
1253 if (temp == NULL) { 1322 if (temp == NULL) {
1254 pr_err("build_path_dentry corrupt dentry %p\n", dentry); 1323 pr_err("build_path corrupt dentry %p\n", dentry);
1255 return ERR_PTR(-EINVAL); 1324 return ERR_PTR(-EINVAL);
1256 } 1325 }
1257 } 1326 }
@@ -1267,7 +1336,7 @@ retry:
1267 struct inode *inode = temp->d_inode; 1336 struct inode *inode = temp->d_inode;
1268 1337
1269 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { 1338 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1270 dout("build_path_dentry path+%d: %p SNAPDIR\n", 1339 dout("build_path path+%d: %p SNAPDIR\n",
1271 pos, temp); 1340 pos, temp);
1272 } else if (stop_on_nosnap && inode && 1341 } else if (stop_on_nosnap && inode &&
1273 ceph_snap(inode) == CEPH_NOSNAP) { 1342 ceph_snap(inode) == CEPH_NOSNAP) {
@@ -1278,20 +1347,18 @@ retry:
1278 break; 1347 break;
1279 strncpy(path + pos, temp->d_name.name, 1348 strncpy(path + pos, temp->d_name.name,
1280 temp->d_name.len); 1349 temp->d_name.len);
1281 dout("build_path_dentry path+%d: %p '%.*s'\n",
1282 pos, temp, temp->d_name.len, path + pos);
1283 } 1350 }
1284 if (pos) 1351 if (pos)
1285 path[--pos] = '/'; 1352 path[--pos] = '/';
1286 temp = temp->d_parent; 1353 temp = temp->d_parent;
1287 if (temp == NULL) { 1354 if (temp == NULL) {
1288 pr_err("build_path_dentry corrupt dentry\n"); 1355 pr_err("build_path corrupt dentry\n");
1289 kfree(path); 1356 kfree(path);
1290 return ERR_PTR(-EINVAL); 1357 return ERR_PTR(-EINVAL);
1291 } 1358 }
1292 } 1359 }
1293 if (pos != 0) { 1360 if (pos != 0) {
1294 pr_err("build_path_dentry did not end path lookup where " 1361 pr_err("build_path did not end path lookup where "
1295 "expected, namelen is %d, pos is %d\n", len, pos); 1362 "expected, namelen is %d, pos is %d\n", len, pos);
1296 /* presumably this is only possible if racing with a 1363 /* presumably this is only possible if racing with a
1297 rename of one of the parent directories (we can not 1364 rename of one of the parent directories (we can not
@@ -1303,7 +1370,7 @@ retry:
1303 1370
1304 *base = ceph_ino(temp->d_inode); 1371 *base = ceph_ino(temp->d_inode);
1305 *plen = len; 1372 *plen = len;
1306 dout("build_path_dentry on %p %d built %llx '%.*s'\n", 1373 dout("build_path on %p %d built %llx '%.*s'\n",
1307 dentry, atomic_read(&dentry->d_count), *base, len, path); 1374 dentry, atomic_read(&dentry->d_count), *base, len, path);
1308 return path; 1375 return path;
1309} 1376}
@@ -1426,9 +1493,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1426 if (req->r_old_dentry_drop) 1493 if (req->r_old_dentry_drop)
1427 len += req->r_old_dentry->d_name.len; 1494 len += req->r_old_dentry->d_name.len;
1428 1495
1429 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL); 1496 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
1430 if (IS_ERR(msg)) 1497 if (!msg) {
1498 msg = ERR_PTR(-ENOMEM);
1431 goto out_free2; 1499 goto out_free2;
1500 }
1432 1501
1433 msg->hdr.tid = cpu_to_le64(req->r_tid); 1502 msg->hdr.tid = cpu_to_le64(req->r_tid);
1434 1503
@@ -1517,9 +1586,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1517 } 1586 }
1518 msg = create_request_message(mdsc, req, mds); 1587 msg = create_request_message(mdsc, req, mds);
1519 if (IS_ERR(msg)) { 1588 if (IS_ERR(msg)) {
1520 req->r_reply = ERR_PTR(PTR_ERR(msg)); 1589 req->r_err = PTR_ERR(msg);
1521 complete_request(mdsc, req); 1590 complete_request(mdsc, req);
1522 return -PTR_ERR(msg); 1591 return PTR_ERR(msg);
1523 } 1592 }
1524 req->r_request = msg; 1593 req->r_request = msg;
1525 1594
@@ -1552,7 +1621,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
1552 int mds = -1; 1621 int mds = -1;
1553 int err = -EAGAIN; 1622 int err = -EAGAIN;
1554 1623
1555 if (req->r_reply) 1624 if (req->r_err || req->r_got_result)
1556 goto out; 1625 goto out;
1557 1626
1558 if (req->r_timeout && 1627 if (req->r_timeout &&
@@ -1609,7 +1678,7 @@ out:
1609 return err; 1678 return err;
1610 1679
1611finish: 1680finish:
1612 req->r_reply = ERR_PTR(err); 1681 req->r_err = err;
1613 complete_request(mdsc, req); 1682 complete_request(mdsc, req);
1614 goto out; 1683 goto out;
1615} 1684}
@@ -1630,10 +1699,9 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
1630 1699
1631/* 1700/*
1632 * Wake up threads with requests pending for @mds, so that they can 1701 * Wake up threads with requests pending for @mds, so that they can
1633 * resubmit their requests to a possibly different mds. If @all is set, 1702 * resubmit their requests to a possibly different mds.
1634 * wake up if their requests has been forwarded to @mds, too.
1635 */ 1703 */
1636static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all) 1704static void kick_requests(struct ceph_mds_client *mdsc, int mds)
1637{ 1705{
1638 struct ceph_mds_request *req; 1706 struct ceph_mds_request *req;
1639 struct rb_node *p; 1707 struct rb_node *p;
@@ -1689,64 +1757,78 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1689 __register_request(mdsc, req, dir); 1757 __register_request(mdsc, req, dir);
1690 __do_request(mdsc, req); 1758 __do_request(mdsc, req);
1691 1759
1692 /* wait */ 1760 if (req->r_err) {
1693 if (!req->r_reply) { 1761 err = req->r_err;
1694 mutex_unlock(&mdsc->mutex); 1762 __unregister_request(mdsc, req);
1695 if (req->r_timeout) { 1763 dout("do_request early error %d\n", err);
1696 err = (long)wait_for_completion_interruptible_timeout( 1764 goto out;
1697 &req->r_completion, req->r_timeout);
1698 if (err == 0)
1699 req->r_reply = ERR_PTR(-EIO);
1700 else if (err < 0)
1701 req->r_reply = ERR_PTR(err);
1702 } else {
1703 err = wait_for_completion_interruptible(
1704 &req->r_completion);
1705 if (err)
1706 req->r_reply = ERR_PTR(err);
1707 }
1708 mutex_lock(&mdsc->mutex);
1709 } 1765 }
1710 1766
1711 if (IS_ERR(req->r_reply)) { 1767 /* wait */
1712 err = PTR_ERR(req->r_reply); 1768 mutex_unlock(&mdsc->mutex);
1713 req->r_reply = NULL; 1769 dout("do_request waiting\n");
1770 if (req->r_timeout) {
1771 err = (long)wait_for_completion_killable_timeout(
1772 &req->r_completion, req->r_timeout);
1773 if (err == 0)
1774 err = -EIO;
1775 } else {
1776 err = wait_for_completion_killable(&req->r_completion);
1777 }
1778 dout("do_request waited, got %d\n", err);
1779 mutex_lock(&mdsc->mutex);
1714 1780
1715 if (err == -ERESTARTSYS) { 1781 /* only abort if we didn't race with a real reply */
1716 /* aborted */ 1782 if (req->r_got_result) {
1717 req->r_aborted = true; 1783 err = le32_to_cpu(req->r_reply_info.head->result);
1784 } else if (err < 0) {
1785 dout("aborted request %lld with %d\n", req->r_tid, err);
1718 1786
1719 if (req->r_locked_dir && 1787 /*
1720 (req->r_op & CEPH_MDS_OP_WRITE)) { 1788 * ensure we aren't running concurrently with
1721 struct ceph_inode_info *ci = 1789 * ceph_fill_trace or ceph_readdir_prepopulate, which
1722 ceph_inode(req->r_locked_dir); 1790 * rely on locks (dir mutex) held by our caller.
1791 */
1792 mutex_lock(&req->r_fill_mutex);
1793 req->r_err = err;
1794 req->r_aborted = true;
1795 mutex_unlock(&req->r_fill_mutex);
1723 1796
1724 dout("aborted, clearing I_COMPLETE on %p\n", 1797 if (req->r_locked_dir &&
1725 req->r_locked_dir); 1798 (req->r_op & CEPH_MDS_OP_WRITE))
1726 spin_lock(&req->r_locked_dir->i_lock); 1799 ceph_invalidate_dir_request(req);
1727 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1728 ci->i_release_count++;
1729 spin_unlock(&req->r_locked_dir->i_lock);
1730 }
1731 } else {
1732 /* clean up this request */
1733 __unregister_request(mdsc, req);
1734 if (!list_empty(&req->r_unsafe_item))
1735 list_del_init(&req->r_unsafe_item);
1736 complete(&req->r_safe_completion);
1737 }
1738 } else if (req->r_err) {
1739 err = req->r_err;
1740 } else { 1800 } else {
1741 err = le32_to_cpu(req->r_reply_info.head->result); 1801 err = req->r_err;
1742 } 1802 }
1743 mutex_unlock(&mdsc->mutex);
1744 1803
1804out:
1805 mutex_unlock(&mdsc->mutex);
1745 dout("do_request %p done, result %d\n", req, err); 1806 dout("do_request %p done, result %d\n", req, err);
1746 return err; 1807 return err;
1747} 1808}
1748 1809
1749/* 1810/*
1811 * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS
1812 * namespace request.
1813 */
1814void ceph_invalidate_dir_request(struct ceph_mds_request *req)
1815{
1816 struct inode *inode = req->r_locked_dir;
1817 struct ceph_inode_info *ci = ceph_inode(inode);
1818
1819 dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode);
1820 spin_lock(&inode->i_lock);
1821 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1822 ci->i_release_count++;
1823 spin_unlock(&inode->i_lock);
1824
1825 if (req->r_dentry)
1826 ceph_invalidate_dentry_lease(req->r_dentry);
1827 if (req->r_old_dentry)
1828 ceph_invalidate_dentry_lease(req->r_old_dentry);
1829}
1830
1831/*
1750 * Handle mds reply. 1832 * Handle mds reply.
1751 * 1833 *
1752 * We take the session mutex and parse and process the reply immediately. 1834 * We take the session mutex and parse and process the reply immediately.
@@ -1797,6 +1879,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1797 mutex_unlock(&mdsc->mutex); 1879 mutex_unlock(&mdsc->mutex);
1798 goto out; 1880 goto out;
1799 } 1881 }
1882 if (req->r_got_safe && !head->safe) {
1883 pr_warning("got unsafe after safe on %llu from mds%d\n",
1884 tid, mds);
1885 mutex_unlock(&mdsc->mutex);
1886 goto out;
1887 }
1800 1888
1801 result = le32_to_cpu(head->result); 1889 result = le32_to_cpu(head->result);
1802 1890
@@ -1838,11 +1926,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1838 mutex_unlock(&mdsc->mutex); 1926 mutex_unlock(&mdsc->mutex);
1839 goto out; 1927 goto out;
1840 } 1928 }
1841 } 1929 } else {
1842
1843 BUG_ON(req->r_reply);
1844
1845 if (!head->safe) {
1846 req->r_got_unsafe = true; 1930 req->r_got_unsafe = true;
1847 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); 1931 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1848 } 1932 }
@@ -1871,21 +1955,30 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1871 } 1955 }
1872 1956
1873 /* insert trace into our cache */ 1957 /* insert trace into our cache */
1958 mutex_lock(&req->r_fill_mutex);
1874 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); 1959 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1875 if (err == 0) { 1960 if (err == 0) {
1876 if (result == 0 && rinfo->dir_nr) 1961 if (result == 0 && rinfo->dir_nr)
1877 ceph_readdir_prepopulate(req, req->r_session); 1962 ceph_readdir_prepopulate(req, req->r_session);
1878 ceph_unreserve_caps(&req->r_caps_reservation); 1963 ceph_unreserve_caps(&req->r_caps_reservation);
1879 } 1964 }
1965 mutex_unlock(&req->r_fill_mutex);
1880 1966
1881 up_read(&mdsc->snap_rwsem); 1967 up_read(&mdsc->snap_rwsem);
1882out_err: 1968out_err:
1883 if (err) { 1969 mutex_lock(&mdsc->mutex);
1884 req->r_err = err; 1970 if (!req->r_aborted) {
1971 if (err) {
1972 req->r_err = err;
1973 } else {
1974 req->r_reply = msg;
1975 ceph_msg_get(msg);
1976 req->r_got_result = true;
1977 }
1885 } else { 1978 } else {
1886 req->r_reply = msg; 1979 dout("reply arrived after request %lld was aborted\n", tid);
1887 ceph_msg_get(msg);
1888 } 1980 }
1981 mutex_unlock(&mdsc->mutex);
1889 1982
1890 add_cap_releases(mdsc, req->r_session, -1); 1983 add_cap_releases(mdsc, req->r_session, -1);
1891 mutex_unlock(&session->s_mutex); 1984 mutex_unlock(&session->s_mutex);
@@ -1921,16 +2014,21 @@ static void handle_forward(struct ceph_mds_client *mdsc,
1921 mutex_lock(&mdsc->mutex); 2014 mutex_lock(&mdsc->mutex);
1922 req = __lookup_request(mdsc, tid); 2015 req = __lookup_request(mdsc, tid);
1923 if (!req) { 2016 if (!req) {
1924 dout("forward %llu to mds%d - req dne\n", tid, next_mds); 2017 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
1925 goto out; /* dup reply? */ 2018 goto out; /* dup reply? */
1926 } 2019 }
1927 2020
1928 if (fwd_seq <= req->r_num_fwd) { 2021 if (req->r_aborted) {
1929 dout("forward %llu to mds%d - old seq %d <= %d\n", 2022 dout("forward tid %llu aborted, unregistering\n", tid);
2023 __unregister_request(mdsc, req);
2024 } else if (fwd_seq <= req->r_num_fwd) {
2025 dout("forward tid %llu to mds%d - old seq %d <= %d\n",
1930 tid, next_mds, req->r_num_fwd, fwd_seq); 2026 tid, next_mds, req->r_num_fwd, fwd_seq);
1931 } else { 2027 } else {
1932 /* resend. forward race not possible; mds would drop */ 2028 /* resend. forward race not possible; mds would drop */
1933 dout("forward %llu to mds%d (we resend)\n", tid, next_mds); 2029 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
2030 BUG_ON(req->r_err);
2031 BUG_ON(req->r_got_result);
1934 req->r_num_fwd = fwd_seq; 2032 req->r_num_fwd = fwd_seq;
1935 req->r_resend_mds = next_mds; 2033 req->r_resend_mds = next_mds;
1936 put_request_session(req); 2034 put_request_session(req);
@@ -1984,6 +2082,8 @@ static void handle_session(struct ceph_mds_session *session,
1984 2082
1985 switch (op) { 2083 switch (op) {
1986 case CEPH_SESSION_OPEN: 2084 case CEPH_SESSION_OPEN:
2085 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2086 pr_info("mds%d reconnect success\n", session->s_mds);
1987 session->s_state = CEPH_MDS_SESSION_OPEN; 2087 session->s_state = CEPH_MDS_SESSION_OPEN;
1988 renewed_caps(mdsc, session, 0); 2088 renewed_caps(mdsc, session, 0);
1989 wake = 1; 2089 wake = 1;
@@ -1997,10 +2097,12 @@ static void handle_session(struct ceph_mds_session *session,
1997 break; 2097 break;
1998 2098
1999 case CEPH_SESSION_CLOSE: 2099 case CEPH_SESSION_CLOSE:
2100 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2101 pr_info("mds%d reconnect denied\n", session->s_mds);
2000 remove_session_caps(session); 2102 remove_session_caps(session);
2001 wake = 1; /* for good measure */ 2103 wake = 1; /* for good measure */
2002 complete(&mdsc->session_close_waiters); 2104 complete(&mdsc->session_close_waiters);
2003 kick_requests(mdsc, mds, 0); /* cur only */ 2105 kick_requests(mdsc, mds);
2004 break; 2106 break;
2005 2107
2006 case CEPH_SESSION_STALE: 2108 case CEPH_SESSION_STALE:
@@ -2132,54 +2234,44 @@ out:
2132 * 2234 *
2133 * called with mdsc->mutex held. 2235 * called with mdsc->mutex held.
2134 */ 2236 */
2135static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) 2237static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2238 struct ceph_mds_session *session)
2136{ 2239{
2137 struct ceph_mds_session *session = NULL;
2138 struct ceph_msg *reply; 2240 struct ceph_msg *reply;
2139 struct rb_node *p; 2241 struct rb_node *p;
2242 int mds = session->s_mds;
2140 int err = -ENOMEM; 2243 int err = -ENOMEM;
2141 struct ceph_pagelist *pagelist; 2244 struct ceph_pagelist *pagelist;
2142 2245
2143 pr_info("reconnect to recovering mds%d\n", mds); 2246 pr_info("mds%d reconnect start\n", mds);
2144 2247
2145 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); 2248 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2146 if (!pagelist) 2249 if (!pagelist)
2147 goto fail_nopagelist; 2250 goto fail_nopagelist;
2148 ceph_pagelist_init(pagelist); 2251 ceph_pagelist_init(pagelist);
2149 2252
2150 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL); 2253 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS);
2151 if (IS_ERR(reply)) { 2254 if (!reply)
2152 err = PTR_ERR(reply);
2153 goto fail_nomsg; 2255 goto fail_nomsg;
2154 }
2155
2156 /* find session */
2157 session = __ceph_lookup_mds_session(mdsc, mds);
2158 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2159 2256
2160 if (session) { 2257 mutex_lock(&session->s_mutex);
2161 mutex_lock(&session->s_mutex); 2258 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2259 session->s_seq = 0;
2162 2260
2163 session->s_state = CEPH_MDS_SESSION_RECONNECTING; 2261 ceph_con_open(&session->s_con,
2164 session->s_seq = 0; 2262 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2165 2263
2166 ceph_con_open(&session->s_con, 2264 /* replay unsafe requests */
2167 ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); 2265 replay_unsafe_requests(mdsc, session);
2168
2169 /* replay unsafe requests */
2170 replay_unsafe_requests(mdsc, session);
2171 } else {
2172 dout("no session for mds%d, will send short reconnect\n",
2173 mds);
2174 }
2175 2266
2176 down_read(&mdsc->snap_rwsem); 2267 down_read(&mdsc->snap_rwsem);
2177 2268
2178 if (!session)
2179 goto send;
2180 dout("session %p state %s\n", session, 2269 dout("session %p state %s\n", session,
2181 session_state_name(session->s_state)); 2270 session_state_name(session->s_state));
2182 2271
2272 /* drop old cap expires; we're about to reestablish that state */
2273 discard_cap_releases(mdsc, session);
2274
2183 /* traverse this session's caps */ 2275 /* traverse this session's caps */
2184 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); 2276 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2185 if (err) 2277 if (err)
@@ -2208,36 +2300,29 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2208 goto fail; 2300 goto fail;
2209 } 2301 }
2210 2302
2211send:
2212 reply->pagelist = pagelist; 2303 reply->pagelist = pagelist;
2213 reply->hdr.data_len = cpu_to_le32(pagelist->length); 2304 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2214 reply->nr_pages = calc_pages_for(0, pagelist->length); 2305 reply->nr_pages = calc_pages_for(0, pagelist->length);
2215 ceph_con_send(&session->s_con, reply); 2306 ceph_con_send(&session->s_con, reply);
2216 2307
2217 session->s_state = CEPH_MDS_SESSION_OPEN;
2218 mutex_unlock(&session->s_mutex); 2308 mutex_unlock(&session->s_mutex);
2219 2309
2220 mutex_lock(&mdsc->mutex); 2310 mutex_lock(&mdsc->mutex);
2221 __wake_requests(mdsc, &session->s_waiting); 2311 __wake_requests(mdsc, &session->s_waiting);
2222 mutex_unlock(&mdsc->mutex); 2312 mutex_unlock(&mdsc->mutex);
2223 2313
2224 ceph_put_mds_session(session);
2225
2226 up_read(&mdsc->snap_rwsem); 2314 up_read(&mdsc->snap_rwsem);
2227 mutex_lock(&mdsc->mutex);
2228 return; 2315 return;
2229 2316
2230fail: 2317fail:
2231 ceph_msg_put(reply); 2318 ceph_msg_put(reply);
2232 up_read(&mdsc->snap_rwsem); 2319 up_read(&mdsc->snap_rwsem);
2233 mutex_unlock(&session->s_mutex); 2320 mutex_unlock(&session->s_mutex);
2234 ceph_put_mds_session(session);
2235fail_nomsg: 2321fail_nomsg:
2236 ceph_pagelist_release(pagelist); 2322 ceph_pagelist_release(pagelist);
2237 kfree(pagelist); 2323 kfree(pagelist);
2238fail_nopagelist: 2324fail_nopagelist:
2239 pr_err("error %d preparing reconnect for mds%d\n", err, mds); 2325 pr_err("error %d preparing reconnect for mds%d\n", err, mds);
2240 mutex_lock(&mdsc->mutex);
2241 return; 2326 return;
2242} 2327}
2243 2328
@@ -2290,7 +2375,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2290 } 2375 }
2291 2376
2292 /* kick any requests waiting on the recovering mds */ 2377 /* kick any requests waiting on the recovering mds */
2293 kick_requests(mdsc, i, 1); 2378 kick_requests(mdsc, i);
2294 } else if (oldstate == newstate) { 2379 } else if (oldstate == newstate) {
2295 continue; /* nothing new with this mds */ 2380 continue; /* nothing new with this mds */
2296 } 2381 }
@@ -2299,22 +2384,21 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2299 * send reconnect? 2384 * send reconnect?
2300 */ 2385 */
2301 if (s->s_state == CEPH_MDS_SESSION_RESTARTING && 2386 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2302 newstate >= CEPH_MDS_STATE_RECONNECT) 2387 newstate >= CEPH_MDS_STATE_RECONNECT) {
2303 send_mds_reconnect(mdsc, i); 2388 mutex_unlock(&mdsc->mutex);
2389 send_mds_reconnect(mdsc, s);
2390 mutex_lock(&mdsc->mutex);
2391 }
2304 2392
2305 /* 2393 /*
2306 * kick requests on any mds that has gone active. 2394 * kick request on any mds that has gone active.
2307 *
2308 * kick requests on cur or forwarder: we may have sent
2309 * the request to mds1, mds1 told us it forwarded it
2310 * to mds2, but then we learn mds1 failed and can't be
2311 * sure it successfully forwarded our request before
2312 * it died.
2313 */ 2395 */
2314 if (oldstate < CEPH_MDS_STATE_ACTIVE && 2396 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2315 newstate >= CEPH_MDS_STATE_ACTIVE) { 2397 newstate >= CEPH_MDS_STATE_ACTIVE) {
2316 pr_info("mds%d reconnect completed\n", s->s_mds); 2398 if (oldstate != CEPH_MDS_STATE_CREATING &&
2317 kick_requests(mdsc, i, 1); 2399 oldstate != CEPH_MDS_STATE_STARTING)
2400 pr_info("mds%d recovery completed\n", s->s_mds);
2401 kick_requests(mdsc, i);
2318 ceph_kick_flushing_caps(mdsc, s); 2402 ceph_kick_flushing_caps(mdsc, s);
2319 wake_up_session_caps(s, 1); 2403 wake_up_session_caps(s, 1);
2320 } 2404 }
@@ -2457,12 +2541,12 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2457 dnamelen = dentry->d_name.len; 2541 dnamelen = dentry->d_name.len;
2458 len += dnamelen; 2542 len += dnamelen;
2459 2543
2460 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL); 2544 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS);
2461 if (IS_ERR(msg)) 2545 if (!msg)
2462 return; 2546 return;
2463 lease = msg->front.iov_base; 2547 lease = msg->front.iov_base;
2464 lease->action = action; 2548 lease->action = action;
2465 lease->mask = cpu_to_le16(CEPH_LOCK_DN); 2549 lease->mask = cpu_to_le16(1);
2466 lease->ino = cpu_to_le64(ceph_vino(inode).ino); 2550 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2467 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); 2551 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2468 lease->seq = cpu_to_le32(seq); 2552 lease->seq = cpu_to_le32(seq);
@@ -2492,7 +2576,7 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2492 2576
2493 BUG_ON(inode == NULL); 2577 BUG_ON(inode == NULL);
2494 BUG_ON(dentry == NULL); 2578 BUG_ON(dentry == NULL);
2495 BUG_ON(mask != CEPH_LOCK_DN); 2579 BUG_ON(mask == 0);
2496 2580
2497 /* is dentry lease valid? */ 2581 /* is dentry lease valid? */
2498 spin_lock(&dentry->d_lock); 2582 spin_lock(&dentry->d_lock);
@@ -2603,7 +2687,9 @@ static void delayed_work(struct work_struct *work)
2603 else 2687 else
2604 ceph_con_keepalive(&s->s_con); 2688 ceph_con_keepalive(&s->s_con);
2605 add_cap_releases(mdsc, s, -1); 2689 add_cap_releases(mdsc, s, -1);
2606 send_cap_releases(mdsc, s); 2690 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2691 s->s_state == CEPH_MDS_SESSION_HUNG)
2692 send_cap_releases(mdsc, s);
2607 mutex_unlock(&s->s_mutex); 2693 mutex_unlock(&s->s_mutex);
2608 ceph_put_mds_session(s); 2694 ceph_put_mds_session(s);
2609 2695
@@ -2620,6 +2706,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2620 mdsc->client = client; 2706 mdsc->client = client;
2621 mutex_init(&mdsc->mutex); 2707 mutex_init(&mdsc->mutex);
2622 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 2708 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2709 if (mdsc->mdsmap == NULL)
2710 return -ENOMEM;
2711
2623 init_completion(&mdsc->safe_umount_waiters); 2712 init_completion(&mdsc->safe_umount_waiters);
2624 init_completion(&mdsc->session_close_waiters); 2713 init_completion(&mdsc->session_close_waiters);
2625 INIT_LIST_HEAD(&mdsc->waiting_for_map); 2714 INIT_LIST_HEAD(&mdsc->waiting_for_map);
@@ -2645,6 +2734,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2645 init_waitqueue_head(&mdsc->cap_flushing_wq); 2734 init_waitqueue_head(&mdsc->cap_flushing_wq);
2646 spin_lock_init(&mdsc->dentry_lru_lock); 2735 spin_lock_init(&mdsc->dentry_lru_lock);
2647 INIT_LIST_HEAD(&mdsc->dentry_lru); 2736 INIT_LIST_HEAD(&mdsc->dentry_lru);
2737
2648 return 0; 2738 return 0;
2649} 2739}
2650 2740
@@ -2740,6 +2830,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2740{ 2830{
2741 u64 want_tid, want_flush; 2831 u64 want_tid, want_flush;
2742 2832
2833 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
2834 return;
2835
2743 dout("sync\n"); 2836 dout("sync\n");
2744 mutex_lock(&mdsc->mutex); 2837 mutex_lock(&mdsc->mutex);
2745 want_tid = mdsc->last_tid; 2838 want_tid = mdsc->last_tid;
@@ -2922,9 +3015,10 @@ static void con_put(struct ceph_connection *con)
2922static void peer_reset(struct ceph_connection *con) 3015static void peer_reset(struct ceph_connection *con)
2923{ 3016{
2924 struct ceph_mds_session *s = con->private; 3017 struct ceph_mds_session *s = con->private;
3018 struct ceph_mds_client *mdsc = s->s_mdsc;
2925 3019
2926 pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n", 3020 pr_warning("mds%d closed our session\n", s->s_mds);
2927 s->s_mds); 3021 send_mds_reconnect(mdsc, s);
2928} 3022}
2929 3023
2930static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) 3024static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
@@ -3031,7 +3125,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
3031 return ceph_monc_validate_auth(&mdsc->client->monc); 3125 return ceph_monc_validate_auth(&mdsc->client->monc);
3032} 3126}
3033 3127
3034const static struct ceph_connection_operations mds_con_ops = { 3128static const struct ceph_connection_operations mds_con_ops = {
3035 .get = con_get, 3129 .get = con_get,
3036 .put = con_put, 3130 .put = con_put,
3037 .dispatch = dispatch, 3131 .dispatch = dispatch,
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 961cc6f65878..d9936c4f1212 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -165,6 +165,8 @@ struct ceph_mds_request {
165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ 165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
166 struct inode *r_target_inode; /* resulting inode */ 166 struct inode *r_target_inode; /* resulting inode */
167 167
168 struct mutex r_fill_mutex;
169
168 union ceph_mds_request_args r_args; 170 union ceph_mds_request_args r_args;
169 int r_fmode; /* file mode, if expecting cap */ 171 int r_fmode; /* file mode, if expecting cap */
170 172
@@ -213,7 +215,7 @@ struct ceph_mds_request {
213 struct completion r_safe_completion; 215 struct completion r_safe_completion;
214 ceph_mds_request_callback_t r_callback; 216 ceph_mds_request_callback_t r_callback;
215 struct list_head r_unsafe_item; /* per-session unsafe list item */ 217 struct list_head r_unsafe_item; /* per-session unsafe list item */
216 bool r_got_unsafe, r_got_safe; 218 bool r_got_unsafe, r_got_safe, r_got_result;
217 219
218 bool r_did_prepopulate; 220 bool r_did_prepopulate;
219 u32 r_readdir_offset; 221 u32 r_readdir_offset;
@@ -301,6 +303,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode, 303 struct inode *inode,
302 struct dentry *dn, int mask); 304 struct dentry *dn, int mask);
303 305
306extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
307
304extern struct ceph_mds_request * 308extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); 309ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, 310extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index cd4fadb6491a..64b8b1f7863d 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -39,18 +39,6 @@ static void queue_con(struct ceph_connection *con);
39static void con_work(struct work_struct *); 39static void con_work(struct work_struct *);
40static void ceph_fault(struct ceph_connection *con); 40static void ceph_fault(struct ceph_connection *con);
41 41
42const char *ceph_name_type_str(int t)
43{
44 switch (t) {
45 case CEPH_ENTITY_TYPE_MON: return "mon";
46 case CEPH_ENTITY_TYPE_MDS: return "mds";
47 case CEPH_ENTITY_TYPE_OSD: return "osd";
48 case CEPH_ENTITY_TYPE_CLIENT: return "client";
49 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
50 default: return "???";
51 }
52}
53
54/* 42/*
55 * nicely render a sockaddr as a string. 43 * nicely render a sockaddr as a string.
56 */ 44 */
@@ -132,6 +120,12 @@ void ceph_msgr_exit(void)
132 destroy_workqueue(ceph_msgr_wq); 120 destroy_workqueue(ceph_msgr_wq);
133} 121}
134 122
123void ceph_msgr_flush()
124{
125 flush_workqueue(ceph_msgr_wq);
126}
127
128
135/* 129/*
136 * socket callback functions 130 * socket callback functions
137 */ 131 */
@@ -340,6 +334,7 @@ static void reset_connection(struct ceph_connection *con)
340 ceph_msg_put(con->out_msg); 334 ceph_msg_put(con->out_msg);
341 con->out_msg = NULL; 335 con->out_msg = NULL;
342 } 336 }
337 con->out_keepalive_pending = false;
343 con->in_seq = 0; 338 con->in_seq = 0;
344 con->in_seq_acked = 0; 339 con->in_seq_acked = 0;
345} 340}
@@ -357,6 +352,7 @@ void ceph_con_close(struct ceph_connection *con)
357 clear_bit(WRITE_PENDING, &con->state); 352 clear_bit(WRITE_PENDING, &con->state);
358 mutex_lock(&con->mutex); 353 mutex_lock(&con->mutex);
359 reset_connection(con); 354 reset_connection(con);
355 con->peer_global_seq = 0;
360 cancel_delayed_work(&con->work); 356 cancel_delayed_work(&con->work);
361 mutex_unlock(&con->mutex); 357 mutex_unlock(&con->mutex);
362 queue_con(con); 358 queue_con(con);
@@ -661,7 +657,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
661 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, 657 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
662 con->connect_seq, global_seq, proto); 658 con->connect_seq, global_seq, proto);
663 659
664 con->out_connect.features = CEPH_FEATURE_SUPPORTED; 660 con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT;
665 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); 661 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
666 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); 662 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
667 con->out_connect.global_seq = cpu_to_le32(global_seq); 663 con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1124,8 +1120,8 @@ static void fail_protocol(struct ceph_connection *con)
1124 1120
1125static int process_connect(struct ceph_connection *con) 1121static int process_connect(struct ceph_connection *con)
1126{ 1122{
1127 u64 sup_feat = CEPH_FEATURE_SUPPORTED; 1123 u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT;
1128 u64 req_feat = CEPH_FEATURE_REQUIRED; 1124 u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT;
1129 u64 server_feat = le64_to_cpu(con->in_reply.features); 1125 u64 server_feat = le64_to_cpu(con->in_reply.features);
1130 1126
1131 dout("process_connect on %p tag %d\n", con, (int)con->in_tag); 1127 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -1233,6 +1229,7 @@ static int process_connect(struct ceph_connection *con)
1233 clear_bit(CONNECTING, &con->state); 1229 clear_bit(CONNECTING, &con->state);
1234 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); 1230 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1235 con->connect_seq++; 1231 con->connect_seq++;
1232 con->peer_features = server_feat;
1236 dout("process_connect got READY gseq %d cseq %d (%d)\n", 1233 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1237 con->peer_global_seq, 1234 con->peer_global_seq,
1238 le32_to_cpu(con->in_reply.connect_seq), 1235 le32_to_cpu(con->in_reply.connect_seq),
@@ -1402,19 +1399,17 @@ static int read_partial_message(struct ceph_connection *con)
1402 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); 1399 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1403 if (skip) { 1400 if (skip) {
1404 /* skip this message */ 1401 /* skip this message */
1405 dout("alloc_msg returned NULL, skipping message\n"); 1402 dout("alloc_msg said skip message\n");
1406 con->in_base_pos = -front_len - middle_len - data_len - 1403 con->in_base_pos = -front_len - middle_len - data_len -
1407 sizeof(m->footer); 1404 sizeof(m->footer);
1408 con->in_tag = CEPH_MSGR_TAG_READY; 1405 con->in_tag = CEPH_MSGR_TAG_READY;
1409 con->in_seq++; 1406 con->in_seq++;
1410 return 0; 1407 return 0;
1411 } 1408 }
1412 if (IS_ERR(con->in_msg)) { 1409 if (!con->in_msg) {
1413 ret = PTR_ERR(con->in_msg);
1414 con->in_msg = NULL;
1415 con->error_msg = 1410 con->error_msg =
1416 "error allocating memory for incoming message"; 1411 "error allocating memory for incoming message";
1417 return ret; 1412 return -ENOMEM;
1418 } 1413 }
1419 m = con->in_msg; 1414 m = con->in_msg;
1420 m->front.iov_len = 0; /* haven't read it yet */ 1415 m->front.iov_len = 0; /* haven't read it yet */
@@ -1514,14 +1509,14 @@ static void process_message(struct ceph_connection *con)
1514 1509
1515 /* if first message, set peer_name */ 1510 /* if first message, set peer_name */
1516 if (con->peer_name.type == 0) 1511 if (con->peer_name.type == 0)
1517 con->peer_name = msg->hdr.src.name; 1512 con->peer_name = msg->hdr.src;
1518 1513
1519 con->in_seq++; 1514 con->in_seq++;
1520 mutex_unlock(&con->mutex); 1515 mutex_unlock(&con->mutex);
1521 1516
1522 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", 1517 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1523 msg, le64_to_cpu(msg->hdr.seq), 1518 msg, le64_to_cpu(msg->hdr.seq),
1524 ENTITY_NAME(msg->hdr.src.name), 1519 ENTITY_NAME(msg->hdr.src),
1525 le16_to_cpu(msg->hdr.type), 1520 le16_to_cpu(msg->hdr.type),
1526 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), 1521 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1527 le32_to_cpu(msg->hdr.front_len), 1522 le32_to_cpu(msg->hdr.front_len),
@@ -1546,7 +1541,6 @@ static int try_write(struct ceph_connection *con)
1546 dout("try_write start %p state %lu nref %d\n", con, con->state, 1541 dout("try_write start %p state %lu nref %d\n", con, con->state,
1547 atomic_read(&con->nref)); 1542 atomic_read(&con->nref));
1548 1543
1549 mutex_lock(&con->mutex);
1550more: 1544more:
1551 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); 1545 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1552 1546
@@ -1639,7 +1633,6 @@ do_next:
1639done: 1633done:
1640 ret = 0; 1634 ret = 0;
1641out: 1635out:
1642 mutex_unlock(&con->mutex);
1643 dout("try_write done on %p\n", con); 1636 dout("try_write done on %p\n", con);
1644 return ret; 1637 return ret;
1645} 1638}
@@ -1651,7 +1644,6 @@ out:
1651 */ 1644 */
1652static int try_read(struct ceph_connection *con) 1645static int try_read(struct ceph_connection *con)
1653{ 1646{
1654 struct ceph_messenger *msgr;
1655 int ret = -1; 1647 int ret = -1;
1656 1648
1657 if (!con->sock) 1649 if (!con->sock)
@@ -1661,9 +1653,6 @@ static int try_read(struct ceph_connection *con)
1661 return 0; 1653 return 0;
1662 1654
1663 dout("try_read start on %p\n", con); 1655 dout("try_read start on %p\n", con);
1664 msgr = con->msgr;
1665
1666 mutex_lock(&con->mutex);
1667 1656
1668more: 1657more:
1669 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, 1658 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
@@ -1758,7 +1747,6 @@ more:
1758done: 1747done:
1759 ret = 0; 1748 ret = 0;
1760out: 1749out:
1761 mutex_unlock(&con->mutex);
1762 dout("try_read done on %p\n", con); 1750 dout("try_read done on %p\n", con);
1763 return ret; 1751 return ret;
1764 1752
@@ -1830,6 +1818,8 @@ more:
1830 dout("con_work %p start, clearing QUEUED\n", con); 1818 dout("con_work %p start, clearing QUEUED\n", con);
1831 clear_bit(QUEUED, &con->state); 1819 clear_bit(QUEUED, &con->state);
1832 1820
1821 mutex_lock(&con->mutex);
1822
1833 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ 1823 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1834 dout("con_work CLOSED\n"); 1824 dout("con_work CLOSED\n");
1835 con_close_socket(con); 1825 con_close_socket(con);
@@ -1844,11 +1834,16 @@ more:
1844 if (test_and_clear_bit(SOCK_CLOSED, &con->state) || 1834 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1845 try_read(con) < 0 || 1835 try_read(con) < 0 ||
1846 try_write(con) < 0) { 1836 try_write(con) < 0) {
1837 mutex_unlock(&con->mutex);
1847 backoff = 1; 1838 backoff = 1;
1848 ceph_fault(con); /* error/fault path */ 1839 ceph_fault(con); /* error/fault path */
1840 goto done_unlocked;
1849 } 1841 }
1850 1842
1851done: 1843done:
1844 mutex_unlock(&con->mutex);
1845
1846done_unlocked:
1852 clear_bit(BUSY, &con->state); 1847 clear_bit(BUSY, &con->state);
1853 dout("con->state=%lu\n", con->state); 1848 dout("con->state=%lu\n", con->state);
1854 if (test_bit(QUEUED, &con->state)) { 1849 if (test_bit(QUEUED, &con->state)) {
@@ -1947,7 +1942,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1947 1942
1948 /* the zero page is needed if a request is "canceled" while the message 1943 /* the zero page is needed if a request is "canceled" while the message
1949 * is being written over the socket */ 1944 * is being written over the socket */
1950 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO); 1945 msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
1951 if (!msgr->zero_page) { 1946 if (!msgr->zero_page) {
1952 kfree(msgr); 1947 kfree(msgr);
1953 return ERR_PTR(-ENOMEM); 1948 return ERR_PTR(-ENOMEM);
@@ -1987,9 +1982,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1987 } 1982 }
1988 1983
1989 /* set src+dst */ 1984 /* set src+dst */
1990 msg->hdr.src.name = con->msgr->inst.name; 1985 msg->hdr.src = con->msgr->inst.name;
1991 msg->hdr.src.addr = con->msgr->my_enc_addr;
1992 msg->hdr.orig_src = msg->hdr.src;
1993 1986
1994 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); 1987 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1995 1988
@@ -2083,12 +2076,11 @@ void ceph_con_keepalive(struct ceph_connection *con)
2083 * construct a new message with given type, size 2076 * construct a new message with given type, size
2084 * the new msg has a ref count of 1. 2077 * the new msg has a ref count of 1.
2085 */ 2078 */
2086struct ceph_msg *ceph_msg_new(int type, int front_len, 2079struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2087 int page_len, int page_off, struct page **pages)
2088{ 2080{
2089 struct ceph_msg *m; 2081 struct ceph_msg *m;
2090 2082
2091 m = kmalloc(sizeof(*m), GFP_NOFS); 2083 m = kmalloc(sizeof(*m), flags);
2092 if (m == NULL) 2084 if (m == NULL)
2093 goto out; 2085 goto out;
2094 kref_init(&m->kref); 2086 kref_init(&m->kref);
@@ -2100,8 +2092,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2100 m->hdr.version = 0; 2092 m->hdr.version = 0;
2101 m->hdr.front_len = cpu_to_le32(front_len); 2093 m->hdr.front_len = cpu_to_le32(front_len);
2102 m->hdr.middle_len = 0; 2094 m->hdr.middle_len = 0;
2103 m->hdr.data_len = cpu_to_le32(page_len); 2095 m->hdr.data_len = 0;
2104 m->hdr.data_off = cpu_to_le16(page_off); 2096 m->hdr.data_off = 0;
2105 m->hdr.reserved = 0; 2097 m->hdr.reserved = 0;
2106 m->footer.front_crc = 0; 2098 m->footer.front_crc = 0;
2107 m->footer.middle_crc = 0; 2099 m->footer.middle_crc = 0;
@@ -2115,11 +2107,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2115 /* front */ 2107 /* front */
2116 if (front_len) { 2108 if (front_len) {
2117 if (front_len > PAGE_CACHE_SIZE) { 2109 if (front_len > PAGE_CACHE_SIZE) {
2118 m->front.iov_base = __vmalloc(front_len, GFP_NOFS, 2110 m->front.iov_base = __vmalloc(front_len, flags,
2119 PAGE_KERNEL); 2111 PAGE_KERNEL);
2120 m->front_is_vmalloc = true; 2112 m->front_is_vmalloc = true;
2121 } else { 2113 } else {
2122 m->front.iov_base = kmalloc(front_len, GFP_NOFS); 2114 m->front.iov_base = kmalloc(front_len, flags);
2123 } 2115 }
2124 if (m->front.iov_base == NULL) { 2116 if (m->front.iov_base == NULL) {
2125 pr_err("msg_new can't allocate %d bytes\n", 2117 pr_err("msg_new can't allocate %d bytes\n",
@@ -2135,19 +2127,18 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2135 m->middle = NULL; 2127 m->middle = NULL;
2136 2128
2137 /* data */ 2129 /* data */
2138 m->nr_pages = calc_pages_for(page_off, page_len); 2130 m->nr_pages = 0;
2139 m->pages = pages; 2131 m->pages = NULL;
2140 m->pagelist = NULL; 2132 m->pagelist = NULL;
2141 2133
2142 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len, 2134 dout("ceph_msg_new %p front %d\n", m, front_len);
2143 m->nr_pages);
2144 return m; 2135 return m;
2145 2136
2146out2: 2137out2:
2147 ceph_msg_put(m); 2138 ceph_msg_put(m);
2148out: 2139out:
2149 pr_err("msg_new can't create type %d len %d\n", type, front_len); 2140 pr_err("msg_new can't create type %d front %d\n", type, front_len);
2150 return ERR_PTR(-ENOMEM); 2141 return NULL;
2151} 2142}
2152 2143
2153/* 2144/*
@@ -2190,29 +2181,25 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2190 mutex_unlock(&con->mutex); 2181 mutex_unlock(&con->mutex);
2191 msg = con->ops->alloc_msg(con, hdr, skip); 2182 msg = con->ops->alloc_msg(con, hdr, skip);
2192 mutex_lock(&con->mutex); 2183 mutex_lock(&con->mutex);
2193 if (IS_ERR(msg)) 2184 if (!msg || *skip)
2194 return msg;
2195
2196 if (*skip)
2197 return NULL; 2185 return NULL;
2198 } 2186 }
2199 if (!msg) { 2187 if (!msg) {
2200 *skip = 0; 2188 *skip = 0;
2201 msg = ceph_msg_new(type, front_len, 0, 0, NULL); 2189 msg = ceph_msg_new(type, front_len, GFP_NOFS);
2202 if (!msg) { 2190 if (!msg) {
2203 pr_err("unable to allocate msg type %d len %d\n", 2191 pr_err("unable to allocate msg type %d len %d\n",
2204 type, front_len); 2192 type, front_len);
2205 return ERR_PTR(-ENOMEM); 2193 return NULL;
2206 } 2194 }
2207 } 2195 }
2208 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); 2196 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2209 2197
2210 if (middle_len) { 2198 if (middle_len && !msg->middle) {
2211 ret = ceph_alloc_middle(con, msg); 2199 ret = ceph_alloc_middle(con, msg);
2212
2213 if (ret < 0) { 2200 if (ret < 0) {
2214 ceph_msg_put(msg); 2201 ceph_msg_put(msg);
2215 return msg; 2202 return NULL;
2216 } 2203 }
2217 } 2204 }
2218 2205
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index a5caf91cc971..76fbc957bc13 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -49,10 +49,8 @@ struct ceph_connection_operations {
49 int *skip); 49 int *skip);
50}; 50};
51 51
52extern const char *ceph_name_type_str(int t);
53
54/* use format string %s%d */ 52/* use format string %s%d */
55#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num) 53#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
56 54
57struct ceph_messenger { 55struct ceph_messenger {
58 struct ceph_entity_inst inst; /* my name+address */ 56 struct ceph_entity_inst inst; /* my name+address */
@@ -144,6 +142,7 @@ struct ceph_connection {
144 struct ceph_entity_addr peer_addr; /* peer address */ 142 struct ceph_entity_addr peer_addr; /* peer address */
145 struct ceph_entity_name peer_name; /* peer name */ 143 struct ceph_entity_name peer_name; /* peer name */
146 struct ceph_entity_addr peer_addr_for_me; 144 struct ceph_entity_addr peer_addr_for_me;
145 unsigned peer_features;
147 u32 connect_seq; /* identify the most recent connection 146 u32 connect_seq; /* identify the most recent connection
148 attempt for this connection, client */ 147 attempt for this connection, client */
149 u32 peer_global_seq; /* peer's global seq for this connection */ 148 u32 peer_global_seq; /* peer's global seq for this connection */
@@ -158,7 +157,6 @@ struct ceph_connection {
158 struct list_head out_queue; 157 struct list_head out_queue;
159 struct list_head out_sent; /* sending or sent but unacked */ 158 struct list_head out_sent; /* sending or sent but unacked */
160 u64 out_seq; /* last message queued for send */ 159 u64 out_seq; /* last message queued for send */
161 u64 out_seq_sent; /* last message sent */
162 bool out_keepalive_pending; 160 bool out_keepalive_pending;
163 161
164 u64 in_seq, in_seq_acked; /* last message received, acked */ 162 u64 in_seq, in_seq_acked; /* last message received, acked */
@@ -215,6 +213,7 @@ extern int ceph_parse_ips(const char *c, const char *end,
215 213
216extern int ceph_msgr_init(void); 214extern int ceph_msgr_init(void);
217extern void ceph_msgr_exit(void); 215extern void ceph_msgr_exit(void);
216extern void ceph_msgr_flush(void);
218 217
219extern struct ceph_messenger *ceph_messenger_create( 218extern struct ceph_messenger *ceph_messenger_create(
220 struct ceph_entity_addr *myaddr); 219 struct ceph_entity_addr *myaddr);
@@ -234,9 +233,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con);
234extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); 233extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
235extern void ceph_con_put(struct ceph_connection *con); 234extern void ceph_con_put(struct ceph_connection *con);
236 235
237extern struct ceph_msg *ceph_msg_new(int type, int front_len, 236extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
238 int page_len, int page_off,
239 struct page **pages);
240extern void ceph_msg_kfree(struct ceph_msg *m); 237extern void ceph_msg_kfree(struct ceph_msg *m);
241 238
242 239
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 8fdc011ca956..21c62e9b7d1d 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -28,7 +28,7 @@
28 * resend any outstanding requests. 28 * resend any outstanding requests.
29 */ 29 */
30 30
31const static struct ceph_connection_operations mon_con_ops; 31static const struct ceph_connection_operations mon_con_ops;
32 32
33static int __validate_auth(struct ceph_mon_client *monc); 33static int __validate_auth(struct ceph_mon_client *monc);
34 34
@@ -104,6 +104,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
104 monc->pending_auth = 1; 104 monc->pending_auth = 1;
105 monc->m_auth->front.iov_len = len; 105 monc->m_auth->front.iov_len = len;
106 monc->m_auth->hdr.front_len = cpu_to_le32(len); 106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_con_revoke(monc->con, monc->m_auth);
107 ceph_msg_get(monc->m_auth); /* keep our ref */ 108 ceph_msg_get(monc->m_auth); /* keep our ref */
108 ceph_con_send(monc->con, monc->m_auth); 109 ceph_con_send(monc->con, monc->m_auth);
109} 110}
@@ -187,16 +188,12 @@ static void __send_subscribe(struct ceph_mon_client *monc)
187 monc->want_next_osdmap); 188 monc->want_next_osdmap);
188 if ((__sub_expired(monc) && !monc->sub_sent) || 189 if ((__sub_expired(monc) && !monc->sub_sent) ||
189 monc->want_next_osdmap == 1) { 190 monc->want_next_osdmap == 1) {
190 struct ceph_msg *msg; 191 struct ceph_msg *msg = monc->m_subscribe;
191 struct ceph_mon_subscribe_item *i; 192 struct ceph_mon_subscribe_item *i;
192 void *p, *end; 193 void *p, *end;
193 194
194 msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
195 if (!msg)
196 return;
197
198 p = msg->front.iov_base; 195 p = msg->front.iov_base;
199 end = p + msg->front.iov_len; 196 end = p + msg->front_max;
200 197
201 dout("__send_subscribe to 'mdsmap' %u+\n", 198 dout("__send_subscribe to 'mdsmap' %u+\n",
202 (unsigned)monc->have_mdsmap); 199 (unsigned)monc->have_mdsmap);
@@ -226,7 +223,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
226 223
227 msg->front.iov_len = p - msg->front.iov_base; 224 msg->front.iov_len = p - msg->front.iov_base;
228 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 225 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
229 ceph_con_send(monc->con, msg); 226 ceph_con_revoke(monc->con, msg);
227 ceph_con_send(monc->con, ceph_msg_get(msg));
230 228
231 monc->sub_sent = jiffies | 1; /* never 0 */ 229 monc->sub_sent = jiffies | 1; /* never 0 */
232 } 230 }
@@ -353,14 +351,14 @@ out:
353/* 351/*
354 * statfs 352 * statfs
355 */ 353 */
356static struct ceph_mon_statfs_request *__lookup_statfs( 354static struct ceph_mon_generic_request *__lookup_generic_req(
357 struct ceph_mon_client *monc, u64 tid) 355 struct ceph_mon_client *monc, u64 tid)
358{ 356{
359 struct ceph_mon_statfs_request *req; 357 struct ceph_mon_generic_request *req;
360 struct rb_node *n = monc->statfs_request_tree.rb_node; 358 struct rb_node *n = monc->generic_request_tree.rb_node;
361 359
362 while (n) { 360 while (n) {
363 req = rb_entry(n, struct ceph_mon_statfs_request, node); 361 req = rb_entry(n, struct ceph_mon_generic_request, node);
364 if (tid < req->tid) 362 if (tid < req->tid)
365 n = n->rb_left; 363 n = n->rb_left;
366 else if (tid > req->tid) 364 else if (tid > req->tid)
@@ -371,16 +369,16 @@ static struct ceph_mon_statfs_request *__lookup_statfs(
371 return NULL; 369 return NULL;
372} 370}
373 371
374static void __insert_statfs(struct ceph_mon_client *monc, 372static void __insert_generic_request(struct ceph_mon_client *monc,
375 struct ceph_mon_statfs_request *new) 373 struct ceph_mon_generic_request *new)
376{ 374{
377 struct rb_node **p = &monc->statfs_request_tree.rb_node; 375 struct rb_node **p = &monc->generic_request_tree.rb_node;
378 struct rb_node *parent = NULL; 376 struct rb_node *parent = NULL;
379 struct ceph_mon_statfs_request *req = NULL; 377 struct ceph_mon_generic_request *req = NULL;
380 378
381 while (*p) { 379 while (*p) {
382 parent = *p; 380 parent = *p;
383 req = rb_entry(parent, struct ceph_mon_statfs_request, node); 381 req = rb_entry(parent, struct ceph_mon_generic_request, node);
384 if (new->tid < req->tid) 382 if (new->tid < req->tid)
385 p = &(*p)->rb_left; 383 p = &(*p)->rb_left;
386 else if (new->tid > req->tid) 384 else if (new->tid > req->tid)
@@ -390,113 +388,157 @@ static void __insert_statfs(struct ceph_mon_client *monc,
390 } 388 }
391 389
392 rb_link_node(&new->node, parent, p); 390 rb_link_node(&new->node, parent, p);
393 rb_insert_color(&new->node, &monc->statfs_request_tree); 391 rb_insert_color(&new->node, &monc->generic_request_tree);
392}
393
394static void release_generic_request(struct kref *kref)
395{
396 struct ceph_mon_generic_request *req =
397 container_of(kref, struct ceph_mon_generic_request, kref);
398
399 if (req->reply)
400 ceph_msg_put(req->reply);
401 if (req->request)
402 ceph_msg_put(req->request);
403}
404
405static void put_generic_request(struct ceph_mon_generic_request *req)
406{
407 kref_put(&req->kref, release_generic_request);
408}
409
410static void get_generic_request(struct ceph_mon_generic_request *req)
411{
412 kref_get(&req->kref);
413}
414
415static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
416 struct ceph_msg_header *hdr,
417 int *skip)
418{
419 struct ceph_mon_client *monc = con->private;
420 struct ceph_mon_generic_request *req;
421 u64 tid = le64_to_cpu(hdr->tid);
422 struct ceph_msg *m;
423
424 mutex_lock(&monc->mutex);
425 req = __lookup_generic_req(monc, tid);
426 if (!req) {
427 dout("get_generic_reply %lld dne\n", tid);
428 *skip = 1;
429 m = NULL;
430 } else {
431 dout("get_generic_reply %lld got %p\n", tid, req->reply);
432 m = ceph_msg_get(req->reply);
433 /*
434 * we don't need to track the connection reading into
435 * this reply because we only have one open connection
436 * at a time, ever.
437 */
438 }
439 mutex_unlock(&monc->mutex);
440 return m;
394} 441}
395 442
396static void handle_statfs_reply(struct ceph_mon_client *monc, 443static void handle_statfs_reply(struct ceph_mon_client *monc,
397 struct ceph_msg *msg) 444 struct ceph_msg *msg)
398{ 445{
399 struct ceph_mon_statfs_request *req; 446 struct ceph_mon_generic_request *req;
400 struct ceph_mon_statfs_reply *reply = msg->front.iov_base; 447 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
401 u64 tid; 448 u64 tid = le64_to_cpu(msg->hdr.tid);
402 449
403 if (msg->front.iov_len != sizeof(*reply)) 450 if (msg->front.iov_len != sizeof(*reply))
404 goto bad; 451 goto bad;
405 tid = le64_to_cpu(msg->hdr.tid);
406 dout("handle_statfs_reply %p tid %llu\n", msg, tid); 452 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
407 453
408 mutex_lock(&monc->mutex); 454 mutex_lock(&monc->mutex);
409 req = __lookup_statfs(monc, tid); 455 req = __lookup_generic_req(monc, tid);
410 if (req) { 456 if (req) {
411 *req->buf = reply->st; 457 *(struct ceph_statfs *)req->buf = reply->st;
412 req->result = 0; 458 req->result = 0;
459 get_generic_request(req);
413 } 460 }
414 mutex_unlock(&monc->mutex); 461 mutex_unlock(&monc->mutex);
415 if (req) 462 if (req) {
416 complete(&req->completion); 463 complete(&req->completion);
464 put_generic_request(req);
465 }
417 return; 466 return;
418 467
419bad: 468bad:
420 pr_err("corrupt statfs reply, no tid\n"); 469 pr_err("corrupt generic reply, no tid\n");
421 ceph_msg_dump(msg); 470 ceph_msg_dump(msg);
422} 471}
423 472
424/* 473/*
425 * (re)send a statfs request 474 * Do a synchronous statfs().
426 */ 475 */
427static int send_statfs(struct ceph_mon_client *monc, 476int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
428 struct ceph_mon_statfs_request *req)
429{ 477{
430 struct ceph_msg *msg; 478 struct ceph_mon_generic_request *req;
431 struct ceph_mon_statfs *h; 479 struct ceph_mon_statfs *h;
480 int err;
432 481
433 dout("send_statfs tid %llu\n", req->tid); 482 req = kzalloc(sizeof(*req), GFP_NOFS);
434 msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL); 483 if (!req)
435 if (IS_ERR(msg)) 484 return -ENOMEM;
436 return PTR_ERR(msg); 485
437 req->request = msg; 486 kref_init(&req->kref);
438 msg->hdr.tid = cpu_to_le64(req->tid); 487 req->buf = buf;
439 h = msg->front.iov_base; 488 init_completion(&req->completion);
489
490 err = -ENOMEM;
491 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
492 if (!req->request)
493 goto out;
494 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
495 if (!req->reply)
496 goto out;
497
498 /* fill out request */
499 h = req->request->front.iov_base;
440 h->monhdr.have_version = 0; 500 h->monhdr.have_version = 0;
441 h->monhdr.session_mon = cpu_to_le16(-1); 501 h->monhdr.session_mon = cpu_to_le16(-1);
442 h->monhdr.session_mon_tid = 0; 502 h->monhdr.session_mon_tid = 0;
443 h->fsid = monc->monmap->fsid; 503 h->fsid = monc->monmap->fsid;
444 ceph_con_send(monc->con, msg);
445 return 0;
446}
447
448/*
449 * Do a synchronous statfs().
450 */
451int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
452{
453 struct ceph_mon_statfs_request req;
454 int err;
455
456 req.buf = buf;
457 init_completion(&req.completion);
458
459 /* allocate memory for reply */
460 err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
461 if (err)
462 return err;
463 504
464 /* register request */ 505 /* register request */
465 mutex_lock(&monc->mutex); 506 mutex_lock(&monc->mutex);
466 req.tid = ++monc->last_tid; 507 req->tid = ++monc->last_tid;
467 req.last_attempt = jiffies; 508 req->request->hdr.tid = cpu_to_le64(req->tid);
468 req.delay = BASE_DELAY_INTERVAL; 509 __insert_generic_request(monc, req);
469 __insert_statfs(monc, &req); 510 monc->num_generic_requests++;
470 monc->num_statfs_requests++;
471 mutex_unlock(&monc->mutex); 511 mutex_unlock(&monc->mutex);
472 512
473 /* send request and wait */ 513 /* send request and wait */
474 err = send_statfs(monc, &req); 514 ceph_con_send(monc->con, ceph_msg_get(req->request));
475 if (!err) 515 err = wait_for_completion_interruptible(&req->completion);
476 err = wait_for_completion_interruptible(&req.completion);
477 516
478 mutex_lock(&monc->mutex); 517 mutex_lock(&monc->mutex);
479 rb_erase(&req.node, &monc->statfs_request_tree); 518 rb_erase(&req->node, &monc->generic_request_tree);
480 monc->num_statfs_requests--; 519 monc->num_generic_requests--;
481 ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
482 mutex_unlock(&monc->mutex); 520 mutex_unlock(&monc->mutex);
483 521
484 if (!err) 522 if (!err)
485 err = req.result; 523 err = req->result;
524
525out:
526 kref_put(&req->kref, release_generic_request);
486 return err; 527 return err;
487} 528}
488 529
489/* 530/*
490 * Resend pending statfs requests. 531 * Resend pending statfs requests.
491 */ 532 */
492static void __resend_statfs(struct ceph_mon_client *monc) 533static void __resend_generic_request(struct ceph_mon_client *monc)
493{ 534{
494 struct ceph_mon_statfs_request *req; 535 struct ceph_mon_generic_request *req;
495 struct rb_node *p; 536 struct rb_node *p;
496 537
497 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) { 538 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
498 req = rb_entry(p, struct ceph_mon_statfs_request, node); 539 req = rb_entry(p, struct ceph_mon_generic_request, node);
499 send_statfs(monc, req); 540 ceph_con_revoke(monc->con, req->request);
541 ceph_con_send(monc->con, ceph_msg_get(req->request));
500 } 542 }
501} 543}
502 544
@@ -586,26 +628,26 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
586 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | 628 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
587 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; 629 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
588 630
589 /* msg pools */ 631 /* msgs */
590 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack, 632 err = -ENOMEM;
591 sizeof(struct ceph_mon_subscribe_ack), 1, false); 633 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
592 if (err < 0) 634 sizeof(struct ceph_mon_subscribe_ack),
635 GFP_NOFS);
636 if (!monc->m_subscribe_ack)
593 goto out_monmap; 637 goto out_monmap;
594 err = ceph_msgpool_init(&monc->msgpool_statfs_reply, 638
595 sizeof(struct ceph_mon_statfs_reply), 0, false); 639 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
596 if (err < 0) 640 if (!monc->m_subscribe)
597 goto out_pool1; 641 goto out_subscribe_ack;
598 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false); 642
599 if (err < 0) 643 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
600 goto out_pool2; 644 if (!monc->m_auth_reply)
601 645 goto out_subscribe;
602 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL); 646
647 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
603 monc->pending_auth = 0; 648 monc->pending_auth = 0;
604 if (IS_ERR(monc->m_auth)) { 649 if (!monc->m_auth)
605 err = PTR_ERR(monc->m_auth); 650 goto out_auth_reply;
606 monc->m_auth = NULL;
607 goto out_pool3;
608 }
609 651
610 monc->cur_mon = -1; 652 monc->cur_mon = -1;
611 monc->hunting = true; 653 monc->hunting = true;
@@ -613,8 +655,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
613 monc->sub_sent = 0; 655 monc->sub_sent = 0;
614 656
615 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); 657 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
616 monc->statfs_request_tree = RB_ROOT; 658 monc->generic_request_tree = RB_ROOT;
617 monc->num_statfs_requests = 0; 659 monc->num_generic_requests = 0;
618 monc->last_tid = 0; 660 monc->last_tid = 0;
619 661
620 monc->have_mdsmap = 0; 662 monc->have_mdsmap = 0;
@@ -622,12 +664,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
622 monc->want_next_osdmap = 1; 664 monc->want_next_osdmap = 1;
623 return 0; 665 return 0;
624 666
625out_pool3: 667out_auth_reply:
626 ceph_msgpool_destroy(&monc->msgpool_auth_reply); 668 ceph_msg_put(monc->m_auth_reply);
627out_pool2: 669out_subscribe:
628 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); 670 ceph_msg_put(monc->m_subscribe);
629out_pool1: 671out_subscribe_ack:
630 ceph_msgpool_destroy(&monc->msgpool_statfs_reply); 672 ceph_msg_put(monc->m_subscribe_ack);
631out_monmap: 673out_monmap:
632 kfree(monc->monmap); 674 kfree(monc->monmap);
633out: 675out:
@@ -651,9 +693,9 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
651 ceph_auth_destroy(monc->auth); 693 ceph_auth_destroy(monc->auth);
652 694
653 ceph_msg_put(monc->m_auth); 695 ceph_msg_put(monc->m_auth);
654 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); 696 ceph_msg_put(monc->m_auth_reply);
655 ceph_msgpool_destroy(&monc->msgpool_statfs_reply); 697 ceph_msg_put(monc->m_subscribe);
656 ceph_msgpool_destroy(&monc->msgpool_auth_reply); 698 ceph_msg_put(monc->m_subscribe_ack);
657 699
658 kfree(monc->monmap); 700 kfree(monc->monmap);
659} 701}
@@ -662,8 +704,11 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
662 struct ceph_msg *msg) 704 struct ceph_msg *msg)
663{ 705{
664 int ret; 706 int ret;
707 int was_auth = 0;
665 708
666 mutex_lock(&monc->mutex); 709 mutex_lock(&monc->mutex);
710 if (monc->auth->ops)
711 was_auth = monc->auth->ops->is_authenticated(monc->auth);
667 monc->pending_auth = 0; 712 monc->pending_auth = 0;
668 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, 713 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
669 msg->front.iov_len, 714 msg->front.iov_len,
@@ -674,14 +719,14 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
674 wake_up(&monc->client->auth_wq); 719 wake_up(&monc->client->auth_wq);
675 } else if (ret > 0) { 720 } else if (ret > 0) {
676 __send_prepared_auth_request(monc, ret); 721 __send_prepared_auth_request(monc, ret);
677 } else if (monc->auth->ops->is_authenticated(monc->auth)) { 722 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
678 dout("authenticated, starting session\n"); 723 dout("authenticated, starting session\n");
679 724
680 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; 725 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
681 monc->client->msgr->inst.name.num = monc->auth->global_id; 726 monc->client->msgr->inst.name.num = monc->auth->global_id;
682 727
683 __send_subscribe(monc); 728 __send_subscribe(monc);
684 __resend_statfs(monc); 729 __resend_generic_request(monc);
685 } 730 }
686 mutex_unlock(&monc->mutex); 731 mutex_unlock(&monc->mutex);
687} 732}
@@ -770,18 +815,17 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
770 815
771 switch (type) { 816 switch (type) {
772 case CEPH_MSG_MON_SUBSCRIBE_ACK: 817 case CEPH_MSG_MON_SUBSCRIBE_ACK:
773 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len); 818 m = ceph_msg_get(monc->m_subscribe_ack);
774 break; 819 break;
775 case CEPH_MSG_STATFS_REPLY: 820 case CEPH_MSG_STATFS_REPLY:
776 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len); 821 return get_generic_reply(con, hdr, skip);
777 break;
778 case CEPH_MSG_AUTH_REPLY: 822 case CEPH_MSG_AUTH_REPLY:
779 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len); 823 m = ceph_msg_get(monc->m_auth_reply);
780 break; 824 break;
781 case CEPH_MSG_MON_MAP: 825 case CEPH_MSG_MON_MAP:
782 case CEPH_MSG_MDS_MAP: 826 case CEPH_MSG_MDS_MAP:
783 case CEPH_MSG_OSD_MAP: 827 case CEPH_MSG_OSD_MAP:
784 m = ceph_msg_new(type, front_len, 0, 0, NULL); 828 m = ceph_msg_new(type, front_len, GFP_NOFS);
785 break; 829 break;
786 } 830 }
787 831
@@ -826,7 +870,7 @@ out:
826 mutex_unlock(&monc->mutex); 870 mutex_unlock(&monc->mutex);
827} 871}
828 872
829const static struct ceph_connection_operations mon_con_ops = { 873static const struct ceph_connection_operations mon_con_ops = {
830 .get = ceph_con_get, 874 .get = ceph_con_get,
831 .put = ceph_con_put, 875 .put = ceph_con_put,
832 .dispatch = dispatch, 876 .dispatch = dispatch,
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index b958ad5afa06..174d794321d0 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -2,10 +2,10 @@
2#define _FS_CEPH_MON_CLIENT_H 2#define _FS_CEPH_MON_CLIENT_H
3 3
4#include <linux/completion.h> 4#include <linux/completion.h>
5#include <linux/kref.h>
5#include <linux/rbtree.h> 6#include <linux/rbtree.h>
6 7
7#include "messenger.h" 8#include "messenger.h"
8#include "msgpool.h"
9 9
10struct ceph_client; 10struct ceph_client;
11struct ceph_mount_args; 11struct ceph_mount_args;
@@ -22,7 +22,7 @@ struct ceph_monmap {
22}; 22};
23 23
24struct ceph_mon_client; 24struct ceph_mon_client;
25struct ceph_mon_statfs_request; 25struct ceph_mon_generic_request;
26 26
27 27
28/* 28/*
@@ -40,17 +40,19 @@ struct ceph_mon_request {
40}; 40};
41 41
42/* 42/*
43 * statfs() is done a bit differently because we need to get data back 43 * ceph_mon_generic_request is being used for the statfs and poolop requests
44 * which are being done a bit differently because we need to get data back
44 * to the caller 45 * to the caller
45 */ 46 */
46struct ceph_mon_statfs_request { 47struct ceph_mon_generic_request {
48 struct kref kref;
47 u64 tid; 49 u64 tid;
48 struct rb_node node; 50 struct rb_node node;
49 int result; 51 int result;
50 struct ceph_statfs *buf; 52 void *buf;
51 struct completion completion; 53 struct completion completion;
52 unsigned long last_attempt, delay; /* jiffies */
53 struct ceph_msg *request; /* original request */ 54 struct ceph_msg *request; /* original request */
55 struct ceph_msg *reply; /* and reply */
54}; 56};
55 57
56struct ceph_mon_client { 58struct ceph_mon_client {
@@ -61,7 +63,7 @@ struct ceph_mon_client {
61 struct delayed_work delayed_work; 63 struct delayed_work delayed_work;
62 64
63 struct ceph_auth_client *auth; 65 struct ceph_auth_client *auth;
64 struct ceph_msg *m_auth; 66 struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
65 int pending_auth; 67 int pending_auth;
66 68
67 bool hunting; 69 bool hunting;
@@ -70,14 +72,9 @@ struct ceph_mon_client {
70 struct ceph_connection *con; 72 struct ceph_connection *con;
71 bool have_fsid; 73 bool have_fsid;
72 74
73 /* msg pools */ 75 /* pending generic requests */
74 struct ceph_msgpool msgpool_subscribe_ack; 76 struct rb_root generic_request_tree;
75 struct ceph_msgpool msgpool_statfs_reply; 77 int num_generic_requests;
76 struct ceph_msgpool msgpool_auth_reply;
77
78 /* pending statfs requests */
79 struct rb_root statfs_request_tree;
80 int num_statfs_requests;
81 u64 last_tid; 78 u64 last_tid;
82 79
83 /* mds/osd map */ 80 /* mds/osd map */
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index ca3b44a89f2d..dd65a6438131 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -7,180 +7,58 @@
7 7
8#include "msgpool.h" 8#include "msgpool.h"
9 9
10/* 10static void *alloc_fn(gfp_t gfp_mask, void *arg)
11 * We use msg pools to preallocate memory for messages we expect to 11{
12 * receive over the wire, to avoid getting ourselves into OOM 12 struct ceph_msgpool *pool = arg;
13 * conditions at unexpected times. We use a few different 13 void *p;
14 * strategies:
15 *
16 * - for request/response type interactions, we preallocate the
17 * memory needed for the response when we generate the request.
18 *
19 * - for messages we can receive at any time from the MDS, we preallocate
20 * a pool of messages we can re-use.
21 *
22 * - for writeback, we preallocate some number of messages to use for
23 * requests and their replies, so that we always make forward
24 * progress.
25 *
26 * The msgpool behaves like a mempool_t, but keeps preallocated
27 * ceph_msgs strung together on a list_head instead of using a pointer
28 * vector. This avoids vector reallocation when we adjust the number
29 * of preallocated items (which happens frequently).
30 */
31 14
15 p = ceph_msg_new(0, pool->front_len, gfp_mask);
16 if (!p)
17 pr_err("msgpool %s alloc failed\n", pool->name);
18 return p;
19}
32 20
33/* 21static void free_fn(void *element, void *arg)
34 * Allocate or release as necessary to meet our target pool size.
35 */
36static int __fill_msgpool(struct ceph_msgpool *pool)
37{ 22{
38 struct ceph_msg *msg; 23 ceph_msg_put(element);
39
40 while (pool->num < pool->min) {
41 dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
42 pool->min);
43 spin_unlock(&pool->lock);
44 msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
45 spin_lock(&pool->lock);
46 if (IS_ERR(msg))
47 return PTR_ERR(msg);
48 msg->pool = pool;
49 list_add(&msg->list_head, &pool->msgs);
50 pool->num++;
51 }
52 while (pool->num > pool->min) {
53 msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
54 dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
55 pool->min, msg);
56 list_del_init(&msg->list_head);
57 pool->num--;
58 ceph_msg_kfree(msg);
59 }
60 return 0;
61} 24}
62 25
63int ceph_msgpool_init(struct ceph_msgpool *pool, 26int ceph_msgpool_init(struct ceph_msgpool *pool,
64 int front_len, int min, bool blocking) 27 int front_len, int size, bool blocking, const char *name)
65{ 28{
66 int ret;
67
68 dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
69 spin_lock_init(&pool->lock);
70 pool->front_len = front_len; 29 pool->front_len = front_len;
71 INIT_LIST_HEAD(&pool->msgs); 30 pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
72 pool->num = 0; 31 if (!pool->pool)
73 pool->min = min; 32 return -ENOMEM;
74 pool->blocking = blocking; 33 pool->name = name;
75 init_waitqueue_head(&pool->wait); 34 return 0;
76
77 spin_lock(&pool->lock);
78 ret = __fill_msgpool(pool);
79 spin_unlock(&pool->lock);
80 return ret;
81} 35}
82 36
83void ceph_msgpool_destroy(struct ceph_msgpool *pool) 37void ceph_msgpool_destroy(struct ceph_msgpool *pool)
84{ 38{
85 dout("msgpool_destroy %p\n", pool); 39 mempool_destroy(pool->pool);
86 spin_lock(&pool->lock);
87 pool->min = 0;
88 __fill_msgpool(pool);
89 spin_unlock(&pool->lock);
90} 40}
91 41
92int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta) 42struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
43 int front_len)
93{ 44{
94 int ret; 45 if (front_len > pool->front_len) {
95 46 pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
96 spin_lock(&pool->lock); 47 pool->name, front_len, pool->front_len);
97 dout("msgpool_resv %p delta %d\n", pool, delta);
98 pool->min += delta;
99 ret = __fill_msgpool(pool);
100 spin_unlock(&pool->lock);
101 return ret;
102}
103
104struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
105{
106 wait_queue_t wait;
107 struct ceph_msg *msg;
108
109 if (front_len && front_len > pool->front_len) {
110 pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
111 pool, front_len, pool->front_len);
112 WARN_ON(1); 48 WARN_ON(1);
113 49
114 /* try to alloc a fresh message */ 50 /* try to alloc a fresh message */
115 msg = ceph_msg_new(0, front_len, 0, 0, NULL); 51 return ceph_msg_new(0, front_len, GFP_NOFS);
116 if (!IS_ERR(msg))
117 return msg;
118 }
119
120 if (!front_len)
121 front_len = pool->front_len;
122
123 if (pool->blocking) {
124 /* mempool_t behavior; first try to alloc */
125 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
126 if (!IS_ERR(msg))
127 return msg;
128 } 52 }
129 53
130 while (1) { 54 return mempool_alloc(pool->pool, GFP_NOFS);
131 spin_lock(&pool->lock);
132 if (likely(pool->num)) {
133 msg = list_entry(pool->msgs.next, struct ceph_msg,
134 list_head);
135 list_del_init(&msg->list_head);
136 pool->num--;
137 dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
138 pool->num, pool->min);
139 spin_unlock(&pool->lock);
140 return msg;
141 }
142 pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
143 pool->min, pool->blocking ? "waiting" : "may fail");
144 spin_unlock(&pool->lock);
145
146 if (!pool->blocking) {
147 WARN_ON(1);
148
149 /* maybe we can allocate it now? */
150 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
151 if (!IS_ERR(msg))
152 return msg;
153
154 pr_err("msgpool_get %p empty + alloc failed\n", pool);
155 return ERR_PTR(-ENOMEM);
156 }
157
158 init_wait(&wait);
159 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
160 schedule();
161 finish_wait(&pool->wait, &wait);
162 }
163} 55}
164 56
165void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) 57void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
166{ 58{
167 spin_lock(&pool->lock); 59 /* reset msg front_len; user may have changed it */
168 if (pool->num < pool->min) { 60 msg->front.iov_len = pool->front_len;
169 /* reset msg front_len; user may have changed it */ 61 msg->hdr.front_len = cpu_to_le32(pool->front_len);
170 msg->front.iov_len = pool->front_len;
171 msg->hdr.front_len = cpu_to_le32(pool->front_len);
172 62
173 kref_set(&msg->kref, 1); /* retake a single ref */ 63 kref_init(&msg->kref); /* retake single ref */
174 list_add(&msg->list_head, &pool->msgs);
175 pool->num++;
176 dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
177 pool->num, pool->min);
178 spin_unlock(&pool->lock);
179 wake_up(&pool->wait);
180 } else {
181 dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
182 pool->num, pool->min);
183 spin_unlock(&pool->lock);
184 ceph_msg_kfree(msg);
185 }
186} 64}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
index bc834bfcd720..a362605f9368 100644
--- a/fs/ceph/msgpool.h
+++ b/fs/ceph/msgpool.h
@@ -1,6 +1,7 @@
1#ifndef _FS_CEPH_MSGPOOL 1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL 2#define _FS_CEPH_MSGPOOL
3 3
4#include <linux/mempool.h>
4#include "messenger.h" 5#include "messenger.h"
5 6
6/* 7/*
@@ -8,18 +9,15 @@
8 * avoid unexpected OOM conditions. 9 * avoid unexpected OOM conditions.
9 */ 10 */
10struct ceph_msgpool { 11struct ceph_msgpool {
11 spinlock_t lock; 12 const char *name;
13 mempool_t *pool;
12 int front_len; /* preallocated payload size */ 14 int front_len; /* preallocated payload size */
13 struct list_head msgs; /* msgs in the pool; each has 1 ref */
14 int num, min; /* cur, min # msgs in the pool */
15 bool blocking;
16 wait_queue_head_t wait;
17}; 15};
18 16
19extern int ceph_msgpool_init(struct ceph_msgpool *pool, 17extern int ceph_msgpool_init(struct ceph_msgpool *pool,
20 int front_len, int size, bool blocking); 18 int front_len, int size, bool blocking,
19 const char *name);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); 20extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, 21extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len); 22 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); 23extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 8aaab414f3f8..892a0298dfdf 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -50,7 +50,6 @@ struct ceph_entity_name {
50#define CEPH_ENTITY_TYPE_MDS 0x02 50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04 51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08 52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_ADMIN 0x10
54#define CEPH_ENTITY_TYPE_AUTH 0x20 53#define CEPH_ENTITY_TYPE_AUTH 0x20
55 54
56#define CEPH_ENTITY_TYPE_ANY 0xFF 55#define CEPH_ENTITY_TYPE_ANY 0xFF
@@ -120,7 +119,7 @@ struct ceph_msg_connect_reply {
120/* 119/*
121 * message header 120 * message header
122 */ 121 */
123struct ceph_msg_header { 122struct ceph_msg_header_old {
124 __le64 seq; /* message seq# for this session */ 123 __le64 seq; /* message seq# for this session */
125 __le64 tid; /* transaction id */ 124 __le64 tid; /* transaction id */
126 __le16 type; /* message type */ 125 __le16 type; /* message type */
@@ -138,6 +137,24 @@ struct ceph_msg_header {
138 __le32 crc; /* header crc32c */ 137 __le32 crc; /* header crc32c */
139} __attribute__ ((packed)); 138} __attribute__ ((packed));
140 139
140struct ceph_msg_header {
141 __le64 seq; /* message seq# for this session */
142 __le64 tid; /* transaction id */
143 __le16 type; /* message type */
144 __le16 priority; /* priority. higher value == higher priority */
145 __le16 version; /* version of message encoding */
146
147 __le32 front_len; /* bytes in main payload */
148 __le32 middle_len;/* bytes in middle payload */
149 __le32 data_len; /* bytes of data payload */
150 __le16 data_off; /* sender: include full offset;
151 receiver: mask against ~PAGE_MASK */
152
153 struct ceph_entity_name src;
154 __le32 reserved;
155 __le32 crc; /* header crc32c */
156} __attribute__ ((packed));
157
141#define CEPH_MSG_PRIO_LOW 64 158#define CEPH_MSG_PRIO_LOW 64
142#define CEPH_MSG_PRIO_DEFAULT 127 159#define CEPH_MSG_PRIO_DEFAULT 127
143#define CEPH_MSG_PRIO_HIGH 196 160#define CEPH_MSG_PRIO_HIGH 196
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 3514f71ff85f..d25b4add85b4 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -16,7 +16,7 @@
16#define OSD_OP_FRONT_LEN 4096 16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512 17#define OSD_OPREPLY_FRONT_LEN 512
18 18
19const static struct ceph_connection_operations osd_con_ops; 19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc, 20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd); 21 struct ceph_osd *kickosd);
22 22
@@ -147,7 +147,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
147 req = kzalloc(sizeof(*req), GFP_NOFS); 147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 } 148 }
149 if (req == NULL) 149 if (req == NULL)
150 return ERR_PTR(-ENOMEM); 150 return NULL;
151 151
152 req->r_osdc = osdc; 152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool; 153 req->r_mempool = use_mempool;
@@ -164,10 +164,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); 164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else 165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, 166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL); 167 OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
168 if (IS_ERR(msg)) { 168 if (!msg) {
169 ceph_osdc_put_request(req); 169 ceph_osdc_put_request(req);
170 return ERR_PTR(PTR_ERR(msg)); 170 return NULL;
171 } 171 }
172 req->r_reply = msg; 172 req->r_reply = msg;
173 173
@@ -178,10 +178,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
178 if (use_mempool) 178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else 180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL); 181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
182 if (IS_ERR(msg)) { 182 if (!msg) {
183 ceph_osdc_put_request(req); 183 ceph_osdc_put_request(req);
184 return ERR_PTR(PTR_ERR(msg)); 184 return NULL;
185 } 185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); 186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len); 187 memset(msg->front.iov_base, 0, msg->front.iov_len);
@@ -361,8 +361,13 @@ static void put_osd(struct ceph_osd *osd)
361{ 361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), 362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1); 363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref)) 364 if (atomic_dec_and_test(&osd->o_ref)) {
365 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
366
367 if (osd->o_authorizer)
368 ac->ops->destroy_authorizer(ac, osd->o_authorizer);
365 kfree(osd); 369 kfree(osd);
370 }
366} 371}
367 372
368/* 373/*
@@ -715,7 +720,7 @@ static void handle_timeout(struct work_struct *work)
715 * should mark the osd as failed and we should find out about 720 * should mark the osd as failed and we should find out about
716 * it from an updated osd map. 721 * it from an updated osd map.
717 */ 722 */
718 while (!list_empty(&osdc->req_lru)) { 723 while (timeout && !list_empty(&osdc->req_lru)) {
719 req = list_entry(osdc->req_lru.next, struct ceph_osd_request, 724 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
720 r_req_lru_item); 725 r_req_lru_item);
721 726
@@ -1078,6 +1083,7 @@ done:
1078 if (newmap) 1083 if (newmap)
1079 kick_requests(osdc, NULL); 1084 kick_requests(osdc, NULL);
1080 up_read(&osdc->map_sem); 1085 up_read(&osdc->map_sem);
1086 wake_up(&osdc->client->auth_wq);
1081 return; 1087 return;
1082 1088
1083bad: 1089bad:
@@ -1087,45 +1093,6 @@ bad:
1087 return; 1093 return;
1088} 1094}
1089 1095
1090
1091/*
1092 * A read request prepares specific pages that data is to be read into.
1093 * When a message is being read off the wire, we call prepare_pages to
1094 * find those pages.
1095 * 0 = success, -1 failure.
1096 */
1097static int __prepare_pages(struct ceph_connection *con,
1098 struct ceph_msg_header *hdr,
1099 struct ceph_osd_request *req,
1100 u64 tid,
1101 struct ceph_msg *m)
1102{
1103 struct ceph_osd *osd = con->private;
1104 struct ceph_osd_client *osdc;
1105 int ret = -1;
1106 int data_len = le32_to_cpu(hdr->data_len);
1107 unsigned data_off = le16_to_cpu(hdr->data_off);
1108
1109 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1110
1111 if (!osd)
1112 return -1;
1113
1114 osdc = osd->o_osdc;
1115
1116 dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
1117 tid, req->r_num_pages, want);
1118 if (unlikely(req->r_num_pages < want))
1119 goto out;
1120 m->pages = req->r_pages;
1121 m->nr_pages = req->r_num_pages;
1122 ret = 0; /* success */
1123out:
1124 BUG_ON(ret < 0 || m->nr_pages < want);
1125
1126 return ret;
1127}
1128
1129/* 1096/*
1130 * Register request, send initial attempt. 1097 * Register request, send initial attempt.
1131 */ 1098 */
@@ -1252,11 +1219,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1252 if (!osdc->req_mempool) 1219 if (!osdc->req_mempool)
1253 goto out; 1220 goto out;
1254 1221
1255 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true); 1222 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
1223 "osd_op");
1256 if (err < 0) 1224 if (err < 0)
1257 goto out_mempool; 1225 goto out_mempool;
1258 err = ceph_msgpool_init(&osdc->msgpool_op_reply, 1226 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1259 OSD_OPREPLY_FRONT_LEN, 10, true); 1227 OSD_OPREPLY_FRONT_LEN, 10, true,
1228 "osd_op_reply");
1260 if (err < 0) 1229 if (err < 0)
1261 goto out_msgpool; 1230 goto out_msgpool;
1262 return 0; 1231 return 0;
@@ -1302,8 +1271,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1302 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 1271 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1303 NULL, 0, truncate_seq, truncate_size, NULL, 1272 NULL, 0, truncate_seq, truncate_size, NULL,
1304 false, 1); 1273 false, 1);
1305 if (IS_ERR(req)) 1274 if (!req)
1306 return PTR_ERR(req); 1275 return -ENOMEM;
1307 1276
1308 /* it may be a short read due to an object boundary */ 1277 /* it may be a short read due to an object boundary */
1309 req->r_pages = pages; 1278 req->r_pages = pages;
@@ -1345,8 +1314,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1345 snapc, do_sync, 1314 snapc, do_sync,
1346 truncate_seq, truncate_size, mtime, 1315 truncate_seq, truncate_size, mtime,
1347 nofail, 1); 1316 nofail, 1);
1348 if (IS_ERR(req)) 1317 if (!req)
1349 return PTR_ERR(req); 1318 return -ENOMEM;
1350 1319
1351 /* it may be a short write due to an object boundary */ 1320 /* it may be a short write due to an object boundary */
1352 req->r_pages = pages; 1321 req->r_pages = pages;
@@ -1394,7 +1363,8 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1394} 1363}
1395 1364
1396/* 1365/*
1397 * lookup and return message for incoming reply 1366 * lookup and return message for incoming reply. set up reply message
1367 * pages.
1398 */ 1368 */
1399static struct ceph_msg *get_reply(struct ceph_connection *con, 1369static struct ceph_msg *get_reply(struct ceph_connection *con,
1400 struct ceph_msg_header *hdr, 1370 struct ceph_msg_header *hdr,
@@ -1407,7 +1377,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1407 int front = le32_to_cpu(hdr->front_len); 1377 int front = le32_to_cpu(hdr->front_len);
1408 int data_len = le32_to_cpu(hdr->data_len); 1378 int data_len = le32_to_cpu(hdr->data_len);
1409 u64 tid; 1379 u64 tid;
1410 int err;
1411 1380
1412 tid = le64_to_cpu(hdr->tid); 1381 tid = le64_to_cpu(hdr->tid);
1413 mutex_lock(&osdc->request_mutex); 1382 mutex_lock(&osdc->request_mutex);
@@ -1425,13 +1394,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1425 req->r_reply, req->r_con_filling_msg); 1394 req->r_reply, req->r_con_filling_msg);
1426 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); 1395 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1427 ceph_con_put(req->r_con_filling_msg); 1396 ceph_con_put(req->r_con_filling_msg);
1397 req->r_con_filling_msg = NULL;
1428 } 1398 }
1429 1399
1430 if (front > req->r_reply->front.iov_len) { 1400 if (front > req->r_reply->front.iov_len) {
1431 pr_warning("get_reply front %d > preallocated %d\n", 1401 pr_warning("get_reply front %d > preallocated %d\n",
1432 front, (int)req->r_reply->front.iov_len); 1402 front, (int)req->r_reply->front.iov_len);
1433 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL); 1403 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
1434 if (IS_ERR(m)) 1404 if (!m)
1435 goto out; 1405 goto out;
1436 ceph_msg_put(req->r_reply); 1406 ceph_msg_put(req->r_reply);
1437 req->r_reply = m; 1407 req->r_reply = m;
@@ -1439,12 +1409,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1439 m = ceph_msg_get(req->r_reply); 1409 m = ceph_msg_get(req->r_reply);
1440 1410
1441 if (data_len > 0) { 1411 if (data_len > 0) {
1442 err = __prepare_pages(con, hdr, req, tid, m); 1412 unsigned data_off = le16_to_cpu(hdr->data_off);
1443 if (err < 0) { 1413 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1414
1415 if (unlikely(req->r_num_pages < want)) {
1416 pr_warning("tid %lld reply %d > expected %d pages\n",
1417 tid, want, m->nr_pages);
1444 *skip = 1; 1418 *skip = 1;
1445 ceph_msg_put(m); 1419 ceph_msg_put(m);
1446 m = ERR_PTR(err); 1420 m = NULL;
1421 goto out;
1447 } 1422 }
1423 m->pages = req->r_pages;
1424 m->nr_pages = req->r_num_pages;
1448 } 1425 }
1449 *skip = 0; 1426 *skip = 0;
1450 req->r_con_filling_msg = ceph_con_get(con); 1427 req->r_con_filling_msg = ceph_con_get(con);
@@ -1466,7 +1443,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1466 1443
1467 switch (type) { 1444 switch (type) {
1468 case CEPH_MSG_OSD_MAP: 1445 case CEPH_MSG_OSD_MAP:
1469 return ceph_msg_new(type, front, 0, 0, NULL); 1446 return ceph_msg_new(type, front, GFP_NOFS);
1470 case CEPH_MSG_OSD_OPREPLY: 1447 case CEPH_MSG_OSD_OPREPLY:
1471 return get_reply(con, hdr, skip); 1448 return get_reply(con, hdr, skip);
1472 default: 1449 default:
@@ -1552,7 +1529,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
1552 return ceph_monc_validate_auth(&osdc->client->monc); 1529 return ceph_monc_validate_auth(&osdc->client->monc);
1553} 1530}
1554 1531
1555const static struct ceph_connection_operations osd_con_ops = { 1532static const struct ceph_connection_operations osd_con_ops = {
1556 .get = get_osd_con, 1533 .get = get_osd_con,
1557 .put = put_osd_con, 1534 .put = put_osd_con,
1558 .dispatch = dispatch, 1535 .dispatch = dispatch,
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index cfdd8f4388b7..ddc656fb5c05 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -706,7 +706,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
706 len, *p, end); 706 len, *p, end);
707 newcrush = crush_decode(*p, min(*p+len, end)); 707 newcrush = crush_decode(*p, min(*p+len, end));
708 if (IS_ERR(newcrush)) 708 if (IS_ERR(newcrush))
709 return ERR_PTR(PTR_ERR(newcrush)); 709 return ERR_CAST(newcrush);
710 } 710 }
711 711
712 /* new flags? */ 712 /* new flags? */
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
index 5f8dbf7c745a..b6859f47d364 100644
--- a/fs/ceph/pagelist.c
+++ b/fs/ceph/pagelist.c
@@ -20,7 +20,7 @@ int ceph_pagelist_release(struct ceph_pagelist *pl)
20 20
21static int ceph_pagelist_addpage(struct ceph_pagelist *pl) 21static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
22{ 22{
23 struct page *page = alloc_page(GFP_NOFS); 23 struct page *page = __page_cache_alloc(GFP_NOFS);
24 if (!page) 24 if (!page)
25 return -ENOMEM; 25 return -ENOMEM;
26 pl->room += PAGE_SIZE; 26 pl->room += PAGE_SIZE;
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index fd56451a871f..8fcc023056c7 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -101,8 +101,8 @@ struct ceph_pg_pool {
101 __le64 snap_seq; /* seq for per-pool snapshot */ 101 __le64 snap_seq; /* seq for per-pool snapshot */
102 __le32 snap_epoch; /* epoch of last snap */ 102 __le32 snap_epoch; /* epoch of last snap */
103 __le32 num_snaps; 103 __le32 num_snaps;
104 __le32 num_removed_snap_intervals; 104 __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
105 __le64 uid; 105 __le64 auid; /* who owns the pg */
106} __attribute__ ((packed)); 106} __attribute__ ((packed));
107 107
108/* 108/*
@@ -208,6 +208,7 @@ enum {
208 /* read */ 208 /* read */
209 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, 209 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
210 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, 210 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
211 CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
211 212
212 /* write */ 213 /* write */
213 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, 214 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
@@ -305,6 +306,22 @@ enum {
305#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ 306#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
306#define EBLACKLISTED ESHUTDOWN /* blacklisted */ 307#define EBLACKLISTED ESHUTDOWN /* blacklisted */
307 308
309/* xattr comparison */
310enum {
311 CEPH_OSD_CMPXATTR_OP_NOP = 0,
312 CEPH_OSD_CMPXATTR_OP_EQ = 1,
313 CEPH_OSD_CMPXATTR_OP_NE = 2,
314 CEPH_OSD_CMPXATTR_OP_GT = 3,
315 CEPH_OSD_CMPXATTR_OP_GTE = 4,
316 CEPH_OSD_CMPXATTR_OP_LT = 5,
317 CEPH_OSD_CMPXATTR_OP_LTE = 6
318};
319
320enum {
321 CEPH_OSD_CMPXATTR_MODE_STRING = 1,
322 CEPH_OSD_CMPXATTR_MODE_U64 = 2
323};
324
308/* 325/*
309 * an individual object operation. each may be accompanied by some data 326 * an individual object operation. each may be accompanied by some data
310 * payload 327 * payload
@@ -321,6 +338,8 @@ struct ceph_osd_op {
321 struct { 338 struct {
322 __le32 name_len; 339 __le32 name_len;
323 __le32 value_len; 340 __le32 value_len;
341 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
342 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
324 } __attribute__ ((packed)) xattr; 343 } __attribute__ ((packed)) xattr;
325 struct { 344 struct {
326 __u8 class_len; 345 __u8 class_len;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index d5114db70453..c0b26b6badba 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -512,7 +512,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
512 struct ceph_cap_snap *capsnap) 512 struct ceph_cap_snap *capsnap)
513{ 513{
514 struct inode *inode = &ci->vfs_inode; 514 struct inode *inode = &ci->vfs_inode;
515 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 515 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
516 516
517 BUG_ON(capsnap->writing); 517 BUG_ON(capsnap->writing);
518 capsnap->size = inode->i_size; 518 capsnap->size = inode->i_size;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9307bbee6fbe..4e0bee240b9d 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -8,14 +8,11 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/parser.h> 10#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h> 11#include <linux/sched.h>
13#include <linux/seq_file.h> 12#include <linux/seq_file.h>
14#include <linux/slab.h> 13#include <linux/slab.h>
15#include <linux/statfs.h> 14#include <linux/statfs.h>
16#include <linux/string.h> 15#include <linux/string.h>
17#include <linux/version.h>
18#include <linux/vmalloc.h>
19 16
20#include "decode.h" 17#include "decode.h"
21#include "super.h" 18#include "super.h"
@@ -107,12 +104,40 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
107static int ceph_syncfs(struct super_block *sb, int wait) 104static int ceph_syncfs(struct super_block *sb, int wait)
108{ 105{
109 dout("sync_fs %d\n", wait); 106 dout("sync_fs %d\n", wait);
110 ceph_osdc_sync(&ceph_client(sb)->osdc); 107 ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
111 ceph_mdsc_sync(&ceph_client(sb)->mdsc); 108 ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
112 dout("sync_fs %d done\n", wait); 109 dout("sync_fs %d done\n", wait);
113 return 0; 110 return 0;
114} 111}
115 112
113static int default_congestion_kb(void)
114{
115 int congestion_kb;
116
117 /*
118 * Copied from NFS
119 *
120 * congestion size, scale with available memory.
121 *
122 * 64MB: 8192k
123 * 128MB: 11585k
124 * 256MB: 16384k
125 * 512MB: 23170k
126 * 1GB: 32768k
127 * 2GB: 46340k
128 * 4GB: 65536k
129 * 8GB: 92681k
130 * 16GB: 131072k
131 *
132 * This allows larger machines to have larger/more transfers.
133 * Limit the default to 256M
134 */
135 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
136 if (congestion_kb > 256*1024)
137 congestion_kb = 256*1024;
138
139 return congestion_kb;
140}
116 141
117/** 142/**
118 * ceph_show_options - Show mount options in /proc/mounts 143 * ceph_show_options - Show mount options in /proc/mounts
@@ -138,6 +163,35 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
138 seq_puts(m, ",nocrc"); 163 seq_puts(m, ",nocrc");
139 if (args->flags & CEPH_OPT_NOASYNCREADDIR) 164 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
140 seq_puts(m, ",noasyncreaddir"); 165 seq_puts(m, ",noasyncreaddir");
166
167 if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
168 seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
169 if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
170 seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
171 if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
172 seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
173 if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
174 seq_printf(m, ",osdkeepalivetimeout=%d",
175 args->osd_keepalive_timeout);
176 if (args->wsize)
177 seq_printf(m, ",wsize=%d", args->wsize);
178 if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
179 seq_printf(m, ",rsize=%d", args->rsize);
180 if (args->congestion_kb != default_congestion_kb())
181 seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
182 if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
183 seq_printf(m, ",caps_wanted_delay_min=%d",
184 args->caps_wanted_delay_min);
185 if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
186 seq_printf(m, ",caps_wanted_delay_max=%d",
187 args->caps_wanted_delay_max);
188 if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
189 seq_printf(m, ",cap_release_safety=%d",
190 args->cap_release_safety);
191 if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
192 seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
193 if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
194 seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
141 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) 195 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
142 seq_printf(m, ",snapdirname=%s", args->snapdir_name); 196 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
143 if (args->name) 197 if (args->name)
@@ -161,35 +215,6 @@ static void ceph_inode_init_once(void *foo)
161 inode_init_once(&ci->vfs_inode); 215 inode_init_once(&ci->vfs_inode);
162} 216}
163 217
164static int default_congestion_kb(void)
165{
166 int congestion_kb;
167
168 /*
169 * Copied from NFS
170 *
171 * congestion size, scale with available memory.
172 *
173 * 64MB: 8192k
174 * 128MB: 11585k
175 * 256MB: 16384k
176 * 512MB: 23170k
177 * 1GB: 32768k
178 * 2GB: 46340k
179 * 4GB: 65536k
180 * 8GB: 92681k
181 * 16GB: 131072k
182 *
183 * This allows larger machines to have larger/more transfers.
184 * Limit the default to 256M
185 */
186 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
187 if (congestion_kb > 256*1024)
188 congestion_kb = 256*1024;
189
190 return congestion_kb;
191}
192
193static int __init init_caches(void) 218static int __init init_caches(void)
194{ 219{
195 ceph_inode_cachep = kmem_cache_create("ceph_inode_info", 220 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
@@ -308,7 +333,9 @@ enum {
308 Opt_osd_idle_ttl, 333 Opt_osd_idle_ttl,
309 Opt_caps_wanted_delay_min, 334 Opt_caps_wanted_delay_min,
310 Opt_caps_wanted_delay_max, 335 Opt_caps_wanted_delay_max,
336 Opt_cap_release_safety,
311 Opt_readdir_max_entries, 337 Opt_readdir_max_entries,
338 Opt_readdir_max_bytes,
312 Opt_congestion_kb, 339 Opt_congestion_kb,
313 Opt_last_int, 340 Opt_last_int,
314 /* int args above */ 341 /* int args above */
@@ -339,7 +366,9 @@ static match_table_t arg_tokens = {
339 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, 366 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
340 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, 367 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
341 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, 368 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
369 {Opt_cap_release_safety, "cap_release_safety=%d"},
342 {Opt_readdir_max_entries, "readdir_max_entries=%d"}, 370 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
371 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
343 {Opt_congestion_kb, "write_congestion_kb=%d"}, 372 {Opt_congestion_kb, "write_congestion_kb=%d"},
344 /* int args above */ 373 /* int args above */
345 {Opt_snapdirname, "snapdirname=%s"}, 374 {Opt_snapdirname, "snapdirname=%s"},
@@ -388,8 +417,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
388 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; 417 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
389 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; 418 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
390 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 419 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
391 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4; 420 args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
392 args->max_readdir = 1024; 421 args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
422 args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
393 args->congestion_kb = default_congestion_kb(); 423 args->congestion_kb = default_congestion_kb();
394 424
395 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ 425 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
@@ -497,6 +527,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
497 case Opt_readdir_max_entries: 527 case Opt_readdir_max_entries:
498 args->max_readdir = intval; 528 args->max_readdir = intval;
499 break; 529 break;
530 case Opt_readdir_max_bytes:
531 args->max_readdir_bytes = intval;
532 break;
500 case Opt_congestion_kb: 533 case Opt_congestion_kb:
501 args->congestion_kb = intval; 534 args->congestion_kb = intval;
502 break; 535 break;
@@ -636,9 +669,17 @@ static void ceph_destroy_client(struct ceph_client *client)
636 669
637 /* unmount */ 670 /* unmount */
638 ceph_mdsc_stop(&client->mdsc); 671 ceph_mdsc_stop(&client->mdsc);
639 ceph_monc_stop(&client->monc);
640 ceph_osdc_stop(&client->osdc); 672 ceph_osdc_stop(&client->osdc);
641 673
674 /*
675 * make sure mds and osd connections close out before destroying
676 * the auth module, which is needed to free those connections'
677 * ceph_authorizers.
678 */
679 ceph_msgr_flush();
680
681 ceph_monc_stop(&client->monc);
682
642 ceph_adjust_min_caps(-client->min_caps); 683 ceph_adjust_min_caps(-client->min_caps);
643 684
644 ceph_debugfs_client_cleanup(client); 685 ceph_debugfs_client_cleanup(client);
@@ -682,9 +723,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
682/* 723/*
683 * true if we have the mon map (and have thus joined the cluster) 724 * true if we have the mon map (and have thus joined the cluster)
684 */ 725 */
685static int have_mon_map(struct ceph_client *client) 726static int have_mon_and_osd_map(struct ceph_client *client)
686{ 727{
687 return client->monc.monmap && client->monc.monmap->epoch; 728 return client->monc.monmap && client->monc.monmap->epoch &&
729 client->osdc.osdmap && client->osdc.osdmap->epoch;
688} 730}
689 731
690/* 732/*
@@ -704,7 +746,7 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
704 dout("open_root_inode opening '%s'\n", path); 746 dout("open_root_inode opening '%s'\n", path);
705 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); 747 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
706 if (IS_ERR(req)) 748 if (IS_ERR(req))
707 return ERR_PTR(PTR_ERR(req)); 749 return ERR_CAST(req);
708 req->r_path1 = kstrdup(path, GFP_NOFS); 750 req->r_path1 = kstrdup(path, GFP_NOFS);
709 req->r_ino1.ino = CEPH_INO_ROOT; 751 req->r_ino1.ino = CEPH_INO_ROOT;
710 req->r_ino1.snap = CEPH_NOSNAP; 752 req->r_ino1.snap = CEPH_NOSNAP;
@@ -762,7 +804,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
762 if (err < 0) 804 if (err < 0)
763 goto out; 805 goto out;
764 806
765 while (!have_mon_map(client)) { 807 while (!have_mon_and_osd_map(client)) {
766 err = -EIO; 808 err = -EIO;
767 if (timeout && time_after_eq(jiffies, started + timeout)) 809 if (timeout && time_after_eq(jiffies, started + timeout))
768 goto out; 810 goto out;
@@ -770,8 +812,8 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
770 /* wait */ 812 /* wait */
771 dout("mount waiting for mon_map\n"); 813 dout("mount waiting for mon_map\n");
772 err = wait_event_interruptible_timeout(client->auth_wq, 814 err = wait_event_interruptible_timeout(client->auth_wq,
773 have_mon_map(client) || (client->auth_err < 0), 815 have_mon_and_osd_map(client) || (client->auth_err < 0),
774 timeout); 816 timeout);
775 if (err == -EINTR || err == -ERESTARTSYS) 817 if (err == -EINTR || err == -ERESTARTSYS)
776 goto out; 818 goto out;
777 if (client->auth_err < 0) { 819 if (client->auth_err < 0) {
@@ -884,6 +926,8 @@ static int ceph_compare_super(struct super_block *sb, void *data)
884/* 926/*
885 * construct our own bdi so we can control readahead, etc. 927 * construct our own bdi so we can control readahead, etc.
886 */ 928 */
929static atomic_long_t bdi_seq = ATOMIC_INIT(0);
930
887static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) 931static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
888{ 932{
889 int err; 933 int err;
@@ -893,7 +937,8 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
893 client->backing_dev_info.ra_pages = 937 client->backing_dev_info.ra_pages =
894 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) 938 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
895 >> PAGE_SHIFT; 939 >> PAGE_SHIFT;
896 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev); 940 err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
941 atomic_long_inc_return(&bdi_seq));
897 if (!err) 942 if (!err)
898 sb->s_bdi = &client->backing_dev_info; 943 sb->s_bdi = &client->backing_dev_info;
899 return err; 944 return err;
@@ -932,9 +977,9 @@ static int ceph_get_sb(struct file_system_type *fs_type,
932 goto out; 977 goto out;
933 } 978 }
934 979
935 if (ceph_client(sb) != client) { 980 if (ceph_sb_to_client(sb) != client) {
936 ceph_destroy_client(client); 981 ceph_destroy_client(client);
937 client = ceph_client(sb); 982 client = ceph_sb_to_client(sb);
938 dout("get_sb got existing client %p\n", client); 983 dout("get_sb got existing client %p\n", client);
939 } else { 984 } else {
940 dout("get_sb using new client %p\n", client); 985 dout("get_sb using new client %p\n", client);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 13513b80d87f..10a4a406e887 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -10,7 +10,6 @@
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/mempool.h> 11#include <linux/mempool.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/slab.h>
14#include <linux/wait.h> 13#include <linux/wait.h>
15#include <linux/writeback.h> 14#include <linux/writeback.h>
16#include <linux/slab.h> 15#include <linux/slab.h>
@@ -52,24 +51,25 @@
52 51
53struct ceph_mount_args { 52struct ceph_mount_args {
54 int sb_flags; 53 int sb_flags;
54 int flags;
55 struct ceph_fsid fsid;
56 struct ceph_entity_addr my_addr;
55 int num_mon; 57 int num_mon;
56 struct ceph_entity_addr *mon_addr; 58 struct ceph_entity_addr *mon_addr;
57 int flags;
58 int mount_timeout; 59 int mount_timeout;
59 int osd_idle_ttl; 60 int osd_idle_ttl;
60 int caps_wanted_delay_min, caps_wanted_delay_max;
61 struct ceph_fsid fsid;
62 struct ceph_entity_addr my_addr;
63 int wsize;
64 int rsize; /* max readahead */
65 int max_readdir; /* max readdir size */
66 int congestion_kb; /* max readdir size */
67 int osd_timeout; 61 int osd_timeout;
68 int osd_keepalive_timeout; 62 int osd_keepalive_timeout;
63 int wsize;
64 int rsize; /* max readahead */
65 int congestion_kb; /* max writeback in flight */
66 int caps_wanted_delay_min, caps_wanted_delay_max;
67 int cap_release_safety;
68 int max_readdir; /* max readdir result (entires) */
69 int max_readdir_bytes; /* max readdir result (bytes) */
69 char *snapdir_name; /* default ".snap" */ 70 char *snapdir_name; /* default ".snap" */
70 char *name; 71 char *name;
71 char *secret; 72 char *secret;
72 int cap_release_safety;
73}; 73};
74 74
75/* 75/*
@@ -80,13 +80,14 @@ struct ceph_mount_args {
80#define CEPH_OSD_KEEPALIVE_DEFAULT 5 80#define CEPH_OSD_KEEPALIVE_DEFAULT 5
81#define CEPH_OSD_IDLE_TTL_DEFAULT 60 81#define CEPH_OSD_IDLE_TTL_DEFAULT 60
82#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ 82#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
83#define CEPH_MAX_READDIR_DEFAULT 1024
84#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
83 85
84#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) 86#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
85#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) 87#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
86 88
87#define CEPH_SNAPDIRNAME_DEFAULT ".snap" 89#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
88#define CEPH_AUTH_NAME_DEFAULT "guest" 90#define CEPH_AUTH_NAME_DEFAULT "guest"
89
90/* 91/*
91 * Delay telling the MDS we no longer want caps, in case we reopen 92 * Delay telling the MDS we no longer want caps, in case we reopen
92 * the file. Delay a minimum amount of time, even if we send a cap 93 * the file. Delay a minimum amount of time, even if we send a cap
@@ -96,6 +97,7 @@ struct ceph_mount_args {
96#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ 97#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
97#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ 98#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
98 99
100#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
99 101
100/* mount state */ 102/* mount state */
101enum { 103enum {
@@ -160,12 +162,6 @@ struct ceph_client {
160#endif 162#endif
161}; 163};
162 164
163static inline struct ceph_client *ceph_client(struct super_block *sb)
164{
165 return sb->s_fs_info;
166}
167
168
169/* 165/*
170 * File i/o capability. This tracks shared state with the metadata 166 * File i/o capability. This tracks shared state with the metadata
171 * server that allows us to cache or writeback attributes or to read 167 * server that allows us to cache or writeback attributes or to read
@@ -814,7 +810,7 @@ extern void ceph_put_cap(struct ceph_cap *cap);
814 810
815extern void ceph_queue_caps_release(struct inode *inode); 811extern void ceph_queue_caps_release(struct inode *inode);
816extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); 812extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
817extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync); 813extern int ceph_fsync(struct file *file, int datasync);
818extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, 814extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
819 struct ceph_mds_session *session); 815 struct ceph_mds_session *session);
820extern int ceph_get_cap_mds(struct inode *inode); 816extern int ceph_get_cap_mds(struct inode *inode);
@@ -871,6 +867,7 @@ extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
871extern void ceph_dentry_lru_add(struct dentry *dn); 867extern void ceph_dentry_lru_add(struct dentry *dn);
872extern void ceph_dentry_lru_touch(struct dentry *dn); 868extern void ceph_dentry_lru_touch(struct dentry *dn);
873extern void ceph_dentry_lru_del(struct dentry *dn); 869extern void ceph_dentry_lru_del(struct dentry *dn);
870extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
874 871
875/* 872/*
876 * our d_ops vary depending on whether the inode is live, 873 * our d_ops vary depending on whether the inode is live,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2845422907fc..68aeebc69681 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -7,7 +7,8 @@
7 7
8static bool ceph_is_valid_xattr(const char *name) 8static bool ceph_is_valid_xattr(const char *name)
9{ 9{
10 return !strncmp(name, XATTR_SECURITY_PREFIX, 10 return !strncmp(name, "ceph.", 5) ||
11 !strncmp(name, XATTR_SECURITY_PREFIX,
11 XATTR_SECURITY_PREFIX_LEN) || 12 XATTR_SECURITY_PREFIX_LEN) ||
12 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 13 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
13 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 14 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
@@ -76,14 +77,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
76} 77}
77 78
78static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { 79static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
79 { true, "user.ceph.dir.entries", ceph_vxattrcb_entries}, 80 { true, "ceph.dir.entries", ceph_vxattrcb_entries},
80 { true, "user.ceph.dir.files", ceph_vxattrcb_files}, 81 { true, "ceph.dir.files", ceph_vxattrcb_files},
81 { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs}, 82 { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
82 { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries}, 83 { true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
83 { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles}, 84 { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
84 { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, 85 { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
85 { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes}, 86 { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
86 { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime}, 87 { true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
87 { true, NULL, NULL } 88 { true, NULL, NULL }
88}; 89};
89 90
@@ -107,7 +108,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
107} 108}
108 109
109static struct ceph_vxattr_cb ceph_file_vxattrs[] = { 110static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
110 { true, "user.ceph.layout", ceph_vxattrcb_layout}, 111 { true, "ceph.layout", ceph_vxattrcb_layout},
111 { NULL, NULL } 112 { NULL, NULL }
112}; 113};
113 114
@@ -186,12 +187,6 @@ static int __set_xattr(struct ceph_inode_info *ci,
186 ci->i_xattrs.names_size -= xattr->name_len; 187 ci->i_xattrs.names_size -= xattr->name_len;
187 ci->i_xattrs.vals_size -= xattr->val_len; 188 ci->i_xattrs.vals_size -= xattr->val_len;
188 } 189 }
189 if (!xattr) {
190 pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
191 &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
192 xattr->val);
193 return -ENOMEM;
194 }
195 ci->i_xattrs.names_size += name_len; 190 ci->i_xattrs.names_size += name_len;
196 ci->i_xattrs.vals_size += val_len; 191 ci->i_xattrs.vals_size += val_len;
197 if (val) 192 if (val)
@@ -574,7 +569,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
574 ci->i_xattrs.version, ci->i_xattrs.index_version); 569 ci->i_xattrs.version, ci->i_xattrs.index_version);
575 570
576 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && 571 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
577 (ci->i_xattrs.index_version > ci->i_xattrs.version)) { 572 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
578 goto list_xattr; 573 goto list_xattr;
579 } else { 574 } else {
580 spin_unlock(&inode->i_lock); 575 spin_unlock(&inode->i_lock);
@@ -622,7 +617,7 @@ out:
622static int ceph_sync_setxattr(struct dentry *dentry, const char *name, 617static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
623 const char *value, size_t size, int flags) 618 const char *value, size_t size, int flags)
624{ 619{
625 struct ceph_client *client = ceph_client(dentry->d_sb); 620 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
626 struct inode *inode = dentry->d_inode; 621 struct inode *inode = dentry->d_inode;
627 struct ceph_inode_info *ci = ceph_inode(inode); 622 struct ceph_inode_info *ci = ceph_inode(inode);
628 struct inode *parent_inode = dentry->d_parent->d_inode; 623 struct inode *parent_inode = dentry->d_parent->d_inode;
@@ -641,7 +636,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
641 return -ENOMEM; 636 return -ENOMEM;
642 err = -ENOMEM; 637 err = -ENOMEM;
643 for (i = 0; i < nr_pages; i++) { 638 for (i = 0; i < nr_pages; i++) {
644 pages[i] = alloc_page(GFP_NOFS); 639 pages[i] = __page_cache_alloc(GFP_NOFS);
645 if (!pages[i]) { 640 if (!pages[i]) {
646 nr_pages = i; 641 nr_pages = i;
647 goto out; 642 goto out;
@@ -779,7 +774,7 @@ out:
779 774
780static int ceph_send_removexattr(struct dentry *dentry, const char *name) 775static int ceph_send_removexattr(struct dentry *dentry, const char *name)
781{ 776{
782 struct ceph_client *client = ceph_client(dentry->d_sb); 777 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
783 struct ceph_mds_client *mdsc = &client->mdsc; 778 struct ceph_mds_client *mdsc = &client->mdsc;
784 struct inode *inode = dentry->d_inode; 779 struct inode *inode = dentry->d_inode;
785 struct inode *parent_inode = dentry->d_parent->d_inode; 780 struct inode *parent_inode = dentry->d_parent->d_inode;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 0242ff9cbf41..a7eb65c84b1c 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -84,7 +84,7 @@ extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
84extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, 84extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
85 size_t write_size, loff_t *poffset); 85 size_t write_size, loff_t *poffset);
86extern int cifs_lock(struct file *, int, struct file_lock *); 86extern int cifs_lock(struct file *, int, struct file_lock *);
87extern int cifs_fsync(struct file *, struct dentry *, int); 87extern int cifs_fsync(struct file *, int);
88extern int cifs_flush(struct file *, fl_owner_t id); 88extern int cifs_flush(struct file *, fl_owner_t id);
89extern int cifs_file_mmap(struct file * , struct vm_area_struct *); 89extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
90extern const struct file_operations cifs_dir_ops; 90extern const struct file_operations cifs_dir_ops;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index a83541ec9713..f1ff785b2292 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1676,7 +1676,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
1676 return rc; 1676 return rc;
1677} 1677}
1678 1678
1679int cifs_fsync(struct file *file, struct dentry *dentry, int datasync) 1679int cifs_fsync(struct file *file, int datasync)
1680{ 1680{
1681 int xid; 1681 int xid;
1682 int rc = 0; 1682 int rc = 0;
@@ -1688,7 +1688,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1688 xid = GetXid(); 1688 xid = GetXid();
1689 1689
1690 cFYI(1, "Sync file - name: %s datasync: 0x%x", 1690 cFYI(1, "Sync file - name: %s datasync: 0x%x",
1691 dentry->d_name.name, datasync); 1691 file->f_path.dentry->d_name.name, datasync);
1692 1692
1693 rc = filemap_write_and_wait(inode->i_mapping); 1693 rc = filemap_write_and_wait(inode->i_mapping);
1694 if (rc == 0) { 1694 if (rc == 0) {
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index d99860a33890..6b443ff43a19 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -11,8 +11,7 @@ extern int coda_fake_statfs;
11 11
12void coda_destroy_inodecache(void); 12void coda_destroy_inodecache(void);
13int coda_init_inodecache(void); 13int coda_init_inodecache(void);
14int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, 14int coda_fsync(struct file *coda_file, int datasync);
15 int datasync);
16void coda_sysctl_init(void); 15void coda_sysctl_init(void);
17void coda_sysctl_clean(void); 16void coda_sysctl_clean(void);
18 17
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 7196077b1688..ad3cd2abeeb4 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -202,10 +202,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
202 return 0; 202 return 0;
203} 203}
204 204
205int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) 205int coda_fsync(struct file *coda_file, int datasync)
206{ 206{
207 struct file *host_file; 207 struct file *host_file;
208 struct inode *coda_inode = coda_dentry->d_inode; 208 struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
209 struct coda_file_info *cfi; 209 struct coda_file_info *cfi;
210 int err = 0; 210 int err = 0;
211 211
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 773f2ce9aa06..ca25d96d45c9 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Pioctl operations for Coda. 2 * Pioctl operations for Coda.
3 * Original version: (C) 1996 Peter Braam 3 * Original version: (C) 1996 Peter Braam
4 * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University 4 * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University
5 * 5 *
6 * Carnegie Mellon encourages users of this code to contribute improvements 6 * Carnegie Mellon encourages users of this code to contribute improvements
@@ -23,21 +23,22 @@
23#include <linux/coda_fs_i.h> 23#include <linux/coda_fs_i.h>
24#include <linux/coda_psdev.h> 24#include <linux/coda_psdev.h>
25 25
26#include <linux/smp_lock.h>
27
26/* pioctl ops */ 28/* pioctl ops */
27static int coda_ioctl_permission(struct inode *inode, int mask); 29static int coda_ioctl_permission(struct inode *inode, int mask);
28static int coda_pioctl(struct inode * inode, struct file * filp, 30static long coda_pioctl(struct file *filp, unsigned int cmd,
29 unsigned int cmd, unsigned long user_data); 31 unsigned long user_data);
30 32
31/* exported from this file */ 33/* exported from this file */
32const struct inode_operations coda_ioctl_inode_operations = 34const struct inode_operations coda_ioctl_inode_operations = {
33{
34 .permission = coda_ioctl_permission, 35 .permission = coda_ioctl_permission,
35 .setattr = coda_setattr, 36 .setattr = coda_setattr,
36}; 37};
37 38
38const struct file_operations coda_ioctl_operations = { 39const struct file_operations coda_ioctl_operations = {
39 .owner = THIS_MODULE, 40 .owner = THIS_MODULE,
40 .ioctl = coda_pioctl, 41 .unlocked_ioctl = coda_pioctl,
41}; 42};
42 43
43/* the coda pioctl inode ops */ 44/* the coda pioctl inode ops */
@@ -46,48 +47,53 @@ static int coda_ioctl_permission(struct inode *inode, int mask)
46 return (mask & MAY_EXEC) ? -EACCES : 0; 47 return (mask & MAY_EXEC) ? -EACCES : 0;
47} 48}
48 49
49static int coda_pioctl(struct inode * inode, struct file * filp, 50static long coda_pioctl(struct file *filp, unsigned int cmd,
50 unsigned int cmd, unsigned long user_data) 51 unsigned long user_data)
51{ 52{
52 struct path path; 53 struct path path;
53 int error; 54 int error;
54 struct PioctlData data; 55 struct PioctlData data;
55 struct inode *target_inode = NULL; 56 struct inode *inode = filp->f_dentry->d_inode;
56 struct coda_inode_info *cnp; 57 struct inode *target_inode = NULL;
58 struct coda_inode_info *cnp;
57 59
58 /* get the Pioctl data arguments from user space */ 60 lock_kernel();
59 if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) { 61
60 return -EINVAL; 62 /* get the Pioctl data arguments from user space */
61 } 63 if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
62 64 error = -EINVAL;
63 /* 65 goto out;
64 * Look up the pathname. Note that the pathname is in
65 * user memory, and namei takes care of this
66 */
67 if (data.follow) {
68 error = user_path(data.path, &path);
69 } else {
70 error = user_lpath(data.path, &path);
71 } 66 }
72 67
73 if ( error ) { 68 /*
74 return error; 69 * Look up the pathname. Note that the pathname is in
75 } else { 70 * user memory, and namei takes care of this
71 */
72 if (data.follow)
73 error = user_path(data.path, &path);
74 else
75 error = user_lpath(data.path, &path);
76
77 if (error)
78 goto out;
79 else
76 target_inode = path.dentry->d_inode; 80 target_inode = path.dentry->d_inode;
77 } 81
78
79 /* return if it is not a Coda inode */ 82 /* return if it is not a Coda inode */
80 if ( target_inode->i_sb != inode->i_sb ) { 83 if (target_inode->i_sb != inode->i_sb) {
81 path_put(&path); 84 path_put(&path);
82 return -EINVAL; 85 error = -EINVAL;
86 goto out;
83 } 87 }
84 88
85 /* now proceed to make the upcall */ 89 /* now proceed to make the upcall */
86 cnp = ITOC(target_inode); 90 cnp = ITOC(target_inode);
87 91
88 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); 92 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
89 93
90 path_put(&path); 94 path_put(&path);
91 return error;
92}
93 95
96out:
97 unlock_kernel();
98 return error;
99}
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index be4392ca2098..66b9cf79c5ba 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -73,8 +73,7 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
73 return mask; 73 return mask;
74} 74}
75 75
76static int coda_psdev_ioctl(struct inode * inode, struct file * filp, 76static long coda_psdev_ioctl(struct file * filp, unsigned int cmd, unsigned long arg)
77 unsigned int cmd, unsigned long arg)
78{ 77{
79 unsigned int data; 78 unsigned int data;
80 79
@@ -344,7 +343,7 @@ static const struct file_operations coda_psdev_fops = {
344 .read = coda_psdev_read, 343 .read = coda_psdev_read,
345 .write = coda_psdev_write, 344 .write = coda_psdev_write,
346 .poll = coda_psdev_poll, 345 .poll = coda_psdev_poll,
347 .ioctl = coda_psdev_ioctl, 346 .unlocked_ioctl = coda_psdev_ioctl,
348 .open = coda_psdev_open, 347 .open = coda_psdev_open,
349 .release = coda_psdev_release, 348 .release = coda_psdev_release,
350}; 349};
diff --git a/fs/compat.c b/fs/compat.c
index 05448730f840..f0b391c50552 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -568,6 +568,79 @@ out:
568 return ret; 568 return ret;
569} 569}
570 570
571/* A write operation does a read from user space and vice versa */
572#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
573
574ssize_t compat_rw_copy_check_uvector(int type,
575 const struct compat_iovec __user *uvector, unsigned long nr_segs,
576 unsigned long fast_segs, struct iovec *fast_pointer,
577 struct iovec **ret_pointer)
578{
579 compat_ssize_t tot_len;
580 struct iovec *iov = *ret_pointer = fast_pointer;
581 ssize_t ret = 0;
582 int seg;
583
584 /*
585 * SuS says "The readv() function *may* fail if the iovcnt argument
586 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
587 * traditionally returned zero for zero segments, so...
588 */
589 if (nr_segs == 0)
590 goto out;
591
592 ret = -EINVAL;
593 if (nr_segs > UIO_MAXIOV || nr_segs < 0)
594 goto out;
595 if (nr_segs > fast_segs) {
596 ret = -ENOMEM;
597 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
598 if (iov == NULL) {
599 *ret_pointer = fast_pointer;
600 goto out;
601 }
602 }
603 *ret_pointer = iov;
604
605 /*
606 * Single unix specification:
607 * We should -EINVAL if an element length is not >= 0 and fitting an
608 * ssize_t. The total length is fitting an ssize_t
609 *
610 * Be careful here because iov_len is a size_t not an ssize_t
611 */
612 tot_len = 0;
613 ret = -EINVAL;
614 for (seg = 0; seg < nr_segs; seg++) {
615 compat_ssize_t tmp = tot_len;
616 compat_uptr_t buf;
617 compat_ssize_t len;
618
619 if (__get_user(len, &uvector->iov_len) ||
620 __get_user(buf, &uvector->iov_base)) {
621 ret = -EFAULT;
622 goto out;
623 }
624 if (len < 0) /* size_t not fitting in compat_ssize_t .. */
625 goto out;
626 tot_len += len;
627 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
628 goto out;
629 if (!access_ok(vrfy_dir(type), buf, len)) {
630 ret = -EFAULT;
631 goto out;
632 }
633 iov->iov_base = compat_ptr(buf);
634 iov->iov_len = (compat_size_t) len;
635 uvector++;
636 iov++;
637 }
638 ret = tot_len;
639
640out:
641 return ret;
642}
643
571static inline long 644static inline long
572copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) 645copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
573{ 646{
@@ -600,7 +673,7 @@ compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
600 iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); 673 iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
601 ret = copy_iocb(nr, iocb, iocb64); 674 ret = copy_iocb(nr, iocb, iocb64);
602 if (!ret) 675 if (!ret)
603 ret = sys_io_submit(ctx_id, nr, iocb64); 676 ret = do_io_submit(ctx_id, nr, iocb64, 1);
604 return ret; 677 return ret;
605} 678}
606 679
@@ -1077,70 +1150,21 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1077{ 1150{
1078 compat_ssize_t tot_len; 1151 compat_ssize_t tot_len;
1079 struct iovec iovstack[UIO_FASTIOV]; 1152 struct iovec iovstack[UIO_FASTIOV];
1080 struct iovec *iov=iovstack, *vector; 1153 struct iovec *iov;
1081 ssize_t ret; 1154 ssize_t ret;
1082 int seg;
1083 io_fn_t fn; 1155 io_fn_t fn;
1084 iov_fn_t fnv; 1156 iov_fn_t fnv;
1085 1157
1086 /*
1087 * SuS says "The readv() function *may* fail if the iovcnt argument
1088 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
1089 * traditionally returned zero for zero segments, so...
1090 */
1091 ret = 0;
1092 if (nr_segs == 0)
1093 goto out;
1094
1095 /*
1096 * First get the "struct iovec" from user memory and
1097 * verify all the pointers
1098 */
1099 ret = -EINVAL; 1158 ret = -EINVAL;
1100 if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0))
1101 goto out;
1102 if (!file->f_op) 1159 if (!file->f_op)
1103 goto out; 1160 goto out;
1104 if (nr_segs > UIO_FASTIOV) { 1161
1105 ret = -ENOMEM;
1106 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
1107 if (!iov)
1108 goto out;
1109 }
1110 ret = -EFAULT; 1162 ret = -EFAULT;
1111 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) 1163 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
1112 goto out; 1164 goto out;
1113 1165
1114 /* 1166 tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
1115 * Single unix specification: 1167 UIO_FASTIOV, iovstack, &iov);
1116 * We should -EINVAL if an element length is not >= 0 and fitting an
1117 * ssize_t. The total length is fitting an ssize_t
1118 *
1119 * Be careful here because iov_len is a size_t not an ssize_t
1120 */
1121 tot_len = 0;
1122 vector = iov;
1123 ret = -EINVAL;
1124 for (seg = 0 ; seg < nr_segs; seg++) {
1125 compat_ssize_t tmp = tot_len;
1126 compat_ssize_t len;
1127 compat_uptr_t buf;
1128
1129 if (__get_user(len, &uvector->iov_len) ||
1130 __get_user(buf, &uvector->iov_base)) {
1131 ret = -EFAULT;
1132 goto out;
1133 }
1134 if (len < 0) /* size_t not fitting an compat_ssize_t .. */
1135 goto out;
1136 tot_len += len;
1137 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
1138 goto out;
1139 vector->iov_base = compat_ptr(buf);
1140 vector->iov_len = (compat_size_t) len;
1141 uvector++;
1142 vector++;
1143 }
1144 if (tot_len == 0) { 1168 if (tot_len == 0) {
1145 ret = 0; 1169 ret = 0;
1146 goto out; 1170 goto out;
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index c8af2d91174b..41645142b88b 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -72,16 +72,11 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
72 if (!sd) 72 if (!sd)
73 return -EINVAL; 73 return -EINVAL;
74 74
75 sd_iattr = sd->s_iattr; 75 error = simple_setattr(dentry, iattr);
76
77 error = inode_change_ok(inode, iattr);
78 if (error)
79 return error;
80
81 error = inode_setattr(inode, iattr);
82 if (error) 76 if (error)
83 return error; 77 return error;
84 78
79 sd_iattr = sd->s_iattr;
85 if (!sd_iattr) { 80 if (!sd_iattr) {
86 /* setting attributes for the first time, allocate now */ 81 /* setting attributes for the first time, allocate now */
87 sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); 82 sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 4d74fc72c195..0210898458b2 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -277,8 +277,10 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"
277DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); 277DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
278DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); 278DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
279 279
280DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n");
281
280/* 282/*
281 * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value 283 * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
282 * 284 *
283 * These functions are exactly the same as the above functions (but use a hex 285 * These functions are exactly the same as the above functions (but use a hex
284 * output for the decimal challenged). For details look at the above unsigned 286 * output for the decimal challenged). For details look at the above unsigned
@@ -357,6 +359,23 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode,
357} 359}
358EXPORT_SYMBOL_GPL(debugfs_create_x32); 360EXPORT_SYMBOL_GPL(debugfs_create_x32);
359 361
362/**
363 * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value
364 * @name: a pointer to a string containing the name of the file to create.
365 * @mode: the permission that the file should have
366 * @parent: a pointer to the parent dentry for this file. This should be a
367 * directory dentry if set. If this parameter is %NULL, then the
368 * file will be created in the root of the debugfs filesystem.
369 * @value: a pointer to the variable that the file should read to and write
370 * from.
371 */
372struct dentry *debugfs_create_x64(const char *name, mode_t mode,
373 struct dentry *parent, u64 *value)
374{
375 return debugfs_create_file(name, mode, parent, value, &fops_x64);
376}
377EXPORT_SYMBOL_GPL(debugfs_create_x64);
378
360 379
361static int debugfs_size_t_set(void *data, u64 val) 380static int debugfs_size_t_set(void *data, u64 val)
362{ 381{
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e82adc2debb7..7600aacf531d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -82,6 +82,8 @@ struct dio {
82 int reap_counter; /* rate limit reaping */ 82 int reap_counter; /* rate limit reaping */
83 get_block_t *get_block; /* block mapping function */ 83 get_block_t *get_block; /* block mapping function */
84 dio_iodone_t *end_io; /* IO completion function */ 84 dio_iodone_t *end_io; /* IO completion function */
85 dio_submit_t *submit_io; /* IO submission function */
86 loff_t logical_offset_in_bio; /* current first logical block in bio */
85 sector_t final_block_in_bio; /* current final block in bio + 1 */ 87 sector_t final_block_in_bio; /* current final block in bio + 1 */
86 sector_t next_block_for_io; /* next block to be put under IO, 88 sector_t next_block_for_io; /* next block to be put under IO,
87 in dio_blocks units */ 89 in dio_blocks units */
@@ -96,6 +98,7 @@ struct dio {
96 unsigned cur_page_offset; /* Offset into it, in bytes */ 98 unsigned cur_page_offset; /* Offset into it, in bytes */
97 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ 99 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */
98 sector_t cur_page_block; /* Where it starts */ 100 sector_t cur_page_block; /* Where it starts */
101 loff_t cur_page_fs_offset; /* Offset in file */
99 102
100 /* BIO completion state */ 103 /* BIO completion state */
101 spinlock_t bio_lock; /* protects BIO fields below */ 104 spinlock_t bio_lock; /* protects BIO fields below */
@@ -300,6 +303,26 @@ static void dio_bio_end_io(struct bio *bio, int error)
300 spin_unlock_irqrestore(&dio->bio_lock, flags); 303 spin_unlock_irqrestore(&dio->bio_lock, flags);
301} 304}
302 305
306/**
307 * dio_end_io - handle the end io action for the given bio
308 * @bio: The direct io bio that's being completed
309 * @error: Error if there was one
310 *
311 * This is meant to be called by any filesystem that uses its own dio_submit_t
312 * so that the DIO specific endio actions are dealt with after the filesystem
313 * has done its completion work.
314 */
315void dio_end_io(struct bio *bio, int error)
316{
317 struct dio *dio = bio->bi_private;
318
319 if (dio->is_async)
320 dio_bio_end_aio(bio, error);
321 else
322 dio_bio_end_io(bio, error);
323}
324EXPORT_SYMBOL_GPL(dio_end_io);
325
303static int 326static int
304dio_bio_alloc(struct dio *dio, struct block_device *bdev, 327dio_bio_alloc(struct dio *dio, struct block_device *bdev,
305 sector_t first_sector, int nr_vecs) 328 sector_t first_sector, int nr_vecs)
@@ -316,6 +339,7 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
316 bio->bi_end_io = dio_bio_end_io; 339 bio->bi_end_io = dio_bio_end_io;
317 340
318 dio->bio = bio; 341 dio->bio = bio;
342 dio->logical_offset_in_bio = dio->cur_page_fs_offset;
319 return 0; 343 return 0;
320} 344}
321 345
@@ -340,10 +364,15 @@ static void dio_bio_submit(struct dio *dio)
340 if (dio->is_async && dio->rw == READ) 364 if (dio->is_async && dio->rw == READ)
341 bio_set_pages_dirty(bio); 365 bio_set_pages_dirty(bio);
342 366
343 submit_bio(dio->rw, bio); 367 if (dio->submit_io)
368 dio->submit_io(dio->rw, bio, dio->inode,
369 dio->logical_offset_in_bio);
370 else
371 submit_bio(dio->rw, bio);
344 372
345 dio->bio = NULL; 373 dio->bio = NULL;
346 dio->boundary = 0; 374 dio->boundary = 0;
375 dio->logical_offset_in_bio = 0;
347} 376}
348 377
349/* 378/*
@@ -603,10 +632,26 @@ static int dio_send_cur_page(struct dio *dio)
603 int ret = 0; 632 int ret = 0;
604 633
605 if (dio->bio) { 634 if (dio->bio) {
635 loff_t cur_offset = dio->block_in_file << dio->blkbits;
636 loff_t bio_next_offset = dio->logical_offset_in_bio +
637 dio->bio->bi_size;
638
606 /* 639 /*
607 * See whether this new request is contiguous with the old 640 * See whether this new request is contiguous with the old.
641 *
642 * Btrfs cannot handle having logically non-contiguous requests
643 * submitted. For example if you have
644 *
645 * Logical: [0-4095][HOLE][8192-12287]
646 * Physical: [0-4095] [4096-8191]
647 *
648 * We cannot submit those pages together as one BIO. So if our
649 * current logical offset in the file does not equal what would
650 * be the next logical offset in the bio, submit the bio we
651 * have.
608 */ 652 */
609 if (dio->final_block_in_bio != dio->cur_page_block) 653 if (dio->final_block_in_bio != dio->cur_page_block ||
654 cur_offset != bio_next_offset)
610 dio_bio_submit(dio); 655 dio_bio_submit(dio);
611 /* 656 /*
612 * Submit now if the underlying fs is about to perform a 657 * Submit now if the underlying fs is about to perform a
@@ -701,6 +746,7 @@ submit_page_section(struct dio *dio, struct page *page,
701 dio->cur_page_offset = offset; 746 dio->cur_page_offset = offset;
702 dio->cur_page_len = len; 747 dio->cur_page_len = len;
703 dio->cur_page_block = blocknr; 748 dio->cur_page_block = blocknr;
749 dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
704out: 750out:
705 return ret; 751 return ret;
706} 752}
@@ -935,7 +981,7 @@ static ssize_t
935direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 981direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
936 const struct iovec *iov, loff_t offset, unsigned long nr_segs, 982 const struct iovec *iov, loff_t offset, unsigned long nr_segs,
937 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, 983 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
938 struct dio *dio) 984 dio_submit_t submit_io, struct dio *dio)
939{ 985{
940 unsigned long user_addr; 986 unsigned long user_addr;
941 unsigned long flags; 987 unsigned long flags;
@@ -952,6 +998,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
952 998
953 dio->get_block = get_block; 999 dio->get_block = get_block;
954 dio->end_io = end_io; 1000 dio->end_io = end_io;
1001 dio->submit_io = submit_io;
955 dio->final_block_in_bio = -1; 1002 dio->final_block_in_bio = -1;
956 dio->next_block_for_io = -1; 1003 dio->next_block_for_io = -1;
957 1004
@@ -1008,7 +1055,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1008 } 1055 }
1009 } /* end iovec loop */ 1056 } /* end iovec loop */
1010 1057
1011 if (ret == -ENOTBLK && (rw & WRITE)) { 1058 if (ret == -ENOTBLK) {
1012 /* 1059 /*
1013 * The remaining part of the request will be 1060 * The remaining part of the request will be
1014 * be handled by buffered I/O when we return 1061 * be handled by buffered I/O when we return
@@ -1087,30 +1134,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1087 return ret; 1134 return ret;
1088} 1135}
1089 1136
1090/*
1091 * This is a library function for use by filesystem drivers.
1092 *
1093 * The locking rules are governed by the flags parameter:
1094 * - if the flags value contains DIO_LOCKING we use a fancy locking
1095 * scheme for dumb filesystems.
1096 * For writes this function is called under i_mutex and returns with
1097 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1098 * taken and dropped again before returning.
1099 * For reads and writes i_alloc_sem is taken in shared mode and released
1100 * on I/O completion (which may happen asynchronously after returning to
1101 * the caller).
1102 *
1103 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1104 * internal locking but rather rely on the filesystem to synchronize
1105 * direct I/O reads/writes versus each other and truncate.
1106 * For reads and writes both i_mutex and i_alloc_sem are not held on
1107 * entry and are never taken.
1108 */
1109ssize_t 1137ssize_t
1110__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1138__blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
1111 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1139 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1112 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1140 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1113 int flags) 1141 dio_submit_t submit_io, int flags)
1114{ 1142{
1115 int seg; 1143 int seg;
1116 size_t size; 1144 size_t size;
@@ -1197,11 +1225,49 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1197 (end > i_size_read(inode))); 1225 (end > i_size_read(inode)));
1198 1226
1199 retval = direct_io_worker(rw, iocb, inode, iov, offset, 1227 retval = direct_io_worker(rw, iocb, inode, iov, offset,
1200 nr_segs, blkbits, get_block, end_io, dio); 1228 nr_segs, blkbits, get_block, end_io,
1229 submit_io, dio);
1230
1231out:
1232 return retval;
1233}
1234EXPORT_SYMBOL(__blockdev_direct_IO_newtrunc);
1235
1236/*
1237 * This is a library function for use by filesystem drivers.
1238 *
1239 * The locking rules are governed by the flags parameter:
1240 * - if the flags value contains DIO_LOCKING we use a fancy locking
1241 * scheme for dumb filesystems.
1242 * For writes this function is called under i_mutex and returns with
1243 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1244 * taken and dropped again before returning.
1245 * For reads and writes i_alloc_sem is taken in shared mode and released
1246 * on I/O completion (which may happen asynchronously after returning to
1247 * the caller).
1248 *
1249 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1250 * internal locking but rather rely on the filesystem to synchronize
1251 * direct I/O reads/writes versus each other and truncate.
1252 * For reads and writes both i_mutex and i_alloc_sem are not held on
1253 * entry and are never taken.
1254 */
1255ssize_t
1256__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1257 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1258 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1259 dio_submit_t submit_io, int flags)
1260{
1261 ssize_t retval;
1201 1262
1263 retval = __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov,
1264 offset, nr_segs, get_block, end_io, submit_io, flags);
1202 /* 1265 /*
1203 * In case of error extending write may have instantiated a few 1266 * In case of error extending write may have instantiated a few
1204 * blocks outside i_size. Trim these off again for DIO_LOCKING. 1267 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1268 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in
1269 * their own manner. This is a further example of where the old
1270 * truncate sequence is inadequate.
1205 * 1271 *
1206 * NOTE: filesystems with their own locking have to handle this 1272 * NOTE: filesystems with their own locking have to handle this
1207 * on their own. 1273 * on their own.
@@ -1209,12 +1275,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1209 if (flags & DIO_LOCKING) { 1275 if (flags & DIO_LOCKING) {
1210 if (unlikely((rw & WRITE) && retval < 0)) { 1276 if (unlikely((rw & WRITE) && retval < 0)) {
1211 loff_t isize = i_size_read(inode); 1277 loff_t isize = i_size_read(inode);
1278 loff_t end = offset + iov_length(iov, nr_segs);
1279
1212 if (end > isize) 1280 if (end > isize)
1213 vmtruncate(inode, isize); 1281 vmtruncate(inode, isize);
1214 } 1282 }
1215 } 1283 }
1216 1284
1217out:
1218 return retval; 1285 return retval;
1219} 1286}
1220EXPORT_SYMBOL(__blockdev_direct_IO); 1287EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 3bdddbcc785f..e8fcf4e2ed7d 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -274,7 +274,7 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
274} 274}
275 275
276static int 276static int
277ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) 277ecryptfs_fsync(struct file *file, int datasync)
278{ 278{
279 return vfs_fsync(ecryptfs_file_to_lower(file), datasync); 279 return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
280} 280}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 65dee2f336ae..31ef5252f0fe 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -805,7 +805,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
805 - (ia->ia_size & ~PAGE_CACHE_MASK)); 805 - (ia->ia_size & ~PAGE_CACHE_MASK));
806 806
807 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 807 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
808 rc = vmtruncate(inode, ia->ia_size); 808 rc = simple_setsize(inode, ia->ia_size);
809 if (rc) 809 if (rc)
810 goto out; 810 goto out;
811 lower_ia->ia_size = ia->ia_size; 811 lower_ia->ia_size = ia->ia_size;
@@ -830,7 +830,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
830 goto out; 830 goto out;
831 } 831 }
832 } 832 }
833 vmtruncate(inode, ia->ia_size); 833 simple_setsize(inode, ia->ia_size);
834 rc = ecryptfs_write_inode_size_to_metadata(inode); 834 rc = ecryptfs_write_inode_size_to_metadata(inode);
835 if (rc) { 835 if (rc) {
836 printk(KERN_ERR "Problem with " 836 printk(KERN_ERR "Problem with "
diff --git a/fs/exec.c b/fs/exec.c
index e6e94c626c2c..e19de6a80339 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -242,9 +242,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
242 * use STACK_TOP because that can depend on attributes which aren't 242 * use STACK_TOP because that can depend on attributes which aren't
243 * configured yet. 243 * configured yet.
244 */ 244 */
245 BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
245 vma->vm_end = STACK_TOP_MAX; 246 vma->vm_end = STACK_TOP_MAX;
246 vma->vm_start = vma->vm_end - PAGE_SIZE; 247 vma->vm_start = vma->vm_end - PAGE_SIZE;
247 vma->vm_flags = VM_STACK_FLAGS; 248 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
248 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 249 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
249 INIT_LIST_HEAD(&vma->anon_vma_chain); 250 INIT_LIST_HEAD(&vma->anon_vma_chain);
250 err = insert_vm_struct(mm, vma); 251 err = insert_vm_struct(mm, vma);
@@ -616,6 +617,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
616 else if (executable_stack == EXSTACK_DISABLE_X) 617 else if (executable_stack == EXSTACK_DISABLE_X)
617 vm_flags &= ~VM_EXEC; 618 vm_flags &= ~VM_EXEC;
618 vm_flags |= mm->def_flags; 619 vm_flags |= mm->def_flags;
620 vm_flags |= VM_STACK_INCOMPLETE_SETUP;
619 621
620 ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, 622 ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
621 vm_flags); 623 vm_flags);
@@ -630,6 +632,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
630 goto out_unlock; 632 goto out_unlock;
631 } 633 }
632 634
635 /* mprotect_fixup is overkill to remove the temporary stack flags */
636 vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
637
633 stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ 638 stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
634 stack_size = vma->vm_end - vma->vm_start; 639 stack_size = vma->vm_end - vma->vm_start;
635 /* 640 /*
@@ -763,7 +768,6 @@ static int de_thread(struct task_struct *tsk)
763 struct signal_struct *sig = tsk->signal; 768 struct signal_struct *sig = tsk->signal;
764 struct sighand_struct *oldsighand = tsk->sighand; 769 struct sighand_struct *oldsighand = tsk->sighand;
765 spinlock_t *lock = &oldsighand->siglock; 770 spinlock_t *lock = &oldsighand->siglock;
766 int count;
767 771
768 if (thread_group_empty(tsk)) 772 if (thread_group_empty(tsk))
769 goto no_thread_group; 773 goto no_thread_group;
@@ -780,13 +784,13 @@ static int de_thread(struct task_struct *tsk)
780 spin_unlock_irq(lock); 784 spin_unlock_irq(lock);
781 return -EAGAIN; 785 return -EAGAIN;
782 } 786 }
787
783 sig->group_exit_task = tsk; 788 sig->group_exit_task = tsk;
784 zap_other_threads(tsk); 789 sig->notify_count = zap_other_threads(tsk);
790 if (!thread_group_leader(tsk))
791 sig->notify_count--;
785 792
786 /* Account for the thread group leader hanging around: */ 793 while (sig->notify_count) {
787 count = thread_group_leader(tsk) ? 1 : 2;
788 sig->notify_count = count;
789 while (atomic_read(&sig->count) > count) {
790 __set_current_state(TASK_UNINTERRUPTIBLE); 794 __set_current_state(TASK_UNINTERRUPTIBLE);
791 spin_unlock_irq(lock); 795 spin_unlock_irq(lock);
792 schedule(); 796 schedule();
@@ -1657,12 +1661,15 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
1657 struct task_struct *tsk = current; 1661 struct task_struct *tsk = current;
1658 struct mm_struct *mm = tsk->mm; 1662 struct mm_struct *mm = tsk->mm;
1659 struct completion *vfork_done; 1663 struct completion *vfork_done;
1660 int core_waiters; 1664 int core_waiters = -EBUSY;
1661 1665
1662 init_completion(&core_state->startup); 1666 init_completion(&core_state->startup);
1663 core_state->dumper.task = tsk; 1667 core_state->dumper.task = tsk;
1664 core_state->dumper.next = NULL; 1668 core_state->dumper.next = NULL;
1665 core_waiters = zap_threads(tsk, mm, core_state, exit_code); 1669
1670 down_write(&mm->mmap_sem);
1671 if (!mm->core_state)
1672 core_waiters = zap_threads(tsk, mm, core_state, exit_code);
1666 up_write(&mm->mmap_sem); 1673 up_write(&mm->mmap_sem);
1667 1674
1668 if (unlikely(core_waiters < 0)) 1675 if (unlikely(core_waiters < 0))
@@ -1782,21 +1789,61 @@ static void wait_for_dump_helpers(struct file *file)
1782} 1789}
1783 1790
1784 1791
1792/*
1793 * umh_pipe_setup
1794 * helper function to customize the process used
1795 * to collect the core in userspace. Specifically
1796 * it sets up a pipe and installs it as fd 0 (stdin)
1797 * for the process. Returns 0 on success, or
1798 * PTR_ERR on failure.
1799 * Note that it also sets the core limit to 1. This
1800 * is a special value that we use to trap recursive
1801 * core dumps
1802 */
1803static int umh_pipe_setup(struct subprocess_info *info)
1804{
1805 struct file *rp, *wp;
1806 struct fdtable *fdt;
1807 struct coredump_params *cp = (struct coredump_params *)info->data;
1808 struct files_struct *cf = current->files;
1809
1810 wp = create_write_pipe(0);
1811 if (IS_ERR(wp))
1812 return PTR_ERR(wp);
1813
1814 rp = create_read_pipe(wp, 0);
1815 if (IS_ERR(rp)) {
1816 free_write_pipe(wp);
1817 return PTR_ERR(rp);
1818 }
1819
1820 cp->file = wp;
1821
1822 sys_close(0);
1823 fd_install(0, rp);
1824 spin_lock(&cf->file_lock);
1825 fdt = files_fdtable(cf);
1826 FD_SET(0, fdt->open_fds);
1827 FD_CLR(0, fdt->close_on_exec);
1828 spin_unlock(&cf->file_lock);
1829
1830 /* and disallow core files too */
1831 current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
1832
1833 return 0;
1834}
1835
1785void do_coredump(long signr, int exit_code, struct pt_regs *regs) 1836void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1786{ 1837{
1787 struct core_state core_state; 1838 struct core_state core_state;
1788 char corename[CORENAME_MAX_SIZE + 1]; 1839 char corename[CORENAME_MAX_SIZE + 1];
1789 struct mm_struct *mm = current->mm; 1840 struct mm_struct *mm = current->mm;
1790 struct linux_binfmt * binfmt; 1841 struct linux_binfmt * binfmt;
1791 struct inode * inode;
1792 const struct cred *old_cred; 1842 const struct cred *old_cred;
1793 struct cred *cred; 1843 struct cred *cred;
1794 int retval = 0; 1844 int retval = 0;
1795 int flag = 0; 1845 int flag = 0;
1796 int ispipe = 0; 1846 int ispipe;
1797 char **helper_argv = NULL;
1798 int helper_argc = 0;
1799 int dump_count = 0;
1800 static atomic_t core_dump_count = ATOMIC_INIT(0); 1847 static atomic_t core_dump_count = ATOMIC_INIT(0);
1801 struct coredump_params cprm = { 1848 struct coredump_params cprm = {
1802 .signr = signr, 1849 .signr = signr,
@@ -1815,23 +1862,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1815 binfmt = mm->binfmt; 1862 binfmt = mm->binfmt;
1816 if (!binfmt || !binfmt->core_dump) 1863 if (!binfmt || !binfmt->core_dump)
1817 goto fail; 1864 goto fail;
1818 1865 if (!__get_dumpable(cprm.mm_flags))
1819 cred = prepare_creds();
1820 if (!cred) {
1821 retval = -ENOMEM;
1822 goto fail; 1866 goto fail;
1823 }
1824 1867
1825 down_write(&mm->mmap_sem); 1868 cred = prepare_creds();
1826 /* 1869 if (!cred)
1827 * If another thread got here first, or we are not dumpable, bail out.
1828 */
1829 if (mm->core_state || !__get_dumpable(cprm.mm_flags)) {
1830 up_write(&mm->mmap_sem);
1831 put_cred(cred);
1832 goto fail; 1870 goto fail;
1833 }
1834
1835 /* 1871 /*
1836 * We cannot trust fsuid as being the "true" uid of the 1872 * We cannot trust fsuid as being the "true" uid of the
1837 * process nor do we know its entire history. We only know it 1873 * process nor do we know its entire history. We only know it
@@ -1844,10 +1880,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1844 } 1880 }
1845 1881
1846 retval = coredump_wait(exit_code, &core_state); 1882 retval = coredump_wait(exit_code, &core_state);
1847 if (retval < 0) { 1883 if (retval < 0)
1848 put_cred(cred); 1884 goto fail_creds;
1849 goto fail;
1850 }
1851 1885
1852 old_cred = override_creds(cred); 1886 old_cred = override_creds(cred);
1853 1887
@@ -1865,19 +1899,19 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1865 ispipe = format_corename(corename, signr); 1899 ispipe = format_corename(corename, signr);
1866 unlock_kernel(); 1900 unlock_kernel();
1867 1901
1868 if ((!ispipe) && (cprm.limit < binfmt->min_coredump))
1869 goto fail_unlock;
1870
1871 if (ispipe) { 1902 if (ispipe) {
1872 if (cprm.limit == 0) { 1903 int dump_count;
1904 char **helper_argv;
1905
1906 if (cprm.limit == 1) {
1873 /* 1907 /*
1874 * Normally core limits are irrelevant to pipes, since 1908 * Normally core limits are irrelevant to pipes, since
1875 * we're not writing to the file system, but we use 1909 * we're not writing to the file system, but we use
1876 * cprm.limit of 0 here as a speacial value. Any 1910 * cprm.limit of 1 here as a speacial value. Any
1877 * non-zero limit gets set to RLIM_INFINITY below, but 1911 * non-1 limit gets set to RLIM_INFINITY below, but
1878 * a limit of 0 skips the dump. This is a consistent 1912 * a limit of 0 skips the dump. This is a consistent
1879 * way to catch recursive crashes. We can still crash 1913 * way to catch recursive crashes. We can still crash
1880 * if the core_pattern binary sets RLIM_CORE = !0 1914 * if the core_pattern binary sets RLIM_CORE = !1
1881 * but it runs as root, and can do lots of stupid things 1915 * but it runs as root, and can do lots of stupid things
1882 * Note that we use task_tgid_vnr here to grab the pid 1916 * Note that we use task_tgid_vnr here to grab the pid
1883 * of the process group leader. That way we get the 1917 * of the process group leader. That way we get the
@@ -1885,11 +1919,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1885 * core_pattern process dies. 1919 * core_pattern process dies.
1886 */ 1920 */
1887 printk(KERN_WARNING 1921 printk(KERN_WARNING
1888 "Process %d(%s) has RLIMIT_CORE set to 0\n", 1922 "Process %d(%s) has RLIMIT_CORE set to 1\n",
1889 task_tgid_vnr(current), current->comm); 1923 task_tgid_vnr(current), current->comm);
1890 printk(KERN_WARNING "Aborting core\n"); 1924 printk(KERN_WARNING "Aborting core\n");
1891 goto fail_unlock; 1925 goto fail_unlock;
1892 } 1926 }
1927 cprm.limit = RLIM_INFINITY;
1893 1928
1894 dump_count = atomic_inc_return(&core_dump_count); 1929 dump_count = atomic_inc_return(&core_dump_count);
1895 if (core_pipe_limit && (core_pipe_limit < dump_count)) { 1930 if (core_pipe_limit && (core_pipe_limit < dump_count)) {
@@ -1899,71 +1934,74 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1899 goto fail_dropcount; 1934 goto fail_dropcount;
1900 } 1935 }
1901 1936
1902 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); 1937 helper_argv = argv_split(GFP_KERNEL, corename+1, NULL);
1903 if (!helper_argv) { 1938 if (!helper_argv) {
1904 printk(KERN_WARNING "%s failed to allocate memory\n", 1939 printk(KERN_WARNING "%s failed to allocate memory\n",
1905 __func__); 1940 __func__);
1906 goto fail_dropcount; 1941 goto fail_dropcount;
1907 } 1942 }
1908 1943
1909 cprm.limit = RLIM_INFINITY; 1944 retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
1910 1945 NULL, UMH_WAIT_EXEC, umh_pipe_setup,
1911 /* SIGPIPE can happen, but it's just never processed */ 1946 NULL, &cprm);
1912 if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL, 1947 argv_free(helper_argv);
1913 &cprm.file)) { 1948 if (retval) {
1914 printk(KERN_INFO "Core dump to %s pipe failed\n", 1949 printk(KERN_INFO "Core dump to %s pipe failed\n",
1915 corename); 1950 corename);
1916 goto fail_dropcount; 1951 goto close_fail;
1917 } 1952 }
1918 } else 1953 } else {
1954 struct inode *inode;
1955
1956 if (cprm.limit < binfmt->min_coredump)
1957 goto fail_unlock;
1958
1919 cprm.file = filp_open(corename, 1959 cprm.file = filp_open(corename,
1920 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 1960 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1921 0600); 1961 0600);
1922 if (IS_ERR(cprm.file)) 1962 if (IS_ERR(cprm.file))
1923 goto fail_dropcount; 1963 goto fail_unlock;
1924 inode = cprm.file->f_path.dentry->d_inode;
1925 if (inode->i_nlink > 1)
1926 goto close_fail; /* multiple links - don't dump */
1927 if (!ispipe && d_unhashed(cprm.file->f_path.dentry))
1928 goto close_fail;
1929
1930 /* AK: actually i see no reason to not allow this for named pipes etc.,
1931 but keep the previous behaviour for now. */
1932 if (!ispipe && !S_ISREG(inode->i_mode))
1933 goto close_fail;
1934 /*
1935 * Dont allow local users get cute and trick others to coredump
1936 * into their pre-created files:
1937 * Note, this is not relevant for pipes
1938 */
1939 if (!ispipe && (inode->i_uid != current_fsuid()))
1940 goto close_fail;
1941 if (!cprm.file->f_op)
1942 goto close_fail;
1943 if (!cprm.file->f_op->write)
1944 goto close_fail;
1945 if (!ispipe &&
1946 do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0)
1947 goto close_fail;
1948 1964
1949 retval = binfmt->core_dump(&cprm); 1965 inode = cprm.file->f_path.dentry->d_inode;
1966 if (inode->i_nlink > 1)
1967 goto close_fail;
1968 if (d_unhashed(cprm.file->f_path.dentry))
1969 goto close_fail;
1970 /*
1971 * AK: actually i see no reason to not allow this for named
1972 * pipes etc, but keep the previous behaviour for now.
1973 */
1974 if (!S_ISREG(inode->i_mode))
1975 goto close_fail;
1976 /*
1977 * Dont allow local users get cute and trick others to coredump
1978 * into their pre-created files.
1979 */
1980 if (inode->i_uid != current_fsuid())
1981 goto close_fail;
1982 if (!cprm.file->f_op || !cprm.file->f_op->write)
1983 goto close_fail;
1984 if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
1985 goto close_fail;
1986 }
1950 1987
1988 retval = binfmt->core_dump(&cprm);
1951 if (retval) 1989 if (retval)
1952 current->signal->group_exit_code |= 0x80; 1990 current->signal->group_exit_code |= 0x80;
1953close_fail: 1991
1954 if (ispipe && core_pipe_limit) 1992 if (ispipe && core_pipe_limit)
1955 wait_for_dump_helpers(cprm.file); 1993 wait_for_dump_helpers(cprm.file);
1956 filp_close(cprm.file, NULL); 1994close_fail:
1995 if (cprm.file)
1996 filp_close(cprm.file, NULL);
1957fail_dropcount: 1997fail_dropcount:
1958 if (dump_count) 1998 if (ispipe)
1959 atomic_dec(&core_dump_count); 1999 atomic_dec(&core_dump_count);
1960fail_unlock: 2000fail_unlock:
1961 if (helper_argv) 2001 coredump_finish(mm);
1962 argv_free(helper_argv);
1963
1964 revert_creds(old_cred); 2002 revert_creds(old_cred);
2003fail_creds:
1965 put_cred(cred); 2004 put_cred(cred);
1966 coredump_finish(mm);
1967fail: 2005fail:
1968 return; 2006 return;
1969} 2007}
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 4cfab1cc75c0..d91e9d829bc1 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -608,7 +608,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
608 de->inode_no = cpu_to_le64(parent->i_ino); 608 de->inode_no = cpu_to_le64(parent->i_ino);
609 memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR)); 609 memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
610 exofs_set_de_type(de, inode); 610 exofs_set_de_type(de, inode);
611 kunmap_atomic(page, KM_USER0); 611 kunmap_atomic(kaddr, KM_USER0);
612 err = exofs_commit_chunk(page, 0, chunk_size); 612 err = exofs_commit_chunk(page, 0, chunk_size);
613fail: 613fail:
614 page_cache_release(page); 614 page_cache_release(page);
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 839b9dc1e70f..fef6899be397 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -40,12 +40,11 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
40 return 0; 40 return 0;
41} 41}
42 42
43static int exofs_file_fsync(struct file *filp, struct dentry *dentry, 43static int exofs_file_fsync(struct file *filp, int datasync)
44 int datasync)
45{ 44{
46 int ret; 45 int ret;
47 struct address_space *mapping = filp->f_mapping; 46 struct address_space *mapping = filp->f_mapping;
48 struct inode *inode = dentry->d_inode; 47 struct inode *inode = mapping->host;
49 struct super_block *sb; 48 struct super_block *sb;
50 49
51 ret = filemap_write_and_wait(mapping); 50 ret = filemap_write_and_wait(mapping);
@@ -66,7 +65,7 @@ static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
66 65
67static int exofs_flush(struct file *file, fl_owner_t id) 66static int exofs_flush(struct file *file, fl_owner_t id)
68{ 67{
69 exofs_file_fsync(file, file->f_path.dentry, 1); 68 exofs_file_fsync(file, 1);
70 /* TODO: Flush the OSD target */ 69 /* TODO: Flush the OSD target */
71 return 0; 70 return 0;
72} 71}
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index d7c6afa79754..4bb6ef822e46 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -755,6 +755,21 @@ static int exofs_write_end(struct file *file, struct address_space *mapping,
755 return ret; 755 return ret;
756} 756}
757 757
758static int exofs_releasepage(struct page *page, gfp_t gfp)
759{
760 EXOFS_DBGMSG("page 0x%lx\n", page->index);
761 WARN_ON(1);
762 return try_to_free_buffers(page);
763}
764
765static void exofs_invalidatepage(struct page *page, unsigned long offset)
766{
767 EXOFS_DBGMSG("page_has_buffers=>%d\n", page_has_buffers(page));
768 WARN_ON(1);
769
770 block_invalidatepage(page, offset);
771}
772
758const struct address_space_operations exofs_aops = { 773const struct address_space_operations exofs_aops = {
759 .readpage = exofs_readpage, 774 .readpage = exofs_readpage,
760 .readpages = exofs_readpages, 775 .readpages = exofs_readpages,
@@ -762,6 +777,21 @@ const struct address_space_operations exofs_aops = {
762 .writepages = exofs_writepages, 777 .writepages = exofs_writepages,
763 .write_begin = exofs_write_begin_export, 778 .write_begin = exofs_write_begin_export,
764 .write_end = exofs_write_end, 779 .write_end = exofs_write_end,
780 .releasepage = exofs_releasepage,
781 .set_page_dirty = __set_page_dirty_nobuffers,
782 .invalidatepage = exofs_invalidatepage,
783
784 /* Not implemented Yet */
785 .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
786 .direct_IO = NULL, /* TODO: Should be trivial to do */
787
788 /* With these NULL has special meaning or default is not exported */
789 .sync_page = NULL,
790 .get_xip_mem = NULL,
791 .migratepage = NULL,
792 .launder_page = NULL,
793 .is_partially_uptodate = NULL,
794 .error_remove_page = NULL,
765}; 795};
766 796
767/****************************************************************************** 797/******************************************************************************
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 0b038e47ad2f..52b34f1d2738 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -122,7 +122,6 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *);
122extern void ext2_delete_inode (struct inode *); 122extern void ext2_delete_inode (struct inode *);
123extern int ext2_sync_inode (struct inode *); 123extern int ext2_sync_inode (struct inode *);
124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); 124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
125extern void ext2_truncate (struct inode *);
126extern int ext2_setattr (struct dentry *, struct iattr *); 125extern int ext2_setattr (struct dentry *, struct iattr *);
127extern void ext2_set_inode_flags(struct inode *inode); 126extern void ext2_set_inode_flags(struct inode *inode);
128extern void ext2_get_inode_flags(struct ext2_inode_info *); 127extern void ext2_get_inode_flags(struct ext2_inode_info *);
@@ -155,7 +154,7 @@ extern void ext2_write_super (struct super_block *);
155extern const struct file_operations ext2_dir_operations; 154extern const struct file_operations ext2_dir_operations;
156 155
157/* file.c */ 156/* file.c */
158extern int ext2_fsync(struct file *file, struct dentry *dentry, int datasync); 157extern int ext2_fsync(struct file *file, int datasync);
159extern const struct inode_operations ext2_file_inode_operations; 158extern const struct inode_operations ext2_file_inode_operations;
160extern const struct file_operations ext2_file_operations; 159extern const struct file_operations ext2_file_operations;
161extern const struct file_operations ext2_xip_file_operations; 160extern const struct file_operations ext2_xip_file_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5d198d0697fb..49eec9456c5b 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -40,13 +40,13 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
40 return 0; 40 return 0;
41} 41}
42 42
43int ext2_fsync(struct file *file, struct dentry *dentry, int datasync) 43int ext2_fsync(struct file *file, int datasync)
44{ 44{
45 int ret; 45 int ret;
46 struct super_block *sb = dentry->d_inode->i_sb; 46 struct super_block *sb = file->f_mapping->host->i_sb;
47 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; 47 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
48 48
49 ret = simple_fsync(file, dentry, datasync); 49 ret = generic_file_fsync(file, datasync);
50 if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) { 50 if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) {
51 /* We don't really know where the IO error happened... */ 51 /* We don't really know where the IO error happened... */
52 ext2_error(sb, __func__, 52 ext2_error(sb, __func__,
@@ -95,7 +95,6 @@ const struct file_operations ext2_xip_file_operations = {
95#endif 95#endif
96 96
97const struct inode_operations ext2_file_inode_operations = { 97const struct inode_operations ext2_file_inode_operations = {
98 .truncate = ext2_truncate,
99#ifdef CONFIG_EXT2_FS_XATTR 98#ifdef CONFIG_EXT2_FS_XATTR
100 .setxattr = generic_setxattr, 99 .setxattr = generic_setxattr,
101 .getxattr = generic_getxattr, 100 .getxattr = generic_getxattr,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 527c46d9bc1f..19214435b752 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -54,6 +54,18 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode)
54 inode->i_blocks - ea_blocks == 0); 54 inode->i_blocks - ea_blocks == 0);
55} 55}
56 56
57static void ext2_truncate_blocks(struct inode *inode, loff_t offset);
58
59static void ext2_write_failed(struct address_space *mapping, loff_t to)
60{
61 struct inode *inode = mapping->host;
62
63 if (to > inode->i_size) {
64 truncate_pagecache(inode, to, inode->i_size);
65 ext2_truncate_blocks(inode, inode->i_size);
66 }
67}
68
57/* 69/*
58 * Called at the last iput() if i_nlink is zero. 70 * Called at the last iput() if i_nlink is zero.
59 */ 71 */
@@ -71,7 +83,7 @@ void ext2_delete_inode (struct inode * inode)
71 83
72 inode->i_size = 0; 84 inode->i_size = 0;
73 if (inode->i_blocks) 85 if (inode->i_blocks)
74 ext2_truncate (inode); 86 ext2_truncate_blocks(inode, 0);
75 ext2_free_inode (inode); 87 ext2_free_inode (inode);
76 88
77 return; 89 return;
@@ -757,8 +769,8 @@ int __ext2_write_begin(struct file *file, struct address_space *mapping,
757 loff_t pos, unsigned len, unsigned flags, 769 loff_t pos, unsigned len, unsigned flags,
758 struct page **pagep, void **fsdata) 770 struct page **pagep, void **fsdata)
759{ 771{
760 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 772 return block_write_begin_newtrunc(file, mapping, pos, len, flags,
761 ext2_get_block); 773 pagep, fsdata, ext2_get_block);
762} 774}
763 775
764static int 776static int
@@ -766,8 +778,25 @@ ext2_write_begin(struct file *file, struct address_space *mapping,
766 loff_t pos, unsigned len, unsigned flags, 778 loff_t pos, unsigned len, unsigned flags,
767 struct page **pagep, void **fsdata) 779 struct page **pagep, void **fsdata)
768{ 780{
781 int ret;
782
769 *pagep = NULL; 783 *pagep = NULL;
770 return __ext2_write_begin(file, mapping, pos, len, flags, pagep,fsdata); 784 ret = __ext2_write_begin(file, mapping, pos, len, flags, pagep, fsdata);
785 if (ret < 0)
786 ext2_write_failed(mapping, pos + len);
787 return ret;
788}
789
790static int ext2_write_end(struct file *file, struct address_space *mapping,
791 loff_t pos, unsigned len, unsigned copied,
792 struct page *page, void *fsdata)
793{
794 int ret;
795
796 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
797 if (ret < len)
798 ext2_write_failed(mapping, pos + len);
799 return ret;
771} 800}
772 801
773static int 802static int
@@ -775,13 +804,18 @@ ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
775 loff_t pos, unsigned len, unsigned flags, 804 loff_t pos, unsigned len, unsigned flags,
776 struct page **pagep, void **fsdata) 805 struct page **pagep, void **fsdata)
777{ 806{
807 int ret;
808
778 /* 809 /*
779 * Dir-in-pagecache still uses ext2_write_begin. Would have to rework 810 * Dir-in-pagecache still uses ext2_write_begin. Would have to rework
780 * directory handling code to pass around offsets rather than struct 811 * directory handling code to pass around offsets rather than struct
781 * pages in order to make this work easily. 812 * pages in order to make this work easily.
782 */ 813 */
783 return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 814 ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, pagep,
784 ext2_get_block); 815 fsdata, ext2_get_block);
816 if (ret < 0)
817 ext2_write_failed(mapping, pos + len);
818 return ret;
785} 819}
786 820
787static int ext2_nobh_writepage(struct page *page, 821static int ext2_nobh_writepage(struct page *page,
@@ -800,10 +834,15 @@ ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
800 loff_t offset, unsigned long nr_segs) 834 loff_t offset, unsigned long nr_segs)
801{ 835{
802 struct file *file = iocb->ki_filp; 836 struct file *file = iocb->ki_filp;
803 struct inode *inode = file->f_mapping->host; 837 struct address_space *mapping = file->f_mapping;
804 838 struct inode *inode = mapping->host;
805 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 839 ssize_t ret;
806 offset, nr_segs, ext2_get_block, NULL); 840
841 ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
842 iov, offset, nr_segs, ext2_get_block, NULL);
843 if (ret < 0 && (rw & WRITE))
844 ext2_write_failed(mapping, offset + iov_length(iov, nr_segs));
845 return ret;
807} 846}
808 847
809static int 848static int
@@ -818,7 +857,7 @@ const struct address_space_operations ext2_aops = {
818 .writepage = ext2_writepage, 857 .writepage = ext2_writepage,
819 .sync_page = block_sync_page, 858 .sync_page = block_sync_page,
820 .write_begin = ext2_write_begin, 859 .write_begin = ext2_write_begin,
821 .write_end = generic_write_end, 860 .write_end = ext2_write_end,
822 .bmap = ext2_bmap, 861 .bmap = ext2_bmap,
823 .direct_IO = ext2_direct_IO, 862 .direct_IO = ext2_direct_IO,
824 .writepages = ext2_writepages, 863 .writepages = ext2_writepages,
@@ -1027,7 +1066,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de
1027 ext2_free_data(inode, p, q); 1066 ext2_free_data(inode, p, q);
1028} 1067}
1029 1068
1030void ext2_truncate(struct inode *inode) 1069static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
1031{ 1070{
1032 __le32 *i_data = EXT2_I(inode)->i_data; 1071 __le32 *i_data = EXT2_I(inode)->i_data;
1033 struct ext2_inode_info *ei = EXT2_I(inode); 1072 struct ext2_inode_info *ei = EXT2_I(inode);
@@ -1039,27 +1078,8 @@ void ext2_truncate(struct inode *inode)
1039 int n; 1078 int n;
1040 long iblock; 1079 long iblock;
1041 unsigned blocksize; 1080 unsigned blocksize;
1042
1043 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1044 S_ISLNK(inode->i_mode)))
1045 return;
1046 if (ext2_inode_is_fast_symlink(inode))
1047 return;
1048 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1049 return;
1050
1051 blocksize = inode->i_sb->s_blocksize; 1081 blocksize = inode->i_sb->s_blocksize;
1052 iblock = (inode->i_size + blocksize-1) 1082 iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
1053 >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
1054
1055 if (mapping_is_xip(inode->i_mapping))
1056 xip_truncate_page(inode->i_mapping, inode->i_size);
1057 else if (test_opt(inode->i_sb, NOBH))
1058 nobh_truncate_page(inode->i_mapping,
1059 inode->i_size, ext2_get_block);
1060 else
1061 block_truncate_page(inode->i_mapping,
1062 inode->i_size, ext2_get_block);
1063 1083
1064 n = ext2_block_to_path(inode, iblock, offsets, NULL); 1084 n = ext2_block_to_path(inode, iblock, offsets, NULL);
1065 if (n == 0) 1085 if (n == 0)
@@ -1127,6 +1147,62 @@ do_indirects:
1127 ext2_discard_reservation(inode); 1147 ext2_discard_reservation(inode);
1128 1148
1129 mutex_unlock(&ei->truncate_mutex); 1149 mutex_unlock(&ei->truncate_mutex);
1150}
1151
1152static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
1153{
1154 /*
1155 * XXX: it seems like a bug here that we don't allow
1156 * IS_APPEND inode to have blocks-past-i_size trimmed off.
1157 * review and fix this.
1158 *
1159 * Also would be nice to be able to handle IO errors and such,
1160 * but that's probably too much to ask.
1161 */
1162 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1163 S_ISLNK(inode->i_mode)))
1164 return;
1165 if (ext2_inode_is_fast_symlink(inode))
1166 return;
1167 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1168 return;
1169 __ext2_truncate_blocks(inode, offset);
1170}
1171
1172int ext2_setsize(struct inode *inode, loff_t newsize)
1173{
1174 loff_t oldsize;
1175 int error;
1176
1177 error = inode_newsize_ok(inode, newsize);
1178 if (error)
1179 return error;
1180
1181 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1182 S_ISLNK(inode->i_mode)))
1183 return -EINVAL;
1184 if (ext2_inode_is_fast_symlink(inode))
1185 return -EINVAL;
1186 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1187 return -EPERM;
1188
1189 if (mapping_is_xip(inode->i_mapping))
1190 error = xip_truncate_page(inode->i_mapping, newsize);
1191 else if (test_opt(inode->i_sb, NOBH))
1192 error = nobh_truncate_page(inode->i_mapping,
1193 newsize, ext2_get_block);
1194 else
1195 error = block_truncate_page(inode->i_mapping,
1196 newsize, ext2_get_block);
1197 if (error)
1198 return error;
1199
1200 oldsize = inode->i_size;
1201 i_size_write(inode, newsize);
1202 truncate_pagecache(inode, oldsize, newsize);
1203
1204 __ext2_truncate_blocks(inode, newsize);
1205
1130 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 1206 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1131 if (inode_needs_sync(inode)) { 1207 if (inode_needs_sync(inode)) {
1132 sync_mapping_buffers(inode->i_mapping); 1208 sync_mapping_buffers(inode->i_mapping);
@@ -1134,6 +1210,8 @@ do_indirects:
1134 } else { 1210 } else {
1135 mark_inode_dirty(inode); 1211 mark_inode_dirty(inode);
1136 } 1212 }
1213
1214 return 0;
1137} 1215}
1138 1216
1139static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino, 1217static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino,
@@ -1474,8 +1552,15 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1474 if (error) 1552 if (error)
1475 return error; 1553 return error;
1476 } 1554 }
1477 error = inode_setattr(inode, iattr); 1555 if (iattr->ia_valid & ATTR_SIZE) {
1478 if (!error && (iattr->ia_valid & ATTR_MODE)) 1556 error = ext2_setsize(inode, iattr->ia_size);
1557 if (error)
1558 return error;
1559 }
1560 generic_setattr(inode, iattr);
1561 if (iattr->ia_valid & ATTR_MODE)
1479 error = ext2_acl_chmod(inode); 1562 error = ext2_acl_chmod(inode);
1563 mark_inode_dirty(inode);
1564
1480 return error; 1565 return error;
1481} 1566}
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 71e9eb1fa696..7ff43f4a59cd 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -119,6 +119,8 @@ static void ext2_put_super (struct super_block * sb)
119 int i; 119 int i;
120 struct ext2_sb_info *sbi = EXT2_SB(sb); 120 struct ext2_sb_info *sbi = EXT2_SB(sb);
121 121
122 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
123
122 if (sb->s_dirt) 124 if (sb->s_dirt)
123 ext2_write_super(sb); 125 ext2_write_super(sb);
124 126
@@ -1063,6 +1065,12 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1063 sb->s_op = &ext2_sops; 1065 sb->s_op = &ext2_sops;
1064 sb->s_export_op = &ext2_export_ops; 1066 sb->s_export_op = &ext2_export_ops;
1065 sb->s_xattr = ext2_xattr_handlers; 1067 sb->s_xattr = ext2_xattr_handlers;
1068
1069#ifdef CONFIG_QUOTA
1070 sb->dq_op = &dquot_operations;
1071 sb->s_qcop = &dquot_quotactl_ops;
1072#endif
1073
1066 root = ext2_iget(sb, EXT2_ROOT_INO); 1074 root = ext2_iget(sb, EXT2_ROOT_INO);
1067 if (IS_ERR(root)) { 1075 if (IS_ERR(root)) {
1068 ret = PTR_ERR(root); 1076 ret = PTR_ERR(root);
@@ -1241,6 +1249,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1241 spin_unlock(&sbi->s_lock); 1249 spin_unlock(&sbi->s_lock);
1242 return 0; 1250 return 0;
1243 } 1251 }
1252
1244 /* 1253 /*
1245 * OK, we are remounting a valid rw partition rdonly, so set 1254 * OK, we are remounting a valid rw partition rdonly, so set
1246 * the rdonly flag and then mark the partition as valid again. 1255 * the rdonly flag and then mark the partition as valid again.
@@ -1248,6 +1257,13 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1248 es->s_state = cpu_to_le16(sbi->s_mount_state); 1257 es->s_state = cpu_to_le16(sbi->s_mount_state);
1249 es->s_mtime = cpu_to_le32(get_seconds()); 1258 es->s_mtime = cpu_to_le32(get_seconds());
1250 spin_unlock(&sbi->s_lock); 1259 spin_unlock(&sbi->s_lock);
1260
1261 err = dquot_suspend(sb, -1);
1262 if (err < 0) {
1263 spin_lock(&sbi->s_lock);
1264 goto restore_opts;
1265 }
1266
1251 ext2_sync_super(sb, es, 1); 1267 ext2_sync_super(sb, es, 1);
1252 } else { 1268 } else {
1253 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb, 1269 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb,
@@ -1269,8 +1285,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1269 if (!ext2_setup_super (sb, es, 0)) 1285 if (!ext2_setup_super (sb, es, 0))
1270 sb->s_flags &= ~MS_RDONLY; 1286 sb->s_flags &= ~MS_RDONLY;
1271 spin_unlock(&sbi->s_lock); 1287 spin_unlock(&sbi->s_lock);
1288
1272 ext2_write_super(sb); 1289 ext2_write_super(sb);
1290
1291 dquot_resume(sb, -1);
1273 } 1292 }
1293
1274 return 0; 1294 return 0;
1275restore_opts: 1295restore_opts:
1276 sbi->s_mount_opt = old_opts.s_mount_opt; 1296 sbi->s_mount_opt = old_opts.s_mount_opt;
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 373fa90c796a..e2e72c367cf6 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -297,7 +297,7 @@ static void free_rb_tree_fname(struct rb_root *root)
297 kfree (old); 297 kfree (old);
298 } 298 }
299 if (!parent) 299 if (!parent)
300 root->rb_node = NULL; 300 *root = RB_ROOT;
301 else if (parent->rb_left == n) 301 else if (parent->rb_left == n)
302 parent->rb_left = NULL; 302 parent->rb_left = NULL;
303 else if (parent->rb_right == n) 303 else if (parent->rb_right == n)
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index fcf7487734b6..d7e9f74dc3a6 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -43,9 +43,9 @@
43 * inode to disk. 43 * inode to disk.
44 */ 44 */
45 45
46int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) 46int ext3_sync_file(struct file *file, int datasync)
47{ 47{
48 struct inode *inode = dentry->d_inode; 48 struct inode *inode = file->f_mapping->host;
49 struct ext3_inode_info *ei = EXT3_I(inode); 49 struct ext3_inode_info *ei = EXT3_I(inode);
50 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; 50 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
51 int ret, needs_barrier = 0; 51 int ret, needs_barrier = 0;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 0fc1293d0e96..6c953bb255e7 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -410,6 +410,8 @@ static void ext3_put_super (struct super_block * sb)
410 struct ext3_super_block *es = sbi->s_es; 410 struct ext3_super_block *es = sbi->s_es;
411 int i, err; 411 int i, err;
412 412
413 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
414
413 lock_kernel(); 415 lock_kernel();
414 416
415 ext3_xattr_put_super(sb); 417 ext3_xattr_put_super(sb);
@@ -748,7 +750,7 @@ static int ext3_release_dquot(struct dquot *dquot);
748static int ext3_mark_dquot_dirty(struct dquot *dquot); 750static int ext3_mark_dquot_dirty(struct dquot *dquot);
749static int ext3_write_info(struct super_block *sb, int type); 751static int ext3_write_info(struct super_block *sb, int type);
750static int ext3_quota_on(struct super_block *sb, int type, int format_id, 752static int ext3_quota_on(struct super_block *sb, int type, int format_id,
751 char *path, int remount); 753 char *path);
752static int ext3_quota_on_mount(struct super_block *sb, int type); 754static int ext3_quota_on_mount(struct super_block *sb, int type);
753static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, 755static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
754 size_t len, loff_t off); 756 size_t len, loff_t off);
@@ -767,12 +769,12 @@ static const struct dquot_operations ext3_quota_operations = {
767 769
768static const struct quotactl_ops ext3_qctl_operations = { 770static const struct quotactl_ops ext3_qctl_operations = {
769 .quota_on = ext3_quota_on, 771 .quota_on = ext3_quota_on,
770 .quota_off = vfs_quota_off, 772 .quota_off = dquot_quota_off,
771 .quota_sync = vfs_quota_sync, 773 .quota_sync = dquot_quota_sync,
772 .get_info = vfs_get_dqinfo, 774 .get_info = dquot_get_dqinfo,
773 .set_info = vfs_set_dqinfo, 775 .set_info = dquot_set_dqinfo,
774 .get_dqblk = vfs_get_dqblk, 776 .get_dqblk = dquot_get_dqblk,
775 .set_dqblk = vfs_set_dqblk 777 .set_dqblk = dquot_set_dqblk
776}; 778};
777#endif 779#endif
778 780
@@ -1527,7 +1529,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1527 /* Turn quotas off */ 1529 /* Turn quotas off */
1528 for (i = 0; i < MAXQUOTAS; i++) { 1530 for (i = 0; i < MAXQUOTAS; i++) {
1529 if (sb_dqopt(sb)->files[i]) 1531 if (sb_dqopt(sb)->files[i])
1530 vfs_quota_off(sb, i, 0); 1532 dquot_quota_off(sb, i);
1531 } 1533 }
1532#endif 1534#endif
1533 sb->s_flags = s_flags; /* Restore MS_RDONLY status */ 1535 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -2551,6 +2553,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2551 ext3_fsblk_t n_blocks_count = 0; 2553 ext3_fsblk_t n_blocks_count = 0;
2552 unsigned long old_sb_flags; 2554 unsigned long old_sb_flags;
2553 struct ext3_mount_options old_opts; 2555 struct ext3_mount_options old_opts;
2556 int enable_quota = 0;
2554 int err; 2557 int err;
2555#ifdef CONFIG_QUOTA 2558#ifdef CONFIG_QUOTA
2556 int i; 2559 int i;
@@ -2597,6 +2600,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2597 } 2600 }
2598 2601
2599 if (*flags & MS_RDONLY) { 2602 if (*flags & MS_RDONLY) {
2603 err = dquot_suspend(sb, -1);
2604 if (err < 0)
2605 goto restore_opts;
2606
2600 /* 2607 /*
2601 * First of all, the unconditional stuff we have to do 2608 * First of all, the unconditional stuff we have to do
2602 * to disable replay of the journal when we next remount 2609 * to disable replay of the journal when we next remount
@@ -2651,6 +2658,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2651 goto restore_opts; 2658 goto restore_opts;
2652 if (!ext3_setup_super (sb, es, 0)) 2659 if (!ext3_setup_super (sb, es, 0))
2653 sb->s_flags &= ~MS_RDONLY; 2660 sb->s_flags &= ~MS_RDONLY;
2661 enable_quota = 1;
2654 } 2662 }
2655 } 2663 }
2656#ifdef CONFIG_QUOTA 2664#ifdef CONFIG_QUOTA
@@ -2662,6 +2670,9 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2662#endif 2670#endif
2663 unlock_super(sb); 2671 unlock_super(sb);
2664 unlock_kernel(); 2672 unlock_kernel();
2673
2674 if (enable_quota)
2675 dquot_resume(sb, -1);
2665 return 0; 2676 return 0;
2666restore_opts: 2677restore_opts:
2667 sb->s_flags = old_sb_flags; 2678 sb->s_flags = old_sb_flags;
@@ -2851,24 +2862,21 @@ static int ext3_write_info(struct super_block *sb, int type)
2851 */ 2862 */
2852static int ext3_quota_on_mount(struct super_block *sb, int type) 2863static int ext3_quota_on_mount(struct super_block *sb, int type)
2853{ 2864{
2854 return vfs_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], 2865 return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
2855 EXT3_SB(sb)->s_jquota_fmt, type); 2866 EXT3_SB(sb)->s_jquota_fmt, type);
2856} 2867}
2857 2868
2858/* 2869/*
2859 * Standard function to be called on quota_on 2870 * Standard function to be called on quota_on
2860 */ 2871 */
2861static int ext3_quota_on(struct super_block *sb, int type, int format_id, 2872static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2862 char *name, int remount) 2873 char *name)
2863{ 2874{
2864 int err; 2875 int err;
2865 struct path path; 2876 struct path path;
2866 2877
2867 if (!test_opt(sb, QUOTA)) 2878 if (!test_opt(sb, QUOTA))
2868 return -EINVAL; 2879 return -EINVAL;
2869 /* When remounting, no checks are needed and in fact, name is NULL */
2870 if (remount)
2871 return vfs_quota_on(sb, type, format_id, name, remount);
2872 2880
2873 err = kern_path(name, LOOKUP_FOLLOW, &path); 2881 err = kern_path(name, LOOKUP_FOLLOW, &path);
2874 if (err) 2882 if (err)
@@ -2906,7 +2914,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2906 } 2914 }
2907 } 2915 }
2908 2916
2909 err = vfs_quota_on_path(sb, type, format_id, &path); 2917 err = dquot_quota_on_path(sb, type, format_id, &path);
2910 path_put(&path); 2918 path_put(&path);
2911 return err; 2919 return err;
2912} 2920}
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d2f37a5516c7..95b7594c76f9 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -591,14 +591,15 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
591 ret = ext4_mb_new_blocks(handle, &ar, errp); 591 ret = ext4_mb_new_blocks(handle, &ar, errp);
592 if (count) 592 if (count)
593 *count = ar.len; 593 *count = ar.len;
594
595 /* 594 /*
596 * Account for the allocated meta blocks 595 * Account for the allocated meta blocks. We will never
596 * fail EDQUOT for metadata, but we do account for it.
597 */ 597 */
598 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { 598 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
599 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 599 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
600 EXT4_I(inode)->i_allocated_meta_blocks += ar.len; 600 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
601 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 601 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
602 dquot_alloc_block_nofail(inode, ar.len);
602 } 603 }
603 return ret; 604 return ret;
604} 605}
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 538c48655084..5b6973fbf1bd 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -72,9 +72,9 @@ static int add_system_zone(struct ext4_sb_info *sbi,
72 else if (start_blk >= (entry->start_blk + entry->count)) 72 else if (start_blk >= (entry->start_blk + entry->count))
73 n = &(*n)->rb_right; 73 n = &(*n)->rb_right;
74 else { 74 else {
75 if (start_blk + count > (entry->start_blk + 75 if (start_blk + count > (entry->start_blk +
76 entry->count)) 76 entry->count))
77 entry->count = (start_blk + count - 77 entry->count = (start_blk + count -
78 entry->start_blk); 78 entry->start_blk);
79 new_node = *n; 79 new_node = *n;
80 new_entry = rb_entry(new_node, struct ext4_system_zone, 80 new_entry = rb_entry(new_node, struct ext4_system_zone,
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 86cb6d86a048..ea5e6cb7e2a5 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,11 +83,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
83 error_msg = "inode out of bounds"; 83 error_msg = "inode out of bounds";
84 84
85 if (error_msg != NULL) 85 if (error_msg != NULL)
86 __ext4_error(dir->i_sb, function, 86 ext4_error_inode(function, dir,
87 "bad entry in directory #%lu: %s - block=%llu" 87 "bad entry in directory: %s - block=%llu"
88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", 88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
89 dir->i_ino, error_msg, 89 error_msg, (unsigned long long) bh->b_blocknr,
90 (unsigned long long) bh->b_blocknr,
91 (unsigned) (offset%bh->b_size), offset, 90 (unsigned) (offset%bh->b_size), offset,
92 le32_to_cpu(de->inode), 91 le32_to_cpu(de->inode),
93 rlen, de->name_len); 92 rlen, de->name_len);
@@ -111,7 +110,7 @@ static int ext4_readdir(struct file *filp,
111 110
112 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 111 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
113 EXT4_FEATURE_COMPAT_DIR_INDEX) && 112 EXT4_FEATURE_COMPAT_DIR_INDEX) &&
114 ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) || 113 ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
115 ((inode->i_size >> sb->s_blocksize_bits) == 1))) { 114 ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
116 err = ext4_dx_readdir(filp, dirent, filldir); 115 err = ext4_dx_readdir(filp, dirent, filldir);
117 if (err != ERR_BAD_DX_DIR) { 116 if (err != ERR_BAD_DX_DIR) {
@@ -122,20 +121,20 @@ static int ext4_readdir(struct file *filp,
122 * We don't set the inode dirty flag since it's not 121 * We don't set the inode dirty flag since it's not
123 * critical that it get flushed back to the disk. 122 * critical that it get flushed back to the disk.
124 */ 123 */
125 EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL; 124 ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX);
126 } 125 }
127 stored = 0; 126 stored = 0;
128 offset = filp->f_pos & (sb->s_blocksize - 1); 127 offset = filp->f_pos & (sb->s_blocksize - 1);
129 128
130 while (!error && !stored && filp->f_pos < inode->i_size) { 129 while (!error && !stored && filp->f_pos < inode->i_size) {
131 ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); 130 struct ext4_map_blocks map;
132 struct buffer_head map_bh;
133 struct buffer_head *bh = NULL; 131 struct buffer_head *bh = NULL;
134 132
135 map_bh.b_state = 0; 133 map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
136 err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0); 134 map.m_len = 1;
135 err = ext4_map_blocks(NULL, inode, &map, 0);
137 if (err > 0) { 136 if (err > 0) {
138 pgoff_t index = map_bh.b_blocknr >> 137 pgoff_t index = map.m_pblk >>
139 (PAGE_CACHE_SHIFT - inode->i_blkbits); 138 (PAGE_CACHE_SHIFT - inode->i_blkbits);
140 if (!ra_has_index(&filp->f_ra, index)) 139 if (!ra_has_index(&filp->f_ra, index))
141 page_cache_sync_readahead( 140 page_cache_sync_readahead(
@@ -143,7 +142,7 @@ static int ext4_readdir(struct file *filp,
143 &filp->f_ra, filp, 142 &filp->f_ra, filp,
144 index, 1); 143 index, 1);
145 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 144 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
146 bh = ext4_bread(NULL, inode, blk, 0, &err); 145 bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
147 } 146 }
148 147
149 /* 148 /*
@@ -152,9 +151,8 @@ static int ext4_readdir(struct file *filp,
152 */ 151 */
153 if (!bh) { 152 if (!bh) {
154 if (!dir_has_error) { 153 if (!dir_has_error) {
155 ext4_error(sb, "directory #%lu " 154 EXT4_ERROR_INODE(inode, "directory "
156 "contains a hole at offset %Lu", 155 "contains a hole at offset %Lu",
157 inode->i_ino,
158 (unsigned long long) filp->f_pos); 156 (unsigned long long) filp->f_pos);
159 dir_has_error = 1; 157 dir_has_error = 1;
160 } 158 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf938cf7c5f0..19a4de57128a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,9 @@
29#include <linux/wait.h> 29#include <linux/wait.h>
30#include <linux/blockgroup_lock.h> 30#include <linux/blockgroup_lock.h>
31#include <linux/percpu_counter.h> 31#include <linux/percpu_counter.h>
32#ifdef __KERNEL__
33#include <linux/compat.h>
34#endif
32 35
33/* 36/*
34 * The fourth extended filesystem constants/structures 37 * The fourth extended filesystem constants/structures
@@ -54,10 +57,10 @@
54#endif 57#endif
55 58
56#define EXT4_ERROR_INODE(inode, fmt, a...) \ 59#define EXT4_ERROR_INODE(inode, fmt, a...) \
57 ext4_error_inode(__func__, (inode), (fmt), ## a); 60 ext4_error_inode(__func__, (inode), (fmt), ## a)
58 61
59#define EXT4_ERROR_FILE(file, fmt, a...) \ 62#define EXT4_ERROR_FILE(file, fmt, a...) \
60 ext4_error_file(__func__, (file), (fmt), ## a); 63 ext4_error_file(__func__, (file), (fmt), ## a)
61 64
62/* data type for block offset of block group */ 65/* data type for block offset of block group */
63typedef int ext4_grpblk_t; 66typedef int ext4_grpblk_t;
@@ -72,7 +75,7 @@ typedef __u32 ext4_lblk_t;
72typedef unsigned int ext4_group_t; 75typedef unsigned int ext4_group_t;
73 76
74/* 77/*
75 * Flags used in mballoc's allocation_context flags field. 78 * Flags used in mballoc's allocation_context flags field.
76 * 79 *
77 * Also used to show what's going on for debugging purposes when the 80 * Also used to show what's going on for debugging purposes when the
78 * flag field is exported via the traceport interface 81 * flag field is exported via the traceport interface
@@ -126,6 +129,29 @@ struct ext4_allocation_request {
126}; 129};
127 130
128/* 131/*
132 * Logical to physical block mapping, used by ext4_map_blocks()
133 *
134 * This structure is used to pass requests into ext4_map_blocks() as
135 * well as to store the information returned by ext4_map_blocks(). It
136 * takes less room on the stack than a struct buffer_head.
137 */
138#define EXT4_MAP_NEW (1 << BH_New)
139#define EXT4_MAP_MAPPED (1 << BH_Mapped)
140#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
141#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
142#define EXT4_MAP_UNINIT (1 << BH_Uninit)
143#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
144 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
145 EXT4_MAP_UNINIT)
146
147struct ext4_map_blocks {
148 ext4_fsblk_t m_pblk;
149 ext4_lblk_t m_lblk;
150 unsigned int m_len;
151 unsigned int m_flags;
152};
153
154/*
129 * For delayed allocation tracking 155 * For delayed allocation tracking
130 */ 156 */
131struct mpage_da_data { 157struct mpage_da_data {
@@ -321,6 +347,83 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
321 return flags & EXT4_OTHER_FLMASK; 347 return flags & EXT4_OTHER_FLMASK;
322} 348}
323 349
350/*
351 * Inode flags used for atomic set/get
352 */
353enum {
354 EXT4_INODE_SECRM = 0, /* Secure deletion */
355 EXT4_INODE_UNRM = 1, /* Undelete */
356 EXT4_INODE_COMPR = 2, /* Compress file */
357 EXT4_INODE_SYNC = 3, /* Synchronous updates */
358 EXT4_INODE_IMMUTABLE = 4, /* Immutable file */
359 EXT4_INODE_APPEND = 5, /* writes to file may only append */
360 EXT4_INODE_NODUMP = 6, /* do not dump file */
361 EXT4_INODE_NOATIME = 7, /* do not update atime */
362/* Reserved for compression usage... */
363 EXT4_INODE_DIRTY = 8,
364 EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
365 EXT4_INODE_NOCOMPR = 10, /* Don't compress */
366 EXT4_INODE_ECOMPR = 11, /* Compression error */
367/* End compression flags --- maybe not all used */
368 EXT4_INODE_INDEX = 12, /* hash-indexed directory */
369 EXT4_INODE_IMAGIC = 13, /* AFS directory */
370 EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */
371 EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */
372 EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */
373 EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/
374 EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */
375 EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
376 EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
377 EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
378 EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
379};
380
381#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
382#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
383 printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
384 EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
385
386/*
387 * Since it's pretty easy to mix up bit numbers and hex values, and we
388 * can't do a compile-time test for ENUM values, we use a run-time
389 * test to make sure that EXT4_XXX_FL is consistent with respect to
390 * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop
391 * out so it won't cost any extra space in the compiled kernel image.
392 * But it's important that these values are the same, since we are
393 * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
394 * must be consistent with the values of FS_XXX_FL defined in
395 * include/linux/fs.h and the on-disk values found in ext2, ext3, and
396 * ext4 filesystems, and of course the values defined in e2fsprogs.
397 *
398 * It's not paranoia if the Murphy's Law really *is* out to get you. :-)
399 */
400static inline void ext4_check_flag_values(void)
401{
402 CHECK_FLAG_VALUE(SECRM);
403 CHECK_FLAG_VALUE(UNRM);
404 CHECK_FLAG_VALUE(COMPR);
405 CHECK_FLAG_VALUE(SYNC);
406 CHECK_FLAG_VALUE(IMMUTABLE);
407 CHECK_FLAG_VALUE(APPEND);
408 CHECK_FLAG_VALUE(NODUMP);
409 CHECK_FLAG_VALUE(NOATIME);
410 CHECK_FLAG_VALUE(DIRTY);
411 CHECK_FLAG_VALUE(COMPRBLK);
412 CHECK_FLAG_VALUE(NOCOMPR);
413 CHECK_FLAG_VALUE(ECOMPR);
414 CHECK_FLAG_VALUE(INDEX);
415 CHECK_FLAG_VALUE(IMAGIC);
416 CHECK_FLAG_VALUE(JOURNAL_DATA);
417 CHECK_FLAG_VALUE(NOTAIL);
418 CHECK_FLAG_VALUE(DIRSYNC);
419 CHECK_FLAG_VALUE(TOPDIR);
420 CHECK_FLAG_VALUE(HUGE_FILE);
421 CHECK_FLAG_VALUE(EXTENTS);
422 CHECK_FLAG_VALUE(EA_INODE);
423 CHECK_FLAG_VALUE(EOFBLOCKS);
424 CHECK_FLAG_VALUE(RESERVED);
425}
426
324/* Used to pass group descriptor data when online resize is done */ 427/* Used to pass group descriptor data when online resize is done */
325struct ext4_new_group_input { 428struct ext4_new_group_input {
326 __u32 group; /* Group number for this data */ 429 __u32 group; /* Group number for this data */
@@ -332,6 +435,18 @@ struct ext4_new_group_input {
332 __u16 unused; 435 __u16 unused;
333}; 436};
334 437
438#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
439struct compat_ext4_new_group_input {
440 u32 group;
441 compat_u64 block_bitmap;
442 compat_u64 inode_bitmap;
443 compat_u64 inode_table;
444 u32 blocks_count;
445 u16 reserved_blocks;
446 u16 unused;
447};
448#endif
449
335/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ 450/* The struct ext4_new_group_input in kernel space, with free_blocks_count */
336struct ext4_new_group_data { 451struct ext4_new_group_data {
337 __u32 group; 452 __u32 group;
@@ -355,7 +470,7 @@ struct ext4_new_group_data {
355#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ 470#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
356 EXT4_GET_BLOCKS_CREATE) 471 EXT4_GET_BLOCKS_CREATE)
357 /* Caller is from the delayed allocation writeout path, 472 /* Caller is from the delayed allocation writeout path,
358 so set the magic i_delalloc_reserve_flag after taking the 473 so set the magic i_delalloc_reserve_flag after taking the
359 inode allocation semaphore for */ 474 inode allocation semaphore for */
360#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 475#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
361 /* caller is from the direct IO path, request to creation of an 476 /* caller is from the direct IO path, request to creation of an
@@ -398,6 +513,7 @@ struct ext4_new_group_data {
398#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) 513#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
399#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) 514#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
400 515
516#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
401/* 517/*
402 * ioctl commands in 32 bit emulation 518 * ioctl commands in 32 bit emulation
403 */ 519 */
@@ -408,11 +524,13 @@ struct ext4_new_group_data {
408#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) 524#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
409#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) 525#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
410#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) 526#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
527#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input)
411#ifdef CONFIG_JBD2_DEBUG 528#ifdef CONFIG_JBD2_DEBUG
412#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) 529#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
413#endif 530#endif
414#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION 531#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
415#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION 532#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
533#endif
416 534
417 535
418/* 536/*
@@ -616,9 +734,8 @@ struct ext4_ext_cache {
616 */ 734 */
617struct ext4_inode_info { 735struct ext4_inode_info {
618 __le32 i_data[15]; /* unconverted */ 736 __le32 i_data[15]; /* unconverted */
619 __u32 i_flags;
620 ext4_fsblk_t i_file_acl;
621 __u32 i_dtime; 737 __u32 i_dtime;
738 ext4_fsblk_t i_file_acl;
622 739
623 /* 740 /*
624 * i_block_group is the number of the block group which contains 741 * i_block_group is the number of the block group which contains
@@ -629,6 +746,7 @@ struct ext4_inode_info {
629 */ 746 */
630 ext4_group_t i_block_group; 747 ext4_group_t i_block_group;
631 unsigned long i_state_flags; /* Dynamic state flags */ 748 unsigned long i_state_flags; /* Dynamic state flags */
749 unsigned long i_flags;
632 750
633 ext4_lblk_t i_dir_start_lookup; 751 ext4_lblk_t i_dir_start_lookup;
634#ifdef CONFIG_EXT4_FS_XATTR 752#ifdef CONFIG_EXT4_FS_XATTR
@@ -1062,22 +1180,25 @@ enum {
1062 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ 1180 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
1063 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ 1181 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
1064 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ 1182 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
1183 EXT4_STATE_NEWENTRY, /* File just added to dir */
1065}; 1184};
1066 1185
1067static inline int ext4_test_inode_state(struct inode *inode, int bit) 1186#define EXT4_INODE_BIT_FNS(name, field) \
1068{ 1187static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
1069 return test_bit(bit, &EXT4_I(inode)->i_state_flags); 1188{ \
1070} 1189 return test_bit(bit, &EXT4_I(inode)->i_##field); \
1071 1190} \
1072static inline void ext4_set_inode_state(struct inode *inode, int bit) 1191static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
1073{ 1192{ \
1074 set_bit(bit, &EXT4_I(inode)->i_state_flags); 1193 set_bit(bit, &EXT4_I(inode)->i_##field); \
1194} \
1195static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
1196{ \
1197 clear_bit(bit, &EXT4_I(inode)->i_##field); \
1075} 1198}
1076 1199
1077static inline void ext4_clear_inode_state(struct inode *inode, int bit) 1200EXT4_INODE_BIT_FNS(flag, flags)
1078{ 1201EXT4_INODE_BIT_FNS(state, state_flags)
1079 clear_bit(bit, &EXT4_I(inode)->i_state_flags);
1080}
1081#else 1202#else
1082/* Assume that user mode programs are passing in an ext4fs superblock, not 1203/* Assume that user mode programs are passing in an ext4fs superblock, not
1083 * a kernel struct super_block. This will allow us to call the feature-test 1204 * a kernel struct super_block. This will allow us to call the feature-test
@@ -1264,7 +1385,7 @@ struct ext4_dir_entry_2 {
1264 1385
1265#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ 1386#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
1266 EXT4_FEATURE_COMPAT_DIR_INDEX) && \ 1387 EXT4_FEATURE_COMPAT_DIR_INDEX) && \
1267 (EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) 1388 ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
1268#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) 1389#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
1269#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) 1390#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
1270 1391
@@ -1398,7 +1519,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1398extern void ext4_htree_free_dir_info(struct dir_private_info *p); 1519extern void ext4_htree_free_dir_info(struct dir_private_info *p);
1399 1520
1400/* fsync.c */ 1521/* fsync.c */
1401extern int ext4_sync_file(struct file *, struct dentry *, int); 1522extern int ext4_sync_file(struct file *, int);
1402 1523
1403/* hash.c */ 1524/* hash.c */
1404extern int ext4fs_dirhash(const char *name, int len, struct 1525extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1678,6 +1799,7 @@ struct ext4_group_info {
1678 ext4_grpblk_t bb_first_free; /* first free block */ 1799 ext4_grpblk_t bb_first_free; /* first free block */
1679 ext4_grpblk_t bb_free; /* total free blocks */ 1800 ext4_grpblk_t bb_free; /* total free blocks */
1680 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ 1801 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
1802 ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
1681 struct list_head bb_prealloc_list; 1803 struct list_head bb_prealloc_list;
1682#ifdef DOUBLE_CHECK 1804#ifdef DOUBLE_CHECK
1683 void *bb_bitmap; 1805 void *bb_bitmap;
@@ -1772,9 +1894,8 @@ extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
1772extern int ext4_ext_writepage_trans_blocks(struct inode *, int); 1894extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
1773extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, 1895extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1774 int chunk); 1896 int chunk);
1775extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 1897extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
1776 ext4_lblk_t iblock, unsigned int max_blocks, 1898 struct ext4_map_blocks *map, int flags);
1777 struct buffer_head *bh_result, int flags);
1778extern void ext4_ext_truncate(struct inode *); 1899extern void ext4_ext_truncate(struct inode *);
1779extern void ext4_ext_init(struct super_block *); 1900extern void ext4_ext_init(struct super_block *);
1780extern void ext4_ext_release(struct super_block *); 1901extern void ext4_ext_release(struct super_block *);
@@ -1782,6 +1903,8 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1782 loff_t len); 1903 loff_t len);
1783extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 1904extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1784 ssize_t len); 1905 ssize_t len);
1906extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
1907 struct ext4_map_blocks *map, int flags);
1785extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1908extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1786 sector_t block, unsigned int max_blocks, 1909 sector_t block, unsigned int max_blocks,
1787 struct buffer_head *bh, int flags); 1910 struct buffer_head *bh, int flags);
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b79ad5126468..dade0c024797 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -273,7 +273,7 @@ static inline int ext4_should_journal_data(struct inode *inode)
273 return 1; 273 return 1;
274 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 274 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
275 return 1; 275 return 1;
276 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 276 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
277 return 1; 277 return 1;
278 return 0; 278 return 0;
279} 279}
@@ -284,7 +284,7 @@ static inline int ext4_should_order_data(struct inode *inode)
284 return 0; 284 return 0;
285 if (!S_ISREG(inode->i_mode)) 285 if (!S_ISREG(inode->i_mode))
286 return 0; 286 return 0;
287 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 287 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
288 return 0; 288 return 0;
289 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) 289 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
290 return 1; 290 return 1;
@@ -297,7 +297,7 @@ static inline int ext4_should_writeback_data(struct inode *inode)
297 return 0; 297 return 0;
298 if (EXT4_JOURNAL(inode) == NULL) 298 if (EXT4_JOURNAL(inode) == NULL)
299 return 1; 299 return 1;
300 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 300 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
301 return 0; 301 return 0;
302 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) 302 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
303 return 1; 303 return 1;
@@ -321,7 +321,7 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
321 return 0; 321 return 0;
322 if (!S_ISREG(inode->i_mode)) 322 if (!S_ISREG(inode->i_mode))
323 return 0; 323 return 0;
324 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 324 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
325 return 0; 325 return 0;
326 if (ext4_should_journal_data(inode)) 326 if (ext4_should_journal_data(inode))
327 return 0; 327 return 0;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 236b834b4ca8..377309c1af65 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
107 if (err <= 0) 107 if (err <= 0)
108 return err; 108 return err;
109 err = ext4_truncate_restart_trans(handle, inode, needed); 109 err = ext4_truncate_restart_trans(handle, inode, needed);
110 /* 110 if (err == 0)
111 * We have dropped i_data_sem so someone might have cached again 111 err = -EAGAIN;
112 * an extent we are going to truncate.
113 */
114 ext4_ext_invalidate_cache(inode);
115 112
116 return err; 113 return err;
117} 114}
@@ -185,10 +182,10 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
185 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { 182 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
186 /* 183 /*
187 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 184 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
188 * block groups per flexgroup, reserve the first block 185 * block groups per flexgroup, reserve the first block
189 * group for directories and special files. Regular 186 * group for directories and special files. Regular
190 * files will start at the second block group. This 187 * files will start at the second block group. This
191 * tends to speed up directory access and improves 188 * tends to speed up directory access and improves
192 * fsck times. 189 * fsck times.
193 */ 190 */
194 block_group &= ~(flex_size-1); 191 block_group &= ~(flex_size-1);
@@ -439,10 +436,10 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
439 return 0; 436 return 0;
440 437
441corrupted: 438corrupted:
442 __ext4_error(inode->i_sb, function, 439 ext4_error_inode(function, inode,
443 "bad header/extent in inode #%lu: %s - magic %x, " 440 "bad header/extent: %s - magic %x, "
444 "entries %u, max %u(%u), depth %u(%u)", 441 "entries %u, max %u(%u), depth %u(%u)",
445 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), 442 error_msg, le16_to_cpu(eh->eh_magic),
446 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), 443 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
447 max, le16_to_cpu(eh->eh_depth), depth); 444 max, le16_to_cpu(eh->eh_depth), depth);
448 445
@@ -1622,9 +1619,7 @@ int ext4_ext_try_to_merge(struct inode *inode,
1622 merge_done = 1; 1619 merge_done = 1;
1623 WARN_ON(eh->eh_entries == 0); 1620 WARN_ON(eh->eh_entries == 0);
1624 if (!eh->eh_entries) 1621 if (!eh->eh_entries)
1625 ext4_error(inode->i_sb, 1622 EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1626 "inode#%lu, eh->eh_entries = 0!",
1627 inode->i_ino);
1628 } 1623 }
1629 1624
1630 return merge_done; 1625 return merge_done;
@@ -2039,7 +2034,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2039 struct ext4_ext_cache *cex; 2034 struct ext4_ext_cache *cex;
2040 int ret = EXT4_EXT_CACHE_NO; 2035 int ret = EXT4_EXT_CACHE_NO;
2041 2036
2042 /* 2037 /*
2043 * We borrow i_block_reservation_lock to protect i_cached_extent 2038 * We borrow i_block_reservation_lock to protect i_cached_extent
2044 */ 2039 */
2045 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 2040 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2361,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2361 int depth = ext_depth(inode); 2356 int depth = ext_depth(inode);
2362 struct ext4_ext_path *path; 2357 struct ext4_ext_path *path;
2363 handle_t *handle; 2358 handle_t *handle;
2364 int i = 0, err = 0; 2359 int i, err;
2365 2360
2366 ext_debug("truncate since %u\n", start); 2361 ext_debug("truncate since %u\n", start);
2367 2362
@@ -2370,23 +2365,26 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2370 if (IS_ERR(handle)) 2365 if (IS_ERR(handle))
2371 return PTR_ERR(handle); 2366 return PTR_ERR(handle);
2372 2367
2368again:
2373 ext4_ext_invalidate_cache(inode); 2369 ext4_ext_invalidate_cache(inode);
2374 2370
2375 /* 2371 /*
2376 * We start scanning from right side, freeing all the blocks 2372 * We start scanning from right side, freeing all the blocks
2377 * after i_size and walking into the tree depth-wise. 2373 * after i_size and walking into the tree depth-wise.
2378 */ 2374 */
2375 depth = ext_depth(inode);
2379 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); 2376 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2380 if (path == NULL) { 2377 if (path == NULL) {
2381 ext4_journal_stop(handle); 2378 ext4_journal_stop(handle);
2382 return -ENOMEM; 2379 return -ENOMEM;
2383 } 2380 }
2381 path[0].p_depth = depth;
2384 path[0].p_hdr = ext_inode_hdr(inode); 2382 path[0].p_hdr = ext_inode_hdr(inode);
2385 if (ext4_ext_check(inode, path[0].p_hdr, depth)) { 2383 if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2386 err = -EIO; 2384 err = -EIO;
2387 goto out; 2385 goto out;
2388 } 2386 }
2389 path[0].p_depth = depth; 2387 i = err = 0;
2390 2388
2391 while (i >= 0 && err == 0) { 2389 while (i >= 0 && err == 0) {
2392 if (i == depth) { 2390 if (i == depth) {
@@ -2480,6 +2478,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2480out: 2478out:
2481 ext4_ext_drop_refs(path); 2479 ext4_ext_drop_refs(path);
2482 kfree(path); 2480 kfree(path);
2481 if (err == -EAGAIN)
2482 goto again;
2483 ext4_journal_stop(handle); 2483 ext4_journal_stop(handle);
2484 2484
2485 return err; 2485 return err;
@@ -2544,7 +2544,7 @@ static void bi_complete(struct bio *bio, int error)
2544/* FIXME!! we need to try to merge to left or right after zero-out */ 2544/* FIXME!! we need to try to merge to left or right after zero-out */
2545static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2545static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2546{ 2546{
2547 int ret = -EIO; 2547 int ret;
2548 struct bio *bio; 2548 struct bio *bio;
2549 int blkbits, blocksize; 2549 int blkbits, blocksize;
2550 sector_t ee_pblock; 2550 sector_t ee_pblock;
@@ -2568,6 +2568,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2568 len = ee_len; 2568 len = ee_len;
2569 2569
2570 bio = bio_alloc(GFP_NOIO, len); 2570 bio = bio_alloc(GFP_NOIO, len);
2571 if (!bio)
2572 return -ENOMEM;
2573
2571 bio->bi_sector = ee_pblock; 2574 bio->bi_sector = ee_pblock;
2572 bio->bi_bdev = inode->i_sb->s_bdev; 2575 bio->bi_bdev = inode->i_sb->s_bdev;
2573 2576
@@ -2595,22 +2598,20 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2595 submit_bio(WRITE, bio); 2598 submit_bio(WRITE, bio);
2596 wait_for_completion(&event); 2599 wait_for_completion(&event);
2597 2600
2598 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 2601 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2599 ret = 0; 2602 bio_put(bio);
2600 else { 2603 return -EIO;
2601 ret = -EIO;
2602 break;
2603 } 2604 }
2604 bio_put(bio); 2605 bio_put(bio);
2605 ee_len -= done; 2606 ee_len -= done;
2606 ee_pblock += done << (blkbits - 9); 2607 ee_pblock += done << (blkbits - 9);
2607 } 2608 }
2608 return ret; 2609 return 0;
2609} 2610}
2610 2611
2611#define EXT4_EXT_ZERO_LEN 7 2612#define EXT4_EXT_ZERO_LEN 7
2612/* 2613/*
2613 * This function is called by ext4_ext_get_blocks() if someone tries to write 2614 * This function is called by ext4_ext_map_blocks() if someone tries to write
2614 * to an uninitialized extent. It may result in splitting the uninitialized 2615 * to an uninitialized extent. It may result in splitting the uninitialized
2615 * extent into multiple extents (upto three - one initialized and two 2616 * extent into multiple extents (upto three - one initialized and two
2616 * uninitialized). 2617 * uninitialized).
@@ -2620,39 +2621,55 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2620 * c> Splits in three extents: Someone is writing in middle of the extent 2621 * c> Splits in three extents: Someone is writing in middle of the extent
2621 */ 2622 */
2622static int ext4_ext_convert_to_initialized(handle_t *handle, 2623static int ext4_ext_convert_to_initialized(handle_t *handle,
2623 struct inode *inode, 2624 struct inode *inode,
2624 struct ext4_ext_path *path, 2625 struct ext4_map_blocks *map,
2625 ext4_lblk_t iblock, 2626 struct ext4_ext_path *path)
2626 unsigned int max_blocks)
2627{ 2627{
2628 struct ext4_extent *ex, newex, orig_ex; 2628 struct ext4_extent *ex, newex, orig_ex;
2629 struct ext4_extent *ex1 = NULL; 2629 struct ext4_extent *ex1 = NULL;
2630 struct ext4_extent *ex2 = NULL; 2630 struct ext4_extent *ex2 = NULL;
2631 struct ext4_extent *ex3 = NULL; 2631 struct ext4_extent *ex3 = NULL;
2632 struct ext4_extent_header *eh; 2632 struct ext4_extent_header *eh;
2633 ext4_lblk_t ee_block; 2633 ext4_lblk_t ee_block, eof_block;
2634 unsigned int allocated, ee_len, depth; 2634 unsigned int allocated, ee_len, depth;
2635 ext4_fsblk_t newblock; 2635 ext4_fsblk_t newblock;
2636 int err = 0; 2636 int err = 0;
2637 int ret = 0; 2637 int ret = 0;
2638 int may_zeroout;
2639
2640 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
2641 "block %llu, max_blocks %u\n", inode->i_ino,
2642 (unsigned long long)map->m_lblk, map->m_len);
2643
2644 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2645 inode->i_sb->s_blocksize_bits;
2646 if (eof_block < map->m_lblk + map->m_len)
2647 eof_block = map->m_lblk + map->m_len;
2638 2648
2639 depth = ext_depth(inode); 2649 depth = ext_depth(inode);
2640 eh = path[depth].p_hdr; 2650 eh = path[depth].p_hdr;
2641 ex = path[depth].p_ext; 2651 ex = path[depth].p_ext;
2642 ee_block = le32_to_cpu(ex->ee_block); 2652 ee_block = le32_to_cpu(ex->ee_block);
2643 ee_len = ext4_ext_get_actual_len(ex); 2653 ee_len = ext4_ext_get_actual_len(ex);
2644 allocated = ee_len - (iblock - ee_block); 2654 allocated = ee_len - (map->m_lblk - ee_block);
2645 newblock = iblock - ee_block + ext_pblock(ex); 2655 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2656
2646 ex2 = ex; 2657 ex2 = ex;
2647 orig_ex.ee_block = ex->ee_block; 2658 orig_ex.ee_block = ex->ee_block;
2648 orig_ex.ee_len = cpu_to_le16(ee_len); 2659 orig_ex.ee_len = cpu_to_le16(ee_len);
2649 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2660 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2650 2661
2662 /*
2663 * It is safe to convert extent to initialized via explicit
2664 * zeroout only if extent is fully inside i_size or new_size.
2665 */
2666 may_zeroout = ee_block + ee_len <= eof_block;
2667
2651 err = ext4_ext_get_access(handle, inode, path + depth); 2668 err = ext4_ext_get_access(handle, inode, path + depth);
2652 if (err) 2669 if (err)
2653 goto out; 2670 goto out;
2654 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ 2671 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
2655 if (ee_len <= 2*EXT4_EXT_ZERO_LEN) { 2672 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
2656 err = ext4_ext_zeroout(inode, &orig_ex); 2673 err = ext4_ext_zeroout(inode, &orig_ex);
2657 if (err) 2674 if (err)
2658 goto fix_extent_len; 2675 goto fix_extent_len;
@@ -2665,10 +2682,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2665 return allocated; 2682 return allocated;
2666 } 2683 }
2667 2684
2668 /* ex1: ee_block to iblock - 1 : uninitialized */ 2685 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2669 if (iblock > ee_block) { 2686 if (map->m_lblk > ee_block) {
2670 ex1 = ex; 2687 ex1 = ex;
2671 ex1->ee_len = cpu_to_le16(iblock - ee_block); 2688 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2672 ext4_ext_mark_uninitialized(ex1); 2689 ext4_ext_mark_uninitialized(ex1);
2673 ex2 = &newex; 2690 ex2 = &newex;
2674 } 2691 }
@@ -2677,15 +2694,15 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2677 * we insert ex3, if ex1 is NULL. This is to avoid temporary 2694 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2678 * overlap of blocks. 2695 * overlap of blocks.
2679 */ 2696 */
2680 if (!ex1 && allocated > max_blocks) 2697 if (!ex1 && allocated > map->m_len)
2681 ex2->ee_len = cpu_to_le16(max_blocks); 2698 ex2->ee_len = cpu_to_le16(map->m_len);
2682 /* ex3: to ee_block + ee_len : uninitialised */ 2699 /* ex3: to ee_block + ee_len : uninitialised */
2683 if (allocated > max_blocks) { 2700 if (allocated > map->m_len) {
2684 unsigned int newdepth; 2701 unsigned int newdepth;
2685 /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ 2702 /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
2686 if (allocated <= EXT4_EXT_ZERO_LEN) { 2703 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
2687 /* 2704 /*
2688 * iblock == ee_block is handled by the zeroout 2705 * map->m_lblk == ee_block is handled by the zeroout
2689 * at the beginning. 2706 * at the beginning.
2690 * Mark first half uninitialized. 2707 * Mark first half uninitialized.
2691 * Mark second half initialized and zero out the 2708 * Mark second half initialized and zero out the
@@ -2698,7 +2715,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2698 ext4_ext_dirty(handle, inode, path + depth); 2715 ext4_ext_dirty(handle, inode, path + depth);
2699 2716
2700 ex3 = &newex; 2717 ex3 = &newex;
2701 ex3->ee_block = cpu_to_le32(iblock); 2718 ex3->ee_block = cpu_to_le32(map->m_lblk);
2702 ext4_ext_store_pblock(ex3, newblock); 2719 ext4_ext_store_pblock(ex3, newblock);
2703 ex3->ee_len = cpu_to_le16(allocated); 2720 ex3->ee_len = cpu_to_le16(allocated);
2704 err = ext4_ext_insert_extent(handle, inode, path, 2721 err = ext4_ext_insert_extent(handle, inode, path,
@@ -2711,7 +2728,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2711 ex->ee_len = orig_ex.ee_len; 2728 ex->ee_len = orig_ex.ee_len;
2712 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2729 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2713 ext4_ext_dirty(handle, inode, path + depth); 2730 ext4_ext_dirty(handle, inode, path + depth);
2714 /* blocks available from iblock */ 2731 /* blocks available from map->m_lblk */
2715 return allocated; 2732 return allocated;
2716 2733
2717 } else if (err) 2734 } else if (err)
@@ -2733,8 +2750,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2733 */ 2750 */
2734 depth = ext_depth(inode); 2751 depth = ext_depth(inode);
2735 ext4_ext_drop_refs(path); 2752 ext4_ext_drop_refs(path);
2736 path = ext4_ext_find_extent(inode, 2753 path = ext4_ext_find_extent(inode, map->m_lblk,
2737 iblock, path); 2754 path);
2738 if (IS_ERR(path)) { 2755 if (IS_ERR(path)) {
2739 err = PTR_ERR(path); 2756 err = PTR_ERR(path);
2740 return err; 2757 return err;
@@ -2754,12 +2771,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2754 return allocated; 2771 return allocated;
2755 } 2772 }
2756 ex3 = &newex; 2773 ex3 = &newex;
2757 ex3->ee_block = cpu_to_le32(iblock + max_blocks); 2774 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2758 ext4_ext_store_pblock(ex3, newblock + max_blocks); 2775 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2759 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 2776 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2760 ext4_ext_mark_uninitialized(ex3); 2777 ext4_ext_mark_uninitialized(ex3);
2761 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); 2778 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2762 if (err == -ENOSPC) { 2779 if (err == -ENOSPC && may_zeroout) {
2763 err = ext4_ext_zeroout(inode, &orig_ex); 2780 err = ext4_ext_zeroout(inode, &orig_ex);
2764 if (err) 2781 if (err)
2765 goto fix_extent_len; 2782 goto fix_extent_len;
@@ -2769,7 +2786,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2769 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2786 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2770 ext4_ext_dirty(handle, inode, path + depth); 2787 ext4_ext_dirty(handle, inode, path + depth);
2771 /* zeroed the full extent */ 2788 /* zeroed the full extent */
2772 /* blocks available from iblock */ 2789 /* blocks available from map->m_lblk */
2773 return allocated; 2790 return allocated;
2774 2791
2775 } else if (err) 2792 } else if (err)
@@ -2783,11 +2800,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2783 * update the extent length after successful insert of the 2800 * update the extent length after successful insert of the
2784 * split extent 2801 * split extent
2785 */ 2802 */
2786 orig_ex.ee_len = cpu_to_le16(ee_len - 2803 ee_len -= ext4_ext_get_actual_len(ex3);
2787 ext4_ext_get_actual_len(ex3)); 2804 orig_ex.ee_len = cpu_to_le16(ee_len);
2805 may_zeroout = ee_block + ee_len <= eof_block;
2806
2788 depth = newdepth; 2807 depth = newdepth;
2789 ext4_ext_drop_refs(path); 2808 ext4_ext_drop_refs(path);
2790 path = ext4_ext_find_extent(inode, iblock, path); 2809 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2791 if (IS_ERR(path)) { 2810 if (IS_ERR(path)) {
2792 err = PTR_ERR(path); 2811 err = PTR_ERR(path);
2793 goto out; 2812 goto out;
@@ -2801,14 +2820,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2801 if (err) 2820 if (err)
2802 goto out; 2821 goto out;
2803 2822
2804 allocated = max_blocks; 2823 allocated = map->m_len;
2805 2824
2806 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying 2825 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
2807 * to insert a extent in the middle zerout directly 2826 * to insert a extent in the middle zerout directly
2808 * otherwise give the extent a chance to merge to left 2827 * otherwise give the extent a chance to merge to left
2809 */ 2828 */
2810 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && 2829 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2811 iblock != ee_block) { 2830 map->m_lblk != ee_block && may_zeroout) {
2812 err = ext4_ext_zeroout(inode, &orig_ex); 2831 err = ext4_ext_zeroout(inode, &orig_ex);
2813 if (err) 2832 if (err)
2814 goto fix_extent_len; 2833 goto fix_extent_len;
@@ -2818,7 +2837,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2818 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2837 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2819 ext4_ext_dirty(handle, inode, path + depth); 2838 ext4_ext_dirty(handle, inode, path + depth);
2820 /* zero out the first half */ 2839 /* zero out the first half */
2821 /* blocks available from iblock */ 2840 /* blocks available from map->m_lblk */
2822 return allocated; 2841 return allocated;
2823 } 2842 }
2824 } 2843 }
@@ -2829,12 +2848,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2829 */ 2848 */
2830 if (ex1 && ex1 != ex) { 2849 if (ex1 && ex1 != ex) {
2831 ex1 = ex; 2850 ex1 = ex;
2832 ex1->ee_len = cpu_to_le16(iblock - ee_block); 2851 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2833 ext4_ext_mark_uninitialized(ex1); 2852 ext4_ext_mark_uninitialized(ex1);
2834 ex2 = &newex; 2853 ex2 = &newex;
2835 } 2854 }
2836 /* ex2: iblock to iblock + maxblocks-1 : initialised */ 2855 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
2837 ex2->ee_block = cpu_to_le32(iblock); 2856 ex2->ee_block = cpu_to_le32(map->m_lblk);
2838 ext4_ext_store_pblock(ex2, newblock); 2857 ext4_ext_store_pblock(ex2, newblock);
2839 ex2->ee_len = cpu_to_le16(allocated); 2858 ex2->ee_len = cpu_to_le16(allocated);
2840 if (ex2 != ex) 2859 if (ex2 != ex)
@@ -2877,7 +2896,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2877 goto out; 2896 goto out;
2878insert: 2897insert:
2879 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); 2898 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
2880 if (err == -ENOSPC) { 2899 if (err == -ENOSPC && may_zeroout) {
2881 err = ext4_ext_zeroout(inode, &orig_ex); 2900 err = ext4_ext_zeroout(inode, &orig_ex);
2882 if (err) 2901 if (err)
2883 goto fix_extent_len; 2902 goto fix_extent_len;
@@ -2904,7 +2923,7 @@ fix_extent_len:
2904} 2923}
2905 2924
2906/* 2925/*
2907 * This function is called by ext4_ext_get_blocks() from 2926 * This function is called by ext4_ext_map_blocks() from
2908 * ext4_get_blocks_dio_write() when DIO to write 2927 * ext4_get_blocks_dio_write() when DIO to write
2909 * to an uninitialized extent. 2928 * to an uninitialized extent.
2910 * 2929 *
@@ -2927,9 +2946,8 @@ fix_extent_len:
2927 */ 2946 */
2928static int ext4_split_unwritten_extents(handle_t *handle, 2947static int ext4_split_unwritten_extents(handle_t *handle,
2929 struct inode *inode, 2948 struct inode *inode,
2949 struct ext4_map_blocks *map,
2930 struct ext4_ext_path *path, 2950 struct ext4_ext_path *path,
2931 ext4_lblk_t iblock,
2932 unsigned int max_blocks,
2933 int flags) 2951 int flags)
2934{ 2952{
2935 struct ext4_extent *ex, newex, orig_ex; 2953 struct ext4_extent *ex, newex, orig_ex;
@@ -2937,41 +2955,55 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2937 struct ext4_extent *ex2 = NULL; 2955 struct ext4_extent *ex2 = NULL;
2938 struct ext4_extent *ex3 = NULL; 2956 struct ext4_extent *ex3 = NULL;
2939 struct ext4_extent_header *eh; 2957 struct ext4_extent_header *eh;
2940 ext4_lblk_t ee_block; 2958 ext4_lblk_t ee_block, eof_block;
2941 unsigned int allocated, ee_len, depth; 2959 unsigned int allocated, ee_len, depth;
2942 ext4_fsblk_t newblock; 2960 ext4_fsblk_t newblock;
2943 int err = 0; 2961 int err = 0;
2962 int may_zeroout;
2963
2964 ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
2965 "block %llu, max_blocks %u\n", inode->i_ino,
2966 (unsigned long long)map->m_lblk, map->m_len);
2967
2968 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2969 inode->i_sb->s_blocksize_bits;
2970 if (eof_block < map->m_lblk + map->m_len)
2971 eof_block = map->m_lblk + map->m_len;
2944 2972
2945 ext_debug("ext4_split_unwritten_extents: inode %lu,"
2946 "iblock %llu, max_blocks %u\n", inode->i_ino,
2947 (unsigned long long)iblock, max_blocks);
2948 depth = ext_depth(inode); 2973 depth = ext_depth(inode);
2949 eh = path[depth].p_hdr; 2974 eh = path[depth].p_hdr;
2950 ex = path[depth].p_ext; 2975 ex = path[depth].p_ext;
2951 ee_block = le32_to_cpu(ex->ee_block); 2976 ee_block = le32_to_cpu(ex->ee_block);
2952 ee_len = ext4_ext_get_actual_len(ex); 2977 ee_len = ext4_ext_get_actual_len(ex);
2953 allocated = ee_len - (iblock - ee_block); 2978 allocated = ee_len - (map->m_lblk - ee_block);
2954 newblock = iblock - ee_block + ext_pblock(ex); 2979 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2980
2955 ex2 = ex; 2981 ex2 = ex;
2956 orig_ex.ee_block = ex->ee_block; 2982 orig_ex.ee_block = ex->ee_block;
2957 orig_ex.ee_len = cpu_to_le16(ee_len); 2983 orig_ex.ee_len = cpu_to_le16(ee_len);
2958 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2984 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2959 2985
2960 /* 2986 /*
2987 * It is safe to convert extent to initialized via explicit
2988 * zeroout only if extent is fully inside i_size or new_size.
2989 */
2990 may_zeroout = ee_block + ee_len <= eof_block;
2991
2992 /*
2961 * If the uninitialized extent begins at the same logical 2993 * If the uninitialized extent begins at the same logical
2962 * block where the write begins, and the write completely 2994 * block where the write begins, and the write completely
2963 * covers the extent, then we don't need to split it. 2995 * covers the extent, then we don't need to split it.
2964 */ 2996 */
2965 if ((iblock == ee_block) && (allocated <= max_blocks)) 2997 if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
2966 return allocated; 2998 return allocated;
2967 2999
2968 err = ext4_ext_get_access(handle, inode, path + depth); 3000 err = ext4_ext_get_access(handle, inode, path + depth);
2969 if (err) 3001 if (err)
2970 goto out; 3002 goto out;
2971 /* ex1: ee_block to iblock - 1 : uninitialized */ 3003 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2972 if (iblock > ee_block) { 3004 if (map->m_lblk > ee_block) {
2973 ex1 = ex; 3005 ex1 = ex;
2974 ex1->ee_len = cpu_to_le16(iblock - ee_block); 3006 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2975 ext4_ext_mark_uninitialized(ex1); 3007 ext4_ext_mark_uninitialized(ex1);
2976 ex2 = &newex; 3008 ex2 = &newex;
2977 } 3009 }
@@ -2980,18 +3012,18 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2980 * we insert ex3, if ex1 is NULL. This is to avoid temporary 3012 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2981 * overlap of blocks. 3013 * overlap of blocks.
2982 */ 3014 */
2983 if (!ex1 && allocated > max_blocks) 3015 if (!ex1 && allocated > map->m_len)
2984 ex2->ee_len = cpu_to_le16(max_blocks); 3016 ex2->ee_len = cpu_to_le16(map->m_len);
2985 /* ex3: to ee_block + ee_len : uninitialised */ 3017 /* ex3: to ee_block + ee_len : uninitialised */
2986 if (allocated > max_blocks) { 3018 if (allocated > map->m_len) {
2987 unsigned int newdepth; 3019 unsigned int newdepth;
2988 ex3 = &newex; 3020 ex3 = &newex;
2989 ex3->ee_block = cpu_to_le32(iblock + max_blocks); 3021 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2990 ext4_ext_store_pblock(ex3, newblock + max_blocks); 3022 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2991 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 3023 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2992 ext4_ext_mark_uninitialized(ex3); 3024 ext4_ext_mark_uninitialized(ex3);
2993 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); 3025 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2994 if (err == -ENOSPC) { 3026 if (err == -ENOSPC && may_zeroout) {
2995 err = ext4_ext_zeroout(inode, &orig_ex); 3027 err = ext4_ext_zeroout(inode, &orig_ex);
2996 if (err) 3028 if (err)
2997 goto fix_extent_len; 3029 goto fix_extent_len;
@@ -3001,7 +3033,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3001 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3033 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3002 ext4_ext_dirty(handle, inode, path + depth); 3034 ext4_ext_dirty(handle, inode, path + depth);
3003 /* zeroed the full extent */ 3035 /* zeroed the full extent */
3004 /* blocks available from iblock */ 3036 /* blocks available from map->m_lblk */
3005 return allocated; 3037 return allocated;
3006 3038
3007 } else if (err) 3039 } else if (err)
@@ -3015,11 +3047,13 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3015 * update the extent length after successful insert of the 3047 * update the extent length after successful insert of the
3016 * split extent 3048 * split extent
3017 */ 3049 */
3018 orig_ex.ee_len = cpu_to_le16(ee_len - 3050 ee_len -= ext4_ext_get_actual_len(ex3);
3019 ext4_ext_get_actual_len(ex3)); 3051 orig_ex.ee_len = cpu_to_le16(ee_len);
3052 may_zeroout = ee_block + ee_len <= eof_block;
3053
3020 depth = newdepth; 3054 depth = newdepth;
3021 ext4_ext_drop_refs(path); 3055 ext4_ext_drop_refs(path);
3022 path = ext4_ext_find_extent(inode, iblock, path); 3056 path = ext4_ext_find_extent(inode, map->m_lblk, path);
3023 if (IS_ERR(path)) { 3057 if (IS_ERR(path)) {
3024 err = PTR_ERR(path); 3058 err = PTR_ERR(path);
3025 goto out; 3059 goto out;
@@ -3033,7 +3067,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3033 if (err) 3067 if (err)
3034 goto out; 3068 goto out;
3035 3069
3036 allocated = max_blocks; 3070 allocated = map->m_len;
3037 } 3071 }
3038 /* 3072 /*
3039 * If there was a change of depth as part of the 3073 * If there was a change of depth as part of the
@@ -3042,15 +3076,15 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3042 */ 3076 */
3043 if (ex1 && ex1 != ex) { 3077 if (ex1 && ex1 != ex) {
3044 ex1 = ex; 3078 ex1 = ex;
3045 ex1->ee_len = cpu_to_le16(iblock - ee_block); 3079 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
3046 ext4_ext_mark_uninitialized(ex1); 3080 ext4_ext_mark_uninitialized(ex1);
3047 ex2 = &newex; 3081 ex2 = &newex;
3048 } 3082 }
3049 /* 3083 /*
3050 * ex2: iblock to iblock + maxblocks-1 : to be direct IO written, 3084 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
3051 * uninitialised still. 3085 * using direct I/O, uninitialised still.
3052 */ 3086 */
3053 ex2->ee_block = cpu_to_le32(iblock); 3087 ex2->ee_block = cpu_to_le32(map->m_lblk);
3054 ext4_ext_store_pblock(ex2, newblock); 3088 ext4_ext_store_pblock(ex2, newblock);
3055 ex2->ee_len = cpu_to_le16(allocated); 3089 ex2->ee_len = cpu_to_le16(allocated);
3056 ext4_ext_mark_uninitialized(ex2); 3090 ext4_ext_mark_uninitialized(ex2);
@@ -3062,7 +3096,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3062 goto out; 3096 goto out;
3063insert: 3097insert:
3064 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3098 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3065 if (err == -ENOSPC) { 3099 if (err == -ENOSPC && may_zeroout) {
3066 err = ext4_ext_zeroout(inode, &orig_ex); 3100 err = ext4_ext_zeroout(inode, &orig_ex);
3067 if (err) 3101 if (err)
3068 goto fix_extent_len; 3102 goto fix_extent_len;
@@ -3152,10 +3186,9 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3152 3186
3153static int 3187static int
3154ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3188ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3155 ext4_lblk_t iblock, unsigned int max_blocks, 3189 struct ext4_map_blocks *map,
3156 struct ext4_ext_path *path, int flags, 3190 struct ext4_ext_path *path, int flags,
3157 unsigned int allocated, struct buffer_head *bh_result, 3191 unsigned int allocated, ext4_fsblk_t newblock)
3158 ext4_fsblk_t newblock)
3159{ 3192{
3160 int ret = 0; 3193 int ret = 0;
3161 int err = 0; 3194 int err = 0;
@@ -3163,15 +3196,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3163 3196
3164 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" 3197 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
3165 "block %llu, max_blocks %u, flags %d, allocated %u", 3198 "block %llu, max_blocks %u, flags %d, allocated %u",
3166 inode->i_ino, (unsigned long long)iblock, max_blocks, 3199 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3167 flags, allocated); 3200 flags, allocated);
3168 ext4_ext_show_leaf(inode, path); 3201 ext4_ext_show_leaf(inode, path);
3169 3202
3170 /* get_block() before submit the IO, split the extent */ 3203 /* get_block() before submit the IO, split the extent */
3171 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3204 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3172 ret = ext4_split_unwritten_extents(handle, 3205 ret = ext4_split_unwritten_extents(handle, inode, map,
3173 inode, path, iblock, 3206 path, flags);
3174 max_blocks, flags);
3175 /* 3207 /*
3176 * Flag the inode(non aio case) or end_io struct (aio case) 3208 * Flag the inode(non aio case) or end_io struct (aio case)
3177 * that this IO needs conversion to written when IO is 3209 * that this IO needs conversion to written when IO is
@@ -3182,7 +3214,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3182 else 3214 else
3183 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3215 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3184 if (ext4_should_dioread_nolock(inode)) 3216 if (ext4_should_dioread_nolock(inode))
3185 set_buffer_uninit(bh_result); 3217 map->m_flags |= EXT4_MAP_UNINIT;
3186 goto out; 3218 goto out;
3187 } 3219 }
3188 /* IO end_io complete, convert the filled extent to written */ 3220 /* IO end_io complete, convert the filled extent to written */
@@ -3210,14 +3242,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3210 * the buffer head will be unmapped so that 3242 * the buffer head will be unmapped so that
3211 * a read from the block returns 0s. 3243 * a read from the block returns 0s.
3212 */ 3244 */
3213 set_buffer_unwritten(bh_result); 3245 map->m_flags |= EXT4_MAP_UNWRITTEN;
3214 goto out1; 3246 goto out1;
3215 } 3247 }
3216 3248
3217 /* buffered write, writepage time, convert*/ 3249 /* buffered write, writepage time, convert*/
3218 ret = ext4_ext_convert_to_initialized(handle, inode, 3250 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3219 path, iblock,
3220 max_blocks);
3221 if (ret >= 0) 3251 if (ret >= 0)
3222 ext4_update_inode_fsync_trans(handle, inode, 1); 3252 ext4_update_inode_fsync_trans(handle, inode, 1);
3223out: 3253out:
@@ -3226,7 +3256,7 @@ out:
3226 goto out2; 3256 goto out2;
3227 } else 3257 } else
3228 allocated = ret; 3258 allocated = ret;
3229 set_buffer_new(bh_result); 3259 map->m_flags |= EXT4_MAP_NEW;
3230 /* 3260 /*
3231 * if we allocated more blocks than requested 3261 * if we allocated more blocks than requested
3232 * we need to make sure we unmap the extra block 3262 * we need to make sure we unmap the extra block
@@ -3234,11 +3264,11 @@ out:
3234 * unmapped later when we find the buffer_head marked 3264 * unmapped later when we find the buffer_head marked
3235 * new. 3265 * new.
3236 */ 3266 */
3237 if (allocated > max_blocks) { 3267 if (allocated > map->m_len) {
3238 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, 3268 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3239 newblock + max_blocks, 3269 newblock + map->m_len,
3240 allocated - max_blocks); 3270 allocated - map->m_len);
3241 allocated = max_blocks; 3271 allocated = map->m_len;
3242 } 3272 }
3243 3273
3244 /* 3274 /*
@@ -3252,13 +3282,13 @@ out:
3252 ext4_da_update_reserve_space(inode, allocated, 0); 3282 ext4_da_update_reserve_space(inode, allocated, 0);
3253 3283
3254map_out: 3284map_out:
3255 set_buffer_mapped(bh_result); 3285 map->m_flags |= EXT4_MAP_MAPPED;
3256out1: 3286out1:
3257 if (allocated > max_blocks) 3287 if (allocated > map->m_len)
3258 allocated = max_blocks; 3288 allocated = map->m_len;
3259 ext4_ext_show_leaf(inode, path); 3289 ext4_ext_show_leaf(inode, path);
3260 bh_result->b_bdev = inode->i_sb->s_bdev; 3290 map->m_pblk = newblock;
3261 bh_result->b_blocknr = newblock; 3291 map->m_len = allocated;
3262out2: 3292out2:
3263 if (path) { 3293 if (path) {
3264 ext4_ext_drop_refs(path); 3294 ext4_ext_drop_refs(path);
@@ -3284,26 +3314,23 @@ out2:
3284 * 3314 *
3285 * return < 0, error case. 3315 * return < 0, error case.
3286 */ 3316 */
3287int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 3317int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3288 ext4_lblk_t iblock, 3318 struct ext4_map_blocks *map, int flags)
3289 unsigned int max_blocks, struct buffer_head *bh_result,
3290 int flags)
3291{ 3319{
3292 struct ext4_ext_path *path = NULL; 3320 struct ext4_ext_path *path = NULL;
3293 struct ext4_extent_header *eh; 3321 struct ext4_extent_header *eh;
3294 struct ext4_extent newex, *ex, *last_ex; 3322 struct ext4_extent newex, *ex, *last_ex;
3295 ext4_fsblk_t newblock; 3323 ext4_fsblk_t newblock;
3296 int err = 0, depth, ret, cache_type; 3324 int i, err = 0, depth, ret, cache_type;
3297 unsigned int allocated = 0; 3325 unsigned int allocated = 0;
3298 struct ext4_allocation_request ar; 3326 struct ext4_allocation_request ar;
3299 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3327 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3300 3328
3301 __clear_bit(BH_New, &bh_result->b_state);
3302 ext_debug("blocks %u/%u requested for inode %lu\n", 3329 ext_debug("blocks %u/%u requested for inode %lu\n",
3303 iblock, max_blocks, inode->i_ino); 3330 map->m_lblk, map->m_len, inode->i_ino);
3304 3331
3305 /* check in cache */ 3332 /* check in cache */
3306 cache_type = ext4_ext_in_cache(inode, iblock, &newex); 3333 cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
3307 if (cache_type) { 3334 if (cache_type) {
3308 if (cache_type == EXT4_EXT_CACHE_GAP) { 3335 if (cache_type == EXT4_EXT_CACHE_GAP) {
3309 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3336 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -3316,12 +3343,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3316 /* we should allocate requested block */ 3343 /* we should allocate requested block */
3317 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { 3344 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
3318 /* block is already allocated */ 3345 /* block is already allocated */
3319 newblock = iblock 3346 newblock = map->m_lblk
3320 - le32_to_cpu(newex.ee_block) 3347 - le32_to_cpu(newex.ee_block)
3321 + ext_pblock(&newex); 3348 + ext_pblock(&newex);
3322 /* number of remaining blocks in the extent */ 3349 /* number of remaining blocks in the extent */
3323 allocated = ext4_ext_get_actual_len(&newex) - 3350 allocated = ext4_ext_get_actual_len(&newex) -
3324 (iblock - le32_to_cpu(newex.ee_block)); 3351 (map->m_lblk - le32_to_cpu(newex.ee_block));
3325 goto out; 3352 goto out;
3326 } else { 3353 } else {
3327 BUG(); 3354 BUG();
@@ -3329,7 +3356,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3329 } 3356 }
3330 3357
3331 /* find extent for this block */ 3358 /* find extent for this block */
3332 path = ext4_ext_find_extent(inode, iblock, NULL); 3359 path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
3333 if (IS_ERR(path)) { 3360 if (IS_ERR(path)) {
3334 err = PTR_ERR(path); 3361 err = PTR_ERR(path);
3335 path = NULL; 3362 path = NULL;
@@ -3345,8 +3372,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3345 */ 3372 */
3346 if (unlikely(path[depth].p_ext == NULL && depth != 0)) { 3373 if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3347 EXT4_ERROR_INODE(inode, "bad extent address " 3374 EXT4_ERROR_INODE(inode, "bad extent address "
3348 "iblock: %d, depth: %d pblock %lld", 3375 "lblock: %lu, depth: %d pblock %lld",
3349 iblock, depth, path[depth].p_block); 3376 (unsigned long) map->m_lblk, depth,
3377 path[depth].p_block);
3350 err = -EIO; 3378 err = -EIO;
3351 goto out2; 3379 goto out2;
3352 } 3380 }
@@ -3364,12 +3392,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3364 */ 3392 */
3365 ee_len = ext4_ext_get_actual_len(ex); 3393 ee_len = ext4_ext_get_actual_len(ex);
3366 /* if found extent covers block, simply return it */ 3394 /* if found extent covers block, simply return it */
3367 if (in_range(iblock, ee_block, ee_len)) { 3395 if (in_range(map->m_lblk, ee_block, ee_len)) {
3368 newblock = iblock - ee_block + ee_start; 3396 newblock = map->m_lblk - ee_block + ee_start;
3369 /* number of remaining blocks in the extent */ 3397 /* number of remaining blocks in the extent */
3370 allocated = ee_len - (iblock - ee_block); 3398 allocated = ee_len - (map->m_lblk - ee_block);
3371 ext_debug("%u fit into %u:%d -> %llu\n", iblock, 3399 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3372 ee_block, ee_len, newblock); 3400 ee_block, ee_len, newblock);
3373 3401
3374 /* Do not put uninitialized extent in the cache */ 3402 /* Do not put uninitialized extent in the cache */
3375 if (!ext4_ext_is_uninitialized(ex)) { 3403 if (!ext4_ext_is_uninitialized(ex)) {
@@ -3379,8 +3407,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3379 goto out; 3407 goto out;
3380 } 3408 }
3381 ret = ext4_ext_handle_uninitialized_extents(handle, 3409 ret = ext4_ext_handle_uninitialized_extents(handle,
3382 inode, iblock, max_blocks, path, 3410 inode, map, path, flags, allocated,
3383 flags, allocated, bh_result, newblock); 3411 newblock);
3384 return ret; 3412 return ret;
3385 } 3413 }
3386 } 3414 }
@@ -3394,7 +3422,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3394 * put just found gap into cache to speed up 3422 * put just found gap into cache to speed up
3395 * subsequent requests 3423 * subsequent requests
3396 */ 3424 */
3397 ext4_ext_put_gap_in_cache(inode, path, iblock); 3425 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3398 goto out2; 3426 goto out2;
3399 } 3427 }
3400 /* 3428 /*
@@ -3402,11 +3430,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3402 */ 3430 */
3403 3431
3404 /* find neighbour allocated blocks */ 3432 /* find neighbour allocated blocks */
3405 ar.lleft = iblock; 3433 ar.lleft = map->m_lblk;
3406 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); 3434 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3407 if (err) 3435 if (err)
3408 goto out2; 3436 goto out2;
3409 ar.lright = iblock; 3437 ar.lright = map->m_lblk;
3410 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); 3438 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
3411 if (err) 3439 if (err)
3412 goto out2; 3440 goto out2;
@@ -3417,26 +3445,26 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3417 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is 3445 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3418 * EXT_UNINIT_MAX_LEN. 3446 * EXT_UNINIT_MAX_LEN.
3419 */ 3447 */
3420 if (max_blocks > EXT_INIT_MAX_LEN && 3448 if (map->m_len > EXT_INIT_MAX_LEN &&
3421 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3449 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3422 max_blocks = EXT_INIT_MAX_LEN; 3450 map->m_len = EXT_INIT_MAX_LEN;
3423 else if (max_blocks > EXT_UNINIT_MAX_LEN && 3451 else if (map->m_len > EXT_UNINIT_MAX_LEN &&
3424 (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3452 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3425 max_blocks = EXT_UNINIT_MAX_LEN; 3453 map->m_len = EXT_UNINIT_MAX_LEN;
3426 3454
3427 /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ 3455 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3428 newex.ee_block = cpu_to_le32(iblock); 3456 newex.ee_block = cpu_to_le32(map->m_lblk);
3429 newex.ee_len = cpu_to_le16(max_blocks); 3457 newex.ee_len = cpu_to_le16(map->m_len);
3430 err = ext4_ext_check_overlap(inode, &newex, path); 3458 err = ext4_ext_check_overlap(inode, &newex, path);
3431 if (err) 3459 if (err)
3432 allocated = ext4_ext_get_actual_len(&newex); 3460 allocated = ext4_ext_get_actual_len(&newex);
3433 else 3461 else
3434 allocated = max_blocks; 3462 allocated = map->m_len;
3435 3463
3436 /* allocate new block */ 3464 /* allocate new block */
3437 ar.inode = inode; 3465 ar.inode = inode;
3438 ar.goal = ext4_ext_find_goal(inode, path, iblock); 3466 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3439 ar.logical = iblock; 3467 ar.logical = map->m_lblk;
3440 ar.len = allocated; 3468 ar.len = allocated;
3441 if (S_ISREG(inode->i_mode)) 3469 if (S_ISREG(inode->i_mode))
3442 ar.flags = EXT4_MB_HINT_DATA; 3470 ar.flags = EXT4_MB_HINT_DATA;
@@ -3470,21 +3498,33 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3470 EXT4_STATE_DIO_UNWRITTEN); 3498 EXT4_STATE_DIO_UNWRITTEN);
3471 } 3499 }
3472 if (ext4_should_dioread_nolock(inode)) 3500 if (ext4_should_dioread_nolock(inode))
3473 set_buffer_uninit(bh_result); 3501 map->m_flags |= EXT4_MAP_UNINIT;
3474 } 3502 }
3475 3503
3476 if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) { 3504 if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
3477 if (unlikely(!eh->eh_entries)) { 3505 if (unlikely(!eh->eh_entries)) {
3478 EXT4_ERROR_INODE(inode, 3506 EXT4_ERROR_INODE(inode,
3479 "eh->eh_entries == 0 ee_block %d", 3507 "eh->eh_entries == 0 and "
3480 ex->ee_block); 3508 "EOFBLOCKS_FL set");
3481 err = -EIO; 3509 err = -EIO;
3482 goto out2; 3510 goto out2;
3483 } 3511 }
3484 last_ex = EXT_LAST_EXTENT(eh); 3512 last_ex = EXT_LAST_EXTENT(eh);
3485 if (iblock + ar.len > le32_to_cpu(last_ex->ee_block) 3513 /*
3486 + ext4_ext_get_actual_len(last_ex)) 3514 * If the current leaf block was reached by looking at
3487 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; 3515 * the last index block all the way down the tree, and
3516 * we are extending the inode beyond the last extent
3517 * in the current leaf block, then clear the
3518 * EOFBLOCKS_FL flag.
3519 */
3520 for (i = depth-1; i >= 0; i--) {
3521 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3522 break;
3523 }
3524 if ((i < 0) &&
3525 (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
3526 ext4_ext_get_actual_len(last_ex)))
3527 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3488 } 3528 }
3489 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3529 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3490 if (err) { 3530 if (err) {
@@ -3500,9 +3540,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3500 /* previous routine could use block we allocated */ 3540 /* previous routine could use block we allocated */
3501 newblock = ext_pblock(&newex); 3541 newblock = ext_pblock(&newex);
3502 allocated = ext4_ext_get_actual_len(&newex); 3542 allocated = ext4_ext_get_actual_len(&newex);
3503 if (allocated > max_blocks) 3543 if (allocated > map->m_len)
3504 allocated = max_blocks; 3544 allocated = map->m_len;
3505 set_buffer_new(bh_result); 3545 map->m_flags |= EXT4_MAP_NEW;
3506 3546
3507 /* 3547 /*
3508 * Update reserved blocks/metadata blocks after successful 3548 * Update reserved blocks/metadata blocks after successful
@@ -3516,18 +3556,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3516 * when it is _not_ an uninitialized extent. 3556 * when it is _not_ an uninitialized extent.
3517 */ 3557 */
3518 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3558 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3519 ext4_ext_put_in_cache(inode, iblock, allocated, newblock, 3559 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
3520 EXT4_EXT_CACHE_EXTENT); 3560 EXT4_EXT_CACHE_EXTENT);
3521 ext4_update_inode_fsync_trans(handle, inode, 1); 3561 ext4_update_inode_fsync_trans(handle, inode, 1);
3522 } else 3562 } else
3523 ext4_update_inode_fsync_trans(handle, inode, 0); 3563 ext4_update_inode_fsync_trans(handle, inode, 0);
3524out: 3564out:
3525 if (allocated > max_blocks) 3565 if (allocated > map->m_len)
3526 allocated = max_blocks; 3566 allocated = map->m_len;
3527 ext4_ext_show_leaf(inode, path); 3567 ext4_ext_show_leaf(inode, path);
3528 set_buffer_mapped(bh_result); 3568 map->m_flags |= EXT4_MAP_MAPPED;
3529 bh_result->b_bdev = inode->i_sb->s_bdev; 3569 map->m_pblk = newblock;
3530 bh_result->b_blocknr = newblock; 3570 map->m_len = allocated;
3531out2: 3571out2:
3532 if (path) { 3572 if (path) {
3533 ext4_ext_drop_refs(path); 3573 ext4_ext_drop_refs(path);
@@ -3625,7 +3665,7 @@ static void ext4_falloc_update_inode(struct inode *inode,
3625 * can proceed even if the new size is the same as i_size. 3665 * can proceed even if the new size is the same as i_size.
3626 */ 3666 */
3627 if (new_size > i_size_read(inode)) 3667 if (new_size > i_size_read(inode))
3628 EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL; 3668 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3629 } 3669 }
3630 3670
3631} 3671}
@@ -3640,55 +3680,57 @@ static void ext4_falloc_update_inode(struct inode *inode,
3640long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) 3680long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3641{ 3681{
3642 handle_t *handle; 3682 handle_t *handle;
3643 ext4_lblk_t block;
3644 loff_t new_size; 3683 loff_t new_size;
3645 unsigned int max_blocks; 3684 unsigned int max_blocks;
3646 int ret = 0; 3685 int ret = 0;
3647 int ret2 = 0; 3686 int ret2 = 0;
3648 int retries = 0; 3687 int retries = 0;
3649 struct buffer_head map_bh; 3688 struct ext4_map_blocks map;
3650 unsigned int credits, blkbits = inode->i_blkbits; 3689 unsigned int credits, blkbits = inode->i_blkbits;
3651 3690
3652 /* 3691 /*
3653 * currently supporting (pre)allocate mode for extent-based 3692 * currently supporting (pre)allocate mode for extent-based
3654 * files _only_ 3693 * files _only_
3655 */ 3694 */
3656 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 3695 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3657 return -EOPNOTSUPP; 3696 return -EOPNOTSUPP;
3658 3697
3659 /* preallocation to directories is currently not supported */ 3698 /* preallocation to directories is currently not supported */
3660 if (S_ISDIR(inode->i_mode)) 3699 if (S_ISDIR(inode->i_mode))
3661 return -ENODEV; 3700 return -ENODEV;
3662 3701
3663 block = offset >> blkbits; 3702 map.m_lblk = offset >> blkbits;
3664 /* 3703 /*
3665 * We can't just convert len to max_blocks because 3704 * We can't just convert len to max_blocks because
3666 * If blocksize = 4096 offset = 3072 and len = 2048 3705 * If blocksize = 4096 offset = 3072 and len = 2048
3667 */ 3706 */
3668 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3707 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3669 - block; 3708 - map.m_lblk;
3670 /* 3709 /*
3671 * credits to insert 1 extent into extent tree 3710 * credits to insert 1 extent into extent tree
3672 */ 3711 */
3673 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3712 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3674 mutex_lock(&inode->i_mutex); 3713 mutex_lock(&inode->i_mutex);
3714 ret = inode_newsize_ok(inode, (len + offset));
3715 if (ret) {
3716 mutex_unlock(&inode->i_mutex);
3717 return ret;
3718 }
3675retry: 3719retry:
3676 while (ret >= 0 && ret < max_blocks) { 3720 while (ret >= 0 && ret < max_blocks) {
3677 block = block + ret; 3721 map.m_lblk = map.m_lblk + ret;
3678 max_blocks = max_blocks - ret; 3722 map.m_len = max_blocks = max_blocks - ret;
3679 handle = ext4_journal_start(inode, credits); 3723 handle = ext4_journal_start(inode, credits);
3680 if (IS_ERR(handle)) { 3724 if (IS_ERR(handle)) {
3681 ret = PTR_ERR(handle); 3725 ret = PTR_ERR(handle);
3682 break; 3726 break;
3683 } 3727 }
3684 map_bh.b_state = 0; 3728 ret = ext4_map_blocks(handle, inode, &map,
3685 ret = ext4_get_blocks(handle, inode, block,
3686 max_blocks, &map_bh,
3687 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3729 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
3688 if (ret <= 0) { 3730 if (ret <= 0) {
3689#ifdef EXT4FS_DEBUG 3731#ifdef EXT4FS_DEBUG
3690 WARN_ON(ret <= 0); 3732 WARN_ON(ret <= 0);
3691 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3733 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3692 "returned error inode#%lu, block=%u, " 3734 "returned error inode#%lu, block=%u, "
3693 "max_blocks=%u", __func__, 3735 "max_blocks=%u", __func__,
3694 inode->i_ino, block, max_blocks); 3736 inode->i_ino, block, max_blocks);
@@ -3697,14 +3739,14 @@ retry:
3697 ret2 = ext4_journal_stop(handle); 3739 ret2 = ext4_journal_stop(handle);
3698 break; 3740 break;
3699 } 3741 }
3700 if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len, 3742 if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
3701 blkbits) >> blkbits)) 3743 blkbits) >> blkbits))
3702 new_size = offset + len; 3744 new_size = offset + len;
3703 else 3745 else
3704 new_size = (block + ret) << blkbits; 3746 new_size = (map.m_lblk + ret) << blkbits;
3705 3747
3706 ext4_falloc_update_inode(inode, mode, new_size, 3748 ext4_falloc_update_inode(inode, mode, new_size,
3707 buffer_new(&map_bh)); 3749 (map.m_flags & EXT4_MAP_NEW));
3708 ext4_mark_inode_dirty(handle, inode); 3750 ext4_mark_inode_dirty(handle, inode);
3709 ret2 = ext4_journal_stop(handle); 3751 ret2 = ext4_journal_stop(handle);
3710 if (ret2) 3752 if (ret2)
@@ -3733,42 +3775,39 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3733 ssize_t len) 3775 ssize_t len)
3734{ 3776{
3735 handle_t *handle; 3777 handle_t *handle;
3736 ext4_lblk_t block;
3737 unsigned int max_blocks; 3778 unsigned int max_blocks;
3738 int ret = 0; 3779 int ret = 0;
3739 int ret2 = 0; 3780 int ret2 = 0;
3740 struct buffer_head map_bh; 3781 struct ext4_map_blocks map;
3741 unsigned int credits, blkbits = inode->i_blkbits; 3782 unsigned int credits, blkbits = inode->i_blkbits;
3742 3783
3743 block = offset >> blkbits; 3784 map.m_lblk = offset >> blkbits;
3744 /* 3785 /*
3745 * We can't just convert len to max_blocks because 3786 * We can't just convert len to max_blocks because
3746 * If blocksize = 4096 offset = 3072 and len = 2048 3787 * If blocksize = 4096 offset = 3072 and len = 2048
3747 */ 3788 */
3748 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3789 max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
3749 - block; 3790 map.m_lblk);
3750 /* 3791 /*
3751 * credits to insert 1 extent into extent tree 3792 * credits to insert 1 extent into extent tree
3752 */ 3793 */
3753 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3794 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3754 while (ret >= 0 && ret < max_blocks) { 3795 while (ret >= 0 && ret < max_blocks) {
3755 block = block + ret; 3796 map.m_lblk += ret;
3756 max_blocks = max_blocks - ret; 3797 map.m_len = (max_blocks -= ret);
3757 handle = ext4_journal_start(inode, credits); 3798 handle = ext4_journal_start(inode, credits);
3758 if (IS_ERR(handle)) { 3799 if (IS_ERR(handle)) {
3759 ret = PTR_ERR(handle); 3800 ret = PTR_ERR(handle);
3760 break; 3801 break;
3761 } 3802 }
3762 map_bh.b_state = 0; 3803 ret = ext4_map_blocks(handle, inode, &map,
3763 ret = ext4_get_blocks(handle, inode, block,
3764 max_blocks, &map_bh,
3765 EXT4_GET_BLOCKS_IO_CONVERT_EXT); 3804 EXT4_GET_BLOCKS_IO_CONVERT_EXT);
3766 if (ret <= 0) { 3805 if (ret <= 0) {
3767 WARN_ON(ret <= 0); 3806 WARN_ON(ret <= 0);
3768 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3807 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3769 "returned error inode#%lu, block=%u, " 3808 "returned error inode#%lu, block=%u, "
3770 "max_blocks=%u", __func__, 3809 "max_blocks=%u", __func__,
3771 inode->i_ino, block, max_blocks); 3810 inode->i_ino, map.m_lblk, map.m_len);
3772 } 3811 }
3773 ext4_mark_inode_dirty(handle, inode); 3812 ext4_mark_inode_dirty(handle, inode);
3774 ret2 = ext4_journal_stop(handle); 3813 ret2 = ext4_journal_stop(handle);
@@ -3898,7 +3937,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3898 int error = 0; 3937 int error = 0;
3899 3938
3900 /* fallback to generic here if not in extents fmt */ 3939 /* fallback to generic here if not in extents fmt */
3901 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 3940 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3902 return generic_block_fiemap(inode, fieinfo, start, len, 3941 return generic_block_fiemap(inode, fieinfo, start, len,
3903 ext4_get_block); 3942 ext4_get_block);
3904 3943
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d0776e410f34..5313ae4cda2d 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -66,7 +66,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
66 * is smaller than s_maxbytes, which is for extent-mapped files. 66 * is smaller than s_maxbytes, which is for extent-mapped files.
67 */ 67 */
68 68
69 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 69 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
71 size_t length = iov_length(iov, nr_segs); 71 size_t length = iov_length(iov, nr_segs);
72 72
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index ef3d980e67cb..592adf2e546e 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -35,6 +35,29 @@
35#include <trace/events/ext4.h> 35#include <trace/events/ext4.h>
36 36
37/* 37/*
38 * If we're not journaling and this is a just-created file, we have to
39 * sync our parent directory (if it was freshly created) since
40 * otherwise it will only be written by writeback, leaving a huge
41 * window during which a crash may lose the file. This may apply for
42 * the parent directory's parent as well, and so on recursively, if
43 * they are also freshly created.
44 */
45static void ext4_sync_parent(struct inode *inode)
46{
47 struct dentry *dentry = NULL;
48
49 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
50 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
51 dentry = list_entry(inode->i_dentry.next,
52 struct dentry, d_alias);
53 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
54 break;
55 inode = dentry->d_parent->d_inode;
56 sync_mapping_buffers(inode->i_mapping);
57 }
58}
59
60/*
38 * akpm: A new design for ext4_sync_file(). 61 * akpm: A new design for ext4_sync_file().
39 * 62 *
40 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). 63 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
@@ -48,9 +71,9 @@
48 * i_mutex lock is held when entering and exiting this function 71 * i_mutex lock is held when entering and exiting this function
49 */ 72 */
50 73
51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 74int ext4_sync_file(struct file *file, int datasync)
52{ 75{
53 struct inode *inode = dentry->d_inode; 76 struct inode *inode = file->f_mapping->host;
54 struct ext4_inode_info *ei = EXT4_I(inode); 77 struct ext4_inode_info *ei = EXT4_I(inode);
55 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 78 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
56 int ret; 79 int ret;
@@ -58,7 +81,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
58 81
59 J_ASSERT(ext4_journal_current_handle() == NULL); 82 J_ASSERT(ext4_journal_current_handle() == NULL);
60 83
61 trace_ext4_sync_file(file, dentry, datasync); 84 trace_ext4_sync_file(file, datasync);
62 85
63 if (inode->i_sb->s_flags & MS_RDONLY) 86 if (inode->i_sb->s_flags & MS_RDONLY)
64 return 0; 87 return 0;
@@ -66,9 +89,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
66 ret = flush_completed_IO(inode); 89 ret = flush_completed_IO(inode);
67 if (ret < 0) 90 if (ret < 0)
68 return ret; 91 return ret;
69 92
70 if (!journal) 93 if (!journal) {
71 return simple_fsync(file, dentry, datasync); 94 ret = generic_file_fsync(file, datasync);
95 if (!ret && !list_empty(&inode->i_dentry))
96 ext4_sync_parent(inode);
97 return ret;
98 }
72 99
73 /* 100 /*
74 * data=writeback,ordered: 101 * data=writeback,ordered:
@@ -102,7 +129,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
102 (journal->j_flags & JBD2_BARRIER)) 129 (journal->j_flags & JBD2_BARRIER))
103 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, 130 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
104 NULL, BLKDEV_IFL_WAIT); 131 NULL, BLKDEV_IFL_WAIT);
105 jbd2_log_wait_commit(journal, commit_tid); 132 ret = jbd2_log_wait_commit(journal, commit_tid);
106 } else if (journal->j_flags & JBD2_BARRIER) 133 } else if (journal->j_flags & JBD2_BARRIER)
107 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 134 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
108 BLKDEV_IFL_WAIT); 135 BLKDEV_IFL_WAIT);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1a0e183a2f04..25c4b3173fd9 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -240,56 +240,49 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
240 if (fatal) 240 if (fatal)
241 goto error_return; 241 goto error_return;
242 242
243 /* Ok, now we can actually update the inode bitmaps.. */ 243 fatal = -ESRCH;
244 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), 244 gdp = ext4_get_group_desc(sb, block_group, &bh2);
245 bit, bitmap_bh->b_data); 245 if (gdp) {
246 if (!cleared)
247 ext4_error(sb, "bit already cleared for inode %lu", ino);
248 else {
249 gdp = ext4_get_group_desc(sb, block_group, &bh2);
250
251 BUFFER_TRACE(bh2, "get_write_access"); 246 BUFFER_TRACE(bh2, "get_write_access");
252 fatal = ext4_journal_get_write_access(handle, bh2); 247 fatal = ext4_journal_get_write_access(handle, bh2);
253 if (fatal) goto error_return; 248 }
254 249 ext4_lock_group(sb, block_group);
255 if (gdp) { 250 cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
256 ext4_lock_group(sb, block_group); 251 if (fatal || !cleared) {
257 count = ext4_free_inodes_count(sb, gdp) + 1; 252 ext4_unlock_group(sb, block_group);
258 ext4_free_inodes_set(sb, gdp, count); 253 goto out;
259 if (is_directory) { 254 }
260 count = ext4_used_dirs_count(sb, gdp) - 1;
261 ext4_used_dirs_set(sb, gdp, count);
262 if (sbi->s_log_groups_per_flex) {
263 ext4_group_t f;
264
265 f = ext4_flex_group(sbi, block_group);
266 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
267 }
268 255
269 } 256 count = ext4_free_inodes_count(sb, gdp) + 1;
270 gdp->bg_checksum = ext4_group_desc_csum(sbi, 257 ext4_free_inodes_set(sb, gdp, count);
271 block_group, gdp); 258 if (is_directory) {
272 ext4_unlock_group(sb, block_group); 259 count = ext4_used_dirs_count(sb, gdp) - 1;
273 percpu_counter_inc(&sbi->s_freeinodes_counter); 260 ext4_used_dirs_set(sb, gdp, count);
274 if (is_directory) 261 percpu_counter_dec(&sbi->s_dirs_counter);
275 percpu_counter_dec(&sbi->s_dirs_counter);
276
277 if (sbi->s_log_groups_per_flex) {
278 ext4_group_t f;
279
280 f = ext4_flex_group(sbi, block_group);
281 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
282 }
283 }
284 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
285 err = ext4_handle_dirty_metadata(handle, NULL, bh2);
286 if (!fatal) fatal = err;
287 } 262 }
288 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); 263 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
289 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 264 ext4_unlock_group(sb, block_group);
290 if (!fatal) 265
291 fatal = err; 266 percpu_counter_inc(&sbi->s_freeinodes_counter);
292 sb->s_dirt = 1; 267 if (sbi->s_log_groups_per_flex) {
268 ext4_group_t f = ext4_flex_group(sbi, block_group);
269
270 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
271 if (is_directory)
272 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
273 }
274 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
275 fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
276out:
277 if (cleared) {
278 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
279 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
280 if (!fatal)
281 fatal = err;
282 sb->s_dirt = 1;
283 } else
284 ext4_error(sb, "bit already cleared for inode %lu", ino);
285
293error_return: 286error_return:
294 brelse(bitmap_bh); 287 brelse(bitmap_bh);
295 ext4_std_error(sb, fatal); 288 ext4_std_error(sb, fatal);
@@ -499,7 +492,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
499 492
500 if (S_ISDIR(mode) && 493 if (S_ISDIR(mode) &&
501 ((parent == sb->s_root->d_inode) || 494 ((parent == sb->s_root->d_inode) ||
502 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) { 495 (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
503 int best_ndir = inodes_per_group; 496 int best_ndir = inodes_per_group;
504 int ret = -1; 497 int ret = -1;
505 498
@@ -1041,7 +1034,7 @@ got:
1041 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 1034 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
1042 /* set extent flag only for directory, file and normal symlink*/ 1035 /* set extent flag only for directory, file and normal symlink*/
1043 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { 1036 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
1044 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; 1037 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
1045 ext4_ext_tree_init(handle, inode); 1038 ext4_ext_tree_init(handle, inode);
1046 } 1039 }
1047 } 1040 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3e0f6af9d08d..19df61c321fd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -149,7 +149,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
149 int ret; 149 int ret;
150 150
151 /* 151 /*
152 * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this 152 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
153 * moment, get_block can be called only for blocks inside i_size since 153 * moment, get_block can be called only for blocks inside i_size since
154 * page cache has been already dropped and writes are blocked by 154 * page cache has been already dropped and writes are blocked by
155 * i_mutex. So we can safely drop the i_data_sem here. 155 * i_mutex. So we can safely drop the i_data_sem here.
@@ -348,9 +348,8 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
348 if (blk && 348 if (blk &&
349 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 349 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
350 blk, 1))) { 350 blk, 1))) {
351 __ext4_error(inode->i_sb, function, 351 ext4_error_inode(function, inode,
352 "invalid block reference %u " 352 "invalid block reference %u", blk);
353 "in inode #%lu", blk, inode->i_ino);
354 return -EIO; 353 return -EIO;
355 } 354 }
356 } 355 }
@@ -785,7 +784,7 @@ failed:
785 /* Allocation failed, free what we already allocated */ 784 /* Allocation failed, free what we already allocated */
786 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); 785 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
787 for (i = 1; i <= n ; i++) { 786 for (i = 1; i <= n ; i++) {
788 /* 787 /*
789 * branch[i].bh is newly allocated, so there is no 788 * branch[i].bh is newly allocated, so there is no
790 * need to revoke the block, which is why we don't 789 * need to revoke the block, which is why we don't
791 * need to set EXT4_FREE_BLOCKS_METADATA. 790 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -875,7 +874,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
875 874
876err_out: 875err_out:
877 for (i = 1; i <= num; i++) { 876 for (i = 1; i <= num; i++) {
878 /* 877 /*
879 * branch[i].bh is newly allocated, so there is no 878 * branch[i].bh is newly allocated, so there is no
880 * need to revoke the block, which is why we don't 879 * need to revoke the block, which is why we don't
881 * need to set EXT4_FREE_BLOCKS_METADATA. 880 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -890,9 +889,9 @@ err_out:
890} 889}
891 890
892/* 891/*
893 * The ext4_ind_get_blocks() function handles non-extents inodes 892 * The ext4_ind_map_blocks() function handles non-extents inodes
894 * (i.e., using the traditional indirect/double-indirect i_blocks 893 * (i.e., using the traditional indirect/double-indirect i_blocks
895 * scheme) for ext4_get_blocks(). 894 * scheme) for ext4_map_blocks().
896 * 895 *
897 * Allocation strategy is simple: if we have to allocate something, we will 896 * Allocation strategy is simple: if we have to allocate something, we will
898 * have to go the whole way to leaf. So let's do it before attaching anything 897 * have to go the whole way to leaf. So let's do it before attaching anything
@@ -917,9 +916,8 @@ err_out:
917 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system 916 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
918 * blocks. 917 * blocks.
919 */ 918 */
920static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, 919static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
921 ext4_lblk_t iblock, unsigned int maxblocks, 920 struct ext4_map_blocks *map,
922 struct buffer_head *bh_result,
923 int flags) 921 int flags)
924{ 922{
925 int err = -EIO; 923 int err = -EIO;
@@ -933,9 +931,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
933 int count = 0; 931 int count = 0;
934 ext4_fsblk_t first_block = 0; 932 ext4_fsblk_t first_block = 0;
935 933
936 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 934 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
937 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 935 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
938 depth = ext4_block_to_path(inode, iblock, offsets, 936 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
939 &blocks_to_boundary); 937 &blocks_to_boundary);
940 938
941 if (depth == 0) 939 if (depth == 0)
@@ -946,10 +944,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
946 /* Simplest case - block found, no allocation needed */ 944 /* Simplest case - block found, no allocation needed */
947 if (!partial) { 945 if (!partial) {
948 first_block = le32_to_cpu(chain[depth - 1].key); 946 first_block = le32_to_cpu(chain[depth - 1].key);
949 clear_buffer_new(bh_result);
950 count++; 947 count++;
951 /*map more blocks*/ 948 /*map more blocks*/
952 while (count < maxblocks && count <= blocks_to_boundary) { 949 while (count < map->m_len && count <= blocks_to_boundary) {
953 ext4_fsblk_t blk; 950 ext4_fsblk_t blk;
954 951
955 blk = le32_to_cpu(*(chain[depth-1].p + count)); 952 blk = le32_to_cpu(*(chain[depth-1].p + count));
@@ -969,7 +966,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
969 /* 966 /*
970 * Okay, we need to do block allocation. 967 * Okay, we need to do block allocation.
971 */ 968 */
972 goal = ext4_find_goal(inode, iblock, partial); 969 goal = ext4_find_goal(inode, map->m_lblk, partial);
973 970
974 /* the number of blocks need to allocate for [d,t]indirect blocks */ 971 /* the number of blocks need to allocate for [d,t]indirect blocks */
975 indirect_blks = (chain + depth) - partial - 1; 972 indirect_blks = (chain + depth) - partial - 1;
@@ -979,11 +976,11 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
979 * direct blocks to allocate for this branch. 976 * direct blocks to allocate for this branch.
980 */ 977 */
981 count = ext4_blks_to_allocate(partial, indirect_blks, 978 count = ext4_blks_to_allocate(partial, indirect_blks,
982 maxblocks, blocks_to_boundary); 979 map->m_len, blocks_to_boundary);
983 /* 980 /*
984 * Block out ext4_truncate while we alter the tree 981 * Block out ext4_truncate while we alter the tree
985 */ 982 */
986 err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, 983 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
987 &count, goal, 984 &count, goal,
988 offsets + (partial - chain), partial); 985 offsets + (partial - chain), partial);
989 986
@@ -995,18 +992,20 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
995 * may need to return -EAGAIN upwards in the worst case. --sct 992 * may need to return -EAGAIN upwards in the worst case. --sct
996 */ 993 */
997 if (!err) 994 if (!err)
998 err = ext4_splice_branch(handle, inode, iblock, 995 err = ext4_splice_branch(handle, inode, map->m_lblk,
999 partial, indirect_blks, count); 996 partial, indirect_blks, count);
1000 if (err) 997 if (err)
1001 goto cleanup; 998 goto cleanup;
1002 999
1003 set_buffer_new(bh_result); 1000 map->m_flags |= EXT4_MAP_NEW;
1004 1001
1005 ext4_update_inode_fsync_trans(handle, inode, 1); 1002 ext4_update_inode_fsync_trans(handle, inode, 1);
1006got_it: 1003got_it:
1007 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 1004 map->m_flags |= EXT4_MAP_MAPPED;
1005 map->m_pblk = le32_to_cpu(chain[depth-1].key);
1006 map->m_len = count;
1008 if (count > blocks_to_boundary) 1007 if (count > blocks_to_boundary)
1009 set_buffer_boundary(bh_result); 1008 map->m_flags |= EXT4_MAP_BOUNDARY;
1010 err = count; 1009 err = count;
1011 /* Clean up and exit */ 1010 /* Clean up and exit */
1012 partial = chain + depth - 1; /* the whole chain */ 1011 partial = chain + depth - 1; /* the whole chain */
@@ -1016,7 +1015,6 @@ cleanup:
1016 brelse(partial->bh); 1015 brelse(partial->bh);
1017 partial--; 1016 partial--;
1018 } 1017 }
1019 BUFFER_TRACE(bh_result, "returned");
1020out: 1018out:
1021 return err; 1019 return err;
1022} 1020}
@@ -1061,7 +1059,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1061 */ 1059 */
1062static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) 1060static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
1063{ 1061{
1064 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 1062 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1065 return ext4_ext_calc_metadata_amount(inode, lblock); 1063 return ext4_ext_calc_metadata_amount(inode, lblock);
1066 1064
1067 return ext4_indirect_calc_metadata_amount(inode, lblock); 1065 return ext4_indirect_calc_metadata_amount(inode, lblock);
@@ -1076,7 +1074,6 @@ void ext4_da_update_reserve_space(struct inode *inode,
1076{ 1074{
1077 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1075 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1078 struct ext4_inode_info *ei = EXT4_I(inode); 1076 struct ext4_inode_info *ei = EXT4_I(inode);
1079 int mdb_free = 0, allocated_meta_blocks = 0;
1080 1077
1081 spin_lock(&ei->i_block_reservation_lock); 1078 spin_lock(&ei->i_block_reservation_lock);
1082 trace_ext4_da_update_reserve_space(inode, used); 1079 trace_ext4_da_update_reserve_space(inode, used);
@@ -1091,11 +1088,10 @@ void ext4_da_update_reserve_space(struct inode *inode,
1091 1088
1092 /* Update per-inode reservations */ 1089 /* Update per-inode reservations */
1093 ei->i_reserved_data_blocks -= used; 1090 ei->i_reserved_data_blocks -= used;
1094 used += ei->i_allocated_meta_blocks;
1095 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; 1091 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
1096 allocated_meta_blocks = ei->i_allocated_meta_blocks; 1092 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1093 used + ei->i_allocated_meta_blocks);
1097 ei->i_allocated_meta_blocks = 0; 1094 ei->i_allocated_meta_blocks = 0;
1098 percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
1099 1095
1100 if (ei->i_reserved_data_blocks == 0) { 1096 if (ei->i_reserved_data_blocks == 0) {
1101 /* 1097 /*
@@ -1103,30 +1099,23 @@ void ext4_da_update_reserve_space(struct inode *inode,
1103 * only when we have written all of the delayed 1099 * only when we have written all of the delayed
1104 * allocation blocks. 1100 * allocation blocks.
1105 */ 1101 */
1106 mdb_free = ei->i_reserved_meta_blocks; 1102 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1103 ei->i_reserved_meta_blocks);
1107 ei->i_reserved_meta_blocks = 0; 1104 ei->i_reserved_meta_blocks = 0;
1108 ei->i_da_metadata_calc_len = 0; 1105 ei->i_da_metadata_calc_len = 0;
1109 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
1110 } 1106 }
1111 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1107 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1112 1108
1113 /* Update quota subsystem */ 1109 /* Update quota subsystem for data blocks */
1114 if (quota_claim) { 1110 if (quota_claim)
1115 dquot_claim_block(inode, used); 1111 dquot_claim_block(inode, used);
1116 if (mdb_free) 1112 else {
1117 dquot_release_reservation_block(inode, mdb_free);
1118 } else {
1119 /* 1113 /*
1120 * We did fallocate with an offset that is already delayed 1114 * We did fallocate with an offset that is already delayed
1121 * allocated. So on delayed allocated writeback we should 1115 * allocated. So on delayed allocated writeback we should
1122 * not update the quota for allocated blocks. But then 1116 * not re-claim the quota for fallocated blocks.
1123 * converting an fallocate region to initialized region would
1124 * have caused a metadata allocation. So claim quota for
1125 * that
1126 */ 1117 */
1127 if (allocated_meta_blocks) 1118 dquot_release_reservation_block(inode, used);
1128 dquot_claim_block(inode, allocated_meta_blocks);
1129 dquot_release_reservation_block(inode, mdb_free + used);
1130 } 1119 }
1131 1120
1132 /* 1121 /*
@@ -1139,15 +1128,15 @@ void ext4_da_update_reserve_space(struct inode *inode,
1139 ext4_discard_preallocations(inode); 1128 ext4_discard_preallocations(inode);
1140} 1129}
1141 1130
1142static int check_block_validity(struct inode *inode, const char *msg, 1131static int check_block_validity(struct inode *inode, const char *func,
1143 sector_t logical, sector_t phys, int len) 1132 struct ext4_map_blocks *map)
1144{ 1133{
1145 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1134 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
1146 __ext4_error(inode->i_sb, msg, 1135 map->m_len)) {
1147 "inode #%lu logical block %llu mapped to %llu " 1136 ext4_error_inode(func, inode,
1148 "(size %d)", inode->i_ino, 1137 "lblock %lu mapped to illegal pblock %llu "
1149 (unsigned long long) logical, 1138 "(length %d)", (unsigned long) map->m_lblk,
1150 (unsigned long long) phys, len); 1139 map->m_pblk, map->m_len);
1151 return -EIO; 1140 return -EIO;
1152 } 1141 }
1153 return 0; 1142 return 0;
@@ -1212,15 +1201,15 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1212} 1201}
1213 1202
1214/* 1203/*
1215 * The ext4_get_blocks() function tries to look up the requested blocks, 1204 * The ext4_map_blocks() function tries to look up the requested blocks,
1216 * and returns if the blocks are already mapped. 1205 * and returns if the blocks are already mapped.
1217 * 1206 *
1218 * Otherwise it takes the write lock of the i_data_sem and allocate blocks 1207 * Otherwise it takes the write lock of the i_data_sem and allocate blocks
1219 * and store the allocated blocks in the result buffer head and mark it 1208 * and store the allocated blocks in the result buffer head and mark it
1220 * mapped. 1209 * mapped.
1221 * 1210 *
1222 * If file type is extents based, it will call ext4_ext_get_blocks(), 1211 * If file type is extents based, it will call ext4_ext_map_blocks(),
1223 * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping 1212 * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
1224 * based files 1213 * based files
1225 * 1214 *
1226 * On success, it returns the number of blocks being mapped or allocate. 1215 * On success, it returns the number of blocks being mapped or allocate.
@@ -1233,35 +1222,29 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1233 * 1222 *
1234 * It returns the error in case of allocation failure. 1223 * It returns the error in case of allocation failure.
1235 */ 1224 */
1236int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, 1225int ext4_map_blocks(handle_t *handle, struct inode *inode,
1237 unsigned int max_blocks, struct buffer_head *bh, 1226 struct ext4_map_blocks *map, int flags)
1238 int flags)
1239{ 1227{
1240 int retval; 1228 int retval;
1241 1229
1242 clear_buffer_mapped(bh); 1230 map->m_flags = 0;
1243 clear_buffer_unwritten(bh); 1231 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
1244 1232 "logical block %lu\n", inode->i_ino, flags, map->m_len,
1245 ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u," 1233 (unsigned long) map->m_lblk);
1246 "logical block %lu\n", inode->i_ino, flags, max_blocks,
1247 (unsigned long)block);
1248 /* 1234 /*
1249 * Try to see if we can get the block without requesting a new 1235 * Try to see if we can get the block without requesting a new
1250 * file system block. 1236 * file system block.
1251 */ 1237 */
1252 down_read((&EXT4_I(inode)->i_data_sem)); 1238 down_read((&EXT4_I(inode)->i_data_sem));
1253 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1239 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1254 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1240 retval = ext4_ext_map_blocks(handle, inode, map, 0);
1255 bh, 0);
1256 } else { 1241 } else {
1257 retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, 1242 retval = ext4_ind_map_blocks(handle, inode, map, 0);
1258 bh, 0);
1259 } 1243 }
1260 up_read((&EXT4_I(inode)->i_data_sem)); 1244 up_read((&EXT4_I(inode)->i_data_sem));
1261 1245
1262 if (retval > 0 && buffer_mapped(bh)) { 1246 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1263 int ret = check_block_validity(inode, "file system corruption", 1247 int ret = check_block_validity(inode, __func__, map);
1264 block, bh->b_blocknr, retval);
1265 if (ret != 0) 1248 if (ret != 0)
1266 return ret; 1249 return ret;
1267 } 1250 }
@@ -1277,7 +1260,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1277 * ext4_ext_get_block() returns th create = 0 1260 * ext4_ext_get_block() returns th create = 0
1278 * with buffer head unmapped. 1261 * with buffer head unmapped.
1279 */ 1262 */
1280 if (retval > 0 && buffer_mapped(bh)) 1263 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
1281 return retval; 1264 return retval;
1282 1265
1283 /* 1266 /*
@@ -1290,7 +1273,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1290 * of BH_Unwritten and BH_Mapped flags being simultaneously 1273 * of BH_Unwritten and BH_Mapped flags being simultaneously
1291 * set on the buffer_head. 1274 * set on the buffer_head.
1292 */ 1275 */
1293 clear_buffer_unwritten(bh); 1276 map->m_flags &= ~EXT4_MAP_UNWRITTEN;
1294 1277
1295 /* 1278 /*
1296 * New blocks allocate and/or writing to uninitialized extent 1279 * New blocks allocate and/or writing to uninitialized extent
@@ -1312,14 +1295,12 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1312 * We need to check for EXT4 here because migrate 1295 * We need to check for EXT4 here because migrate
1313 * could have changed the inode type in between 1296 * could have changed the inode type in between
1314 */ 1297 */
1315 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1298 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1316 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1299 retval = ext4_ext_map_blocks(handle, inode, map, flags);
1317 bh, flags);
1318 } else { 1300 } else {
1319 retval = ext4_ind_get_blocks(handle, inode, block, 1301 retval = ext4_ind_map_blocks(handle, inode, map, flags);
1320 max_blocks, bh, flags);
1321 1302
1322 if (retval > 0 && buffer_new(bh)) { 1303 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
1323 /* 1304 /*
1324 * We allocated new blocks which will result in 1305 * We allocated new blocks which will result in
1325 * i_data's format changing. Force the migrate 1306 * i_data's format changing. Force the migrate
@@ -1342,10 +1323,10 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1342 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1323 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1343 1324
1344 up_write((&EXT4_I(inode)->i_data_sem)); 1325 up_write((&EXT4_I(inode)->i_data_sem));
1345 if (retval > 0 && buffer_mapped(bh)) { 1326 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1346 int ret = check_block_validity(inode, "file system " 1327 int ret = check_block_validity(inode,
1347 "corruption after allocation", 1328 "ext4_map_blocks_after_alloc",
1348 block, bh->b_blocknr, retval); 1329 map);
1349 if (ret != 0) 1330 if (ret != 0)
1350 return ret; 1331 return ret;
1351 } 1332 }
@@ -1355,109 +1336,109 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1355/* Maximum number of blocks we map for direct IO at once. */ 1336/* Maximum number of blocks we map for direct IO at once. */
1356#define DIO_MAX_BLOCKS 4096 1337#define DIO_MAX_BLOCKS 4096
1357 1338
1358int ext4_get_block(struct inode *inode, sector_t iblock, 1339static int _ext4_get_block(struct inode *inode, sector_t iblock,
1359 struct buffer_head *bh_result, int create) 1340 struct buffer_head *bh, int flags)
1360{ 1341{
1361 handle_t *handle = ext4_journal_current_handle(); 1342 handle_t *handle = ext4_journal_current_handle();
1343 struct ext4_map_blocks map;
1362 int ret = 0, started = 0; 1344 int ret = 0, started = 0;
1363 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1364 int dio_credits; 1345 int dio_credits;
1365 1346
1366 if (create && !handle) { 1347 map.m_lblk = iblock;
1348 map.m_len = bh->b_size >> inode->i_blkbits;
1349
1350 if (flags && !handle) {
1367 /* Direct IO write... */ 1351 /* Direct IO write... */
1368 if (max_blocks > DIO_MAX_BLOCKS) 1352 if (map.m_len > DIO_MAX_BLOCKS)
1369 max_blocks = DIO_MAX_BLOCKS; 1353 map.m_len = DIO_MAX_BLOCKS;
1370 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); 1354 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
1371 handle = ext4_journal_start(inode, dio_credits); 1355 handle = ext4_journal_start(inode, dio_credits);
1372 if (IS_ERR(handle)) { 1356 if (IS_ERR(handle)) {
1373 ret = PTR_ERR(handle); 1357 ret = PTR_ERR(handle);
1374 goto out; 1358 return ret;
1375 } 1359 }
1376 started = 1; 1360 started = 1;
1377 } 1361 }
1378 1362
1379 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, 1363 ret = ext4_map_blocks(handle, inode, &map, flags);
1380 create ? EXT4_GET_BLOCKS_CREATE : 0);
1381 if (ret > 0) { 1364 if (ret > 0) {
1382 bh_result->b_size = (ret << inode->i_blkbits); 1365 map_bh(bh, inode->i_sb, map.m_pblk);
1366 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
1367 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
1383 ret = 0; 1368 ret = 0;
1384 } 1369 }
1385 if (started) 1370 if (started)
1386 ext4_journal_stop(handle); 1371 ext4_journal_stop(handle);
1387out:
1388 return ret; 1372 return ret;
1389} 1373}
1390 1374
1375int ext4_get_block(struct inode *inode, sector_t iblock,
1376 struct buffer_head *bh, int create)
1377{
1378 return _ext4_get_block(inode, iblock, bh,
1379 create ? EXT4_GET_BLOCKS_CREATE : 0);
1380}
1381
1391/* 1382/*
1392 * `handle' can be NULL if create is zero 1383 * `handle' can be NULL if create is zero
1393 */ 1384 */
1394struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 1385struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1395 ext4_lblk_t block, int create, int *errp) 1386 ext4_lblk_t block, int create, int *errp)
1396{ 1387{
1397 struct buffer_head dummy; 1388 struct ext4_map_blocks map;
1389 struct buffer_head *bh;
1398 int fatal = 0, err; 1390 int fatal = 0, err;
1399 int flags = 0;
1400 1391
1401 J_ASSERT(handle != NULL || create == 0); 1392 J_ASSERT(handle != NULL || create == 0);
1402 1393
1403 dummy.b_state = 0; 1394 map.m_lblk = block;
1404 dummy.b_blocknr = -1000; 1395 map.m_len = 1;
1405 buffer_trace_init(&dummy.b_history); 1396 err = ext4_map_blocks(handle, inode, &map,
1406 if (create) 1397 create ? EXT4_GET_BLOCKS_CREATE : 0);
1407 flags |= EXT4_GET_BLOCKS_CREATE; 1398
1408 err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags); 1399 if (err < 0)
1409 /* 1400 *errp = err;
1410 * ext4_get_blocks() returns number of blocks mapped. 0 in 1401 if (err <= 0)
1411 * case of a HOLE. 1402 return NULL;
1412 */ 1403 *errp = 0;
1413 if (err > 0) { 1404
1414 if (err > 1) 1405 bh = sb_getblk(inode->i_sb, map.m_pblk);
1415 WARN_ON(1); 1406 if (!bh) {
1416 err = 0; 1407 *errp = -EIO;
1408 return NULL;
1417 } 1409 }
1418 *errp = err; 1410 if (map.m_flags & EXT4_MAP_NEW) {
1419 if (!err && buffer_mapped(&dummy)) { 1411 J_ASSERT(create != 0);
1420 struct buffer_head *bh; 1412 J_ASSERT(handle != NULL);
1421 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1422 if (!bh) {
1423 *errp = -EIO;
1424 goto err;
1425 }
1426 if (buffer_new(&dummy)) {
1427 J_ASSERT(create != 0);
1428 J_ASSERT(handle != NULL);
1429 1413
1430 /* 1414 /*
1431 * Now that we do not always journal data, we should 1415 * Now that we do not always journal data, we should
1432 * keep in mind whether this should always journal the 1416 * keep in mind whether this should always journal the
1433 * new buffer as metadata. For now, regular file 1417 * new buffer as metadata. For now, regular file
1434 * writes use ext4_get_block instead, so it's not a 1418 * writes use ext4_get_block instead, so it's not a
1435 * problem. 1419 * problem.
1436 */ 1420 */
1437 lock_buffer(bh); 1421 lock_buffer(bh);
1438 BUFFER_TRACE(bh, "call get_create_access"); 1422 BUFFER_TRACE(bh, "call get_create_access");
1439 fatal = ext4_journal_get_create_access(handle, bh); 1423 fatal = ext4_journal_get_create_access(handle, bh);
1440 if (!fatal && !buffer_uptodate(bh)) { 1424 if (!fatal && !buffer_uptodate(bh)) {
1441 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 1425 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1442 set_buffer_uptodate(bh); 1426 set_buffer_uptodate(bh);
1443 }
1444 unlock_buffer(bh);
1445 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1446 err = ext4_handle_dirty_metadata(handle, inode, bh);
1447 if (!fatal)
1448 fatal = err;
1449 } else {
1450 BUFFER_TRACE(bh, "not a new buffer");
1451 }
1452 if (fatal) {
1453 *errp = fatal;
1454 brelse(bh);
1455 bh = NULL;
1456 } 1427 }
1457 return bh; 1428 unlock_buffer(bh);
1429 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1430 err = ext4_handle_dirty_metadata(handle, inode, bh);
1431 if (!fatal)
1432 fatal = err;
1433 } else {
1434 BUFFER_TRACE(bh, "not a new buffer");
1458 } 1435 }
1459err: 1436 if (fatal) {
1460 return NULL; 1437 *errp = fatal;
1438 brelse(bh);
1439 bh = NULL;
1440 }
1441 return bh;
1461} 1442}
1462 1443
1463struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1444struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
@@ -1860,7 +1841,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1860 int retries = 0; 1841 int retries = 0;
1861 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1842 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1862 struct ext4_inode_info *ei = EXT4_I(inode); 1843 struct ext4_inode_info *ei = EXT4_I(inode);
1863 unsigned long md_needed, md_reserved; 1844 unsigned long md_needed;
1864 int ret; 1845 int ret;
1865 1846
1866 /* 1847 /*
@@ -1870,22 +1851,24 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1870 */ 1851 */
1871repeat: 1852repeat:
1872 spin_lock(&ei->i_block_reservation_lock); 1853 spin_lock(&ei->i_block_reservation_lock);
1873 md_reserved = ei->i_reserved_meta_blocks;
1874 md_needed = ext4_calc_metadata_amount(inode, lblock); 1854 md_needed = ext4_calc_metadata_amount(inode, lblock);
1875 trace_ext4_da_reserve_space(inode, md_needed); 1855 trace_ext4_da_reserve_space(inode, md_needed);
1876 spin_unlock(&ei->i_block_reservation_lock); 1856 spin_unlock(&ei->i_block_reservation_lock);
1877 1857
1878 /* 1858 /*
1879 * Make quota reservation here to prevent quota overflow 1859 * We will charge metadata quota at writeout time; this saves
1880 * later. Real quota accounting is done at pages writeout 1860 * us from metadata over-estimation, though we may go over by
1881 * time. 1861 * a small amount in the end. Here we just reserve for data.
1882 */ 1862 */
1883 ret = dquot_reserve_block(inode, md_needed + 1); 1863 ret = dquot_reserve_block(inode, 1);
1884 if (ret) 1864 if (ret)
1885 return ret; 1865 return ret;
1886 1866 /*
1867 * We do still charge estimated metadata to the sb though;
1868 * we cannot afford to run out of free blocks.
1869 */
1887 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1870 if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
1888 dquot_release_reservation_block(inode, md_needed + 1); 1871 dquot_release_reservation_block(inode, 1);
1889 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1872 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1890 yield(); 1873 yield();
1891 goto repeat; 1874 goto repeat;
@@ -1910,6 +1893,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1910 1893
1911 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1894 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1912 1895
1896 trace_ext4_da_release_space(inode, to_free);
1913 if (unlikely(to_free > ei->i_reserved_data_blocks)) { 1897 if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1914 /* 1898 /*
1915 * if there aren't enough reserved blocks, then the 1899 * if there aren't enough reserved blocks, then the
@@ -1932,12 +1916,13 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1932 * only when we have written all of the delayed 1916 * only when we have written all of the delayed
1933 * allocation blocks. 1917 * allocation blocks.
1934 */ 1918 */
1935 to_free += ei->i_reserved_meta_blocks; 1919 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1920 ei->i_reserved_meta_blocks);
1936 ei->i_reserved_meta_blocks = 0; 1921 ei->i_reserved_meta_blocks = 0;
1937 ei->i_da_metadata_calc_len = 0; 1922 ei->i_da_metadata_calc_len = 0;
1938 } 1923 }
1939 1924
1940 /* update fs dirty blocks counter */ 1925 /* update fs dirty data blocks counter */
1941 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); 1926 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
1942 1927
1943 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1928 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2042,28 +2027,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
2042/* 2027/*
2043 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers 2028 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
2044 * 2029 *
2045 * @mpd->inode - inode to walk through
2046 * @exbh->b_blocknr - first block on a disk
2047 * @exbh->b_size - amount of space in bytes
2048 * @logical - first logical block to start assignment with
2049 *
2050 * the function goes through all passed space and put actual disk 2030 * the function goes through all passed space and put actual disk
2051 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten 2031 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
2052 */ 2032 */
2053static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, 2033static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
2054 struct buffer_head *exbh) 2034 struct ext4_map_blocks *map)
2055{ 2035{
2056 struct inode *inode = mpd->inode; 2036 struct inode *inode = mpd->inode;
2057 struct address_space *mapping = inode->i_mapping; 2037 struct address_space *mapping = inode->i_mapping;
2058 int blocks = exbh->b_size >> inode->i_blkbits; 2038 int blocks = map->m_len;
2059 sector_t pblock = exbh->b_blocknr, cur_logical; 2039 sector_t pblock = map->m_pblk, cur_logical;
2060 struct buffer_head *head, *bh; 2040 struct buffer_head *head, *bh;
2061 pgoff_t index, end; 2041 pgoff_t index, end;
2062 struct pagevec pvec; 2042 struct pagevec pvec;
2063 int nr_pages, i; 2043 int nr_pages, i;
2064 2044
2065 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2045 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2066 end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2046 end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2067 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2047 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2068 2048
2069 pagevec_init(&pvec, 0); 2049 pagevec_init(&pvec, 0);
@@ -2090,17 +2070,16 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2090 2070
2091 /* skip blocks out of the range */ 2071 /* skip blocks out of the range */
2092 do { 2072 do {
2093 if (cur_logical >= logical) 2073 if (cur_logical >= map->m_lblk)
2094 break; 2074 break;
2095 cur_logical++; 2075 cur_logical++;
2096 } while ((bh = bh->b_this_page) != head); 2076 } while ((bh = bh->b_this_page) != head);
2097 2077
2098 do { 2078 do {
2099 if (cur_logical >= logical + blocks) 2079 if (cur_logical >= map->m_lblk + blocks)
2100 break; 2080 break;
2101 2081
2102 if (buffer_delay(bh) || 2082 if (buffer_delay(bh) || buffer_unwritten(bh)) {
2103 buffer_unwritten(bh)) {
2104 2083
2105 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); 2084 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
2106 2085
@@ -2119,7 +2098,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2119 } else if (buffer_mapped(bh)) 2098 } else if (buffer_mapped(bh))
2120 BUG_ON(bh->b_blocknr != pblock); 2099 BUG_ON(bh->b_blocknr != pblock);
2121 2100
2122 if (buffer_uninit(exbh)) 2101 if (map->m_flags & EXT4_MAP_UNINIT)
2123 set_buffer_uninit(bh); 2102 set_buffer_uninit(bh);
2124 cur_logical++; 2103 cur_logical++;
2125 pblock++; 2104 pblock++;
@@ -2130,21 +2109,6 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2130} 2109}
2131 2110
2132 2111
2133/*
2134 * __unmap_underlying_blocks - just a helper function to unmap
2135 * set of blocks described by @bh
2136 */
2137static inline void __unmap_underlying_blocks(struct inode *inode,
2138 struct buffer_head *bh)
2139{
2140 struct block_device *bdev = inode->i_sb->s_bdev;
2141 int blocks, i;
2142
2143 blocks = bh->b_size >> inode->i_blkbits;
2144 for (i = 0; i < blocks; i++)
2145 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
2146}
2147
2148static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, 2112static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2149 sector_t logical, long blk_cnt) 2113 sector_t logical, long blk_cnt)
2150{ 2114{
@@ -2206,7 +2170,7 @@ static void ext4_print_free_blocks(struct inode *inode)
2206static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2170static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2207{ 2171{
2208 int err, blks, get_blocks_flags; 2172 int err, blks, get_blocks_flags;
2209 struct buffer_head new; 2173 struct ext4_map_blocks map;
2210 sector_t next = mpd->b_blocknr; 2174 sector_t next = mpd->b_blocknr;
2211 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2175 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2212 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2176 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
@@ -2247,15 +2211,15 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2247 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting 2211 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
2248 * variables are updated after the blocks have been allocated. 2212 * variables are updated after the blocks have been allocated.
2249 */ 2213 */
2250 new.b_state = 0; 2214 map.m_lblk = next;
2215 map.m_len = max_blocks;
2251 get_blocks_flags = EXT4_GET_BLOCKS_CREATE; 2216 get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2252 if (ext4_should_dioread_nolock(mpd->inode)) 2217 if (ext4_should_dioread_nolock(mpd->inode))
2253 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 2218 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2254 if (mpd->b_state & (1 << BH_Delay)) 2219 if (mpd->b_state & (1 << BH_Delay))
2255 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2220 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2256 2221
2257 blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, 2222 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
2258 &new, get_blocks_flags);
2259 if (blks < 0) { 2223 if (blks < 0) {
2260 err = blks; 2224 err = blks;
2261 /* 2225 /*
@@ -2282,7 +2246,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2282 ext4_msg(mpd->inode->i_sb, KERN_CRIT, 2246 ext4_msg(mpd->inode->i_sb, KERN_CRIT,
2283 "delayed block allocation failed for inode %lu at " 2247 "delayed block allocation failed for inode %lu at "
2284 "logical offset %llu with max blocks %zd with " 2248 "logical offset %llu with max blocks %zd with "
2285 "error %d\n", mpd->inode->i_ino, 2249 "error %d", mpd->inode->i_ino,
2286 (unsigned long long) next, 2250 (unsigned long long) next,
2287 mpd->b_size >> mpd->inode->i_blkbits, err); 2251 mpd->b_size >> mpd->inode->i_blkbits, err);
2288 printk(KERN_CRIT "This should not happen!! " 2252 printk(KERN_CRIT "This should not happen!! "
@@ -2297,10 +2261,13 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2297 } 2261 }
2298 BUG_ON(blks == 0); 2262 BUG_ON(blks == 0);
2299 2263
2300 new.b_size = (blks << mpd->inode->i_blkbits); 2264 if (map.m_flags & EXT4_MAP_NEW) {
2265 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2266 int i;
2301 2267
2302 if (buffer_new(&new)) 2268 for (i = 0; i < map.m_len; i++)
2303 __unmap_underlying_blocks(mpd->inode, &new); 2269 unmap_underlying_metadata(bdev, map.m_pblk + i);
2270 }
2304 2271
2305 /* 2272 /*
2306 * If blocks are delayed marked, we need to 2273 * If blocks are delayed marked, we need to
@@ -2308,7 +2275,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2308 */ 2275 */
2309 if ((mpd->b_state & (1 << BH_Delay)) || 2276 if ((mpd->b_state & (1 << BH_Delay)) ||
2310 (mpd->b_state & (1 << BH_Unwritten))) 2277 (mpd->b_state & (1 << BH_Unwritten)))
2311 mpage_put_bnr_to_bhs(mpd, next, &new); 2278 mpage_put_bnr_to_bhs(mpd, &map);
2312 2279
2313 if (ext4_should_order_data(mpd->inode)) { 2280 if (ext4_should_order_data(mpd->inode)) {
2314 err = ext4_jbd2_file_inode(handle, mpd->inode); 2281 err = ext4_jbd2_file_inode(handle, mpd->inode);
@@ -2349,8 +2316,17 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2349 sector_t next; 2316 sector_t next;
2350 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; 2317 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
2351 2318
2319 /*
2320 * XXX Don't go larger than mballoc is willing to allocate
2321 * This is a stopgap solution. We eventually need to fold
2322 * mpage_da_submit_io() into this function and then call
2323 * ext4_get_blocks() multiple times in a loop
2324 */
2325 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
2326 goto flush_it;
2327
2352 /* check if thereserved journal credits might overflow */ 2328 /* check if thereserved journal credits might overflow */
2353 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 2329 if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
2354 if (nrblocks >= EXT4_MAX_TRANS_DATA) { 2330 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
2355 /* 2331 /*
2356 * With non-extent format we are limited by the journal 2332 * With non-extent format we are limited by the journal
@@ -2423,17 +2399,6 @@ static int __mpage_da_writepage(struct page *page,
2423 struct buffer_head *bh, *head; 2399 struct buffer_head *bh, *head;
2424 sector_t logical; 2400 sector_t logical;
2425 2401
2426 if (mpd->io_done) {
2427 /*
2428 * Rest of the page in the page_vec
2429 * redirty then and skip then. We will
2430 * try to write them again after
2431 * starting a new transaction
2432 */
2433 redirty_page_for_writepage(wbc, page);
2434 unlock_page(page);
2435 return MPAGE_DA_EXTENT_TAIL;
2436 }
2437 /* 2402 /*
2438 * Can we merge this page to current extent? 2403 * Can we merge this page to current extent?
2439 */ 2404 */
@@ -2528,8 +2493,9 @@ static int __mpage_da_writepage(struct page *page,
2528 * initialized properly. 2493 * initialized properly.
2529 */ 2494 */
2530static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2495static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2531 struct buffer_head *bh_result, int create) 2496 struct buffer_head *bh, int create)
2532{ 2497{
2498 struct ext4_map_blocks map;
2533 int ret = 0; 2499 int ret = 0;
2534 sector_t invalid_block = ~((sector_t) 0xffff); 2500 sector_t invalid_block = ~((sector_t) 0xffff);
2535 2501
@@ -2537,16 +2503,22 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2537 invalid_block = ~0; 2503 invalid_block = ~0;
2538 2504
2539 BUG_ON(create == 0); 2505 BUG_ON(create == 0);
2540 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2506 BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
2507
2508 map.m_lblk = iblock;
2509 map.m_len = 1;
2541 2510
2542 /* 2511 /*
2543 * first, we need to know whether the block is allocated already 2512 * first, we need to know whether the block is allocated already
2544 * preallocated blocks are unmapped but should treated 2513 * preallocated blocks are unmapped but should treated
2545 * the same as allocated blocks. 2514 * the same as allocated blocks.
2546 */ 2515 */
2547 ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0); 2516 ret = ext4_map_blocks(NULL, inode, &map, 0);
2548 if ((ret == 0) && !buffer_delay(bh_result)) { 2517 if (ret < 0)
2549 /* the block isn't (pre)allocated yet, let's reserve space */ 2518 return ret;
2519 if (ret == 0) {
2520 if (buffer_delay(bh))
2521 return 0; /* Not sure this could or should happen */
2550 /* 2522 /*
2551 * XXX: __block_prepare_write() unmaps passed block, 2523 * XXX: __block_prepare_write() unmaps passed block,
2552 * is it OK? 2524 * is it OK?
@@ -2556,26 +2528,26 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2556 /* not enough space to reserve */ 2528 /* not enough space to reserve */
2557 return ret; 2529 return ret;
2558 2530
2559 map_bh(bh_result, inode->i_sb, invalid_block); 2531 map_bh(bh, inode->i_sb, invalid_block);
2560 set_buffer_new(bh_result); 2532 set_buffer_new(bh);
2561 set_buffer_delay(bh_result); 2533 set_buffer_delay(bh);
2562 } else if (ret > 0) { 2534 return 0;
2563 bh_result->b_size = (ret << inode->i_blkbits);
2564 if (buffer_unwritten(bh_result)) {
2565 /* A delayed write to unwritten bh should
2566 * be marked new and mapped. Mapped ensures
2567 * that we don't do get_block multiple times
2568 * when we write to the same offset and new
2569 * ensures that we do proper zero out for
2570 * partial write.
2571 */
2572 set_buffer_new(bh_result);
2573 set_buffer_mapped(bh_result);
2574 }
2575 ret = 0;
2576 } 2535 }
2577 2536
2578 return ret; 2537 map_bh(bh, inode->i_sb, map.m_pblk);
2538 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
2539
2540 if (buffer_unwritten(bh)) {
2541 /* A delayed write to unwritten bh should be marked
2542 * new and mapped. Mapped ensures that we don't do
2543 * get_block multiple times when we write to the same
2544 * offset and new ensures that we do proper zero out
2545 * for partial write.
2546 */
2547 set_buffer_new(bh);
2548 set_buffer_mapped(bh);
2549 }
2550 return 0;
2579} 2551}
2580 2552
2581/* 2553/*
@@ -2597,21 +2569,8 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2597static int noalloc_get_block_write(struct inode *inode, sector_t iblock, 2569static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2598 struct buffer_head *bh_result, int create) 2570 struct buffer_head *bh_result, int create)
2599{ 2571{
2600 int ret = 0;
2601 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2602
2603 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2572 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2604 2573 return _ext4_get_block(inode, iblock, bh_result, 0);
2605 /*
2606 * we don't want to do block allocation in writepage
2607 * so call get_block_wrap with create = 0
2608 */
2609 ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
2610 if (ret > 0) {
2611 bh_result->b_size = (ret << inode->i_blkbits);
2612 ret = 0;
2613 }
2614 return ret;
2615} 2574}
2616 2575
2617static int bget_one(handle_t *handle, struct buffer_head *bh) 2576static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -2821,13 +2780,131 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2821 * number of contiguous block. So we will limit 2780 * number of contiguous block. So we will limit
2822 * number of contiguous block to a sane value 2781 * number of contiguous block to a sane value
2823 */ 2782 */
2824 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && 2783 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
2825 (max_blocks > EXT4_MAX_TRANS_DATA)) 2784 (max_blocks > EXT4_MAX_TRANS_DATA))
2826 max_blocks = EXT4_MAX_TRANS_DATA; 2785 max_blocks = EXT4_MAX_TRANS_DATA;
2827 2786
2828 return ext4_chunk_trans_blocks(inode, max_blocks); 2787 return ext4_chunk_trans_blocks(inode, max_blocks);
2829} 2788}
2830 2789
2790/*
2791 * write_cache_pages_da - walk the list of dirty pages of the given
2792 * address space and call the callback function (which usually writes
2793 * the pages).
2794 *
2795 * This is a forked version of write_cache_pages(). Differences:
2796 * Range cyclic is ignored.
2797 * no_nrwrite_index_update is always presumed true
2798 */
2799static int write_cache_pages_da(struct address_space *mapping,
2800 struct writeback_control *wbc,
2801 struct mpage_da_data *mpd)
2802{
2803 int ret = 0;
2804 int done = 0;
2805 struct pagevec pvec;
2806 int nr_pages;
2807 pgoff_t index;
2808 pgoff_t end; /* Inclusive */
2809 long nr_to_write = wbc->nr_to_write;
2810
2811 pagevec_init(&pvec, 0);
2812 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2813 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2814
2815 while (!done && (index <= end)) {
2816 int i;
2817
2818 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2819 PAGECACHE_TAG_DIRTY,
2820 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2821 if (nr_pages == 0)
2822 break;
2823
2824 for (i = 0; i < nr_pages; i++) {
2825 struct page *page = pvec.pages[i];
2826
2827 /*
2828 * At this point, the page may be truncated or
2829 * invalidated (changing page->mapping to NULL), or
2830 * even swizzled back from swapper_space to tmpfs file
2831 * mapping. However, page->index will not change
2832 * because we have a reference on the page.
2833 */
2834 if (page->index > end) {
2835 done = 1;
2836 break;
2837 }
2838
2839 lock_page(page);
2840
2841 /*
2842 * Page truncated or invalidated. We can freely skip it
2843 * then, even for data integrity operations: the page
2844 * has disappeared concurrently, so there could be no
2845 * real expectation of this data interity operation
2846 * even if there is now a new, dirty page at the same
2847 * pagecache address.
2848 */
2849 if (unlikely(page->mapping != mapping)) {
2850continue_unlock:
2851 unlock_page(page);
2852 continue;
2853 }
2854
2855 if (!PageDirty(page)) {
2856 /* someone wrote it for us */
2857 goto continue_unlock;
2858 }
2859
2860 if (PageWriteback(page)) {
2861 if (wbc->sync_mode != WB_SYNC_NONE)
2862 wait_on_page_writeback(page);
2863 else
2864 goto continue_unlock;
2865 }
2866
2867 BUG_ON(PageWriteback(page));
2868 if (!clear_page_dirty_for_io(page))
2869 goto continue_unlock;
2870
2871 ret = __mpage_da_writepage(page, wbc, mpd);
2872 if (unlikely(ret)) {
2873 if (ret == AOP_WRITEPAGE_ACTIVATE) {
2874 unlock_page(page);
2875 ret = 0;
2876 } else {
2877 done = 1;
2878 break;
2879 }
2880 }
2881
2882 if (nr_to_write > 0) {
2883 nr_to_write--;
2884 if (nr_to_write == 0 &&
2885 wbc->sync_mode == WB_SYNC_NONE) {
2886 /*
2887 * We stop writing back only if we are
2888 * not doing integrity sync. In case of
2889 * integrity sync we have to keep going
2890 * because someone may be concurrently
2891 * dirtying pages, and we might have
2892 * synced a lot of newly appeared dirty
2893 * pages, but have not synced all of the
2894 * old dirty pages.
2895 */
2896 done = 1;
2897 break;
2898 }
2899 }
2900 }
2901 pagevec_release(&pvec);
2902 cond_resched();
2903 }
2904 return ret;
2905}
2906
2907
2831static int ext4_da_writepages(struct address_space *mapping, 2908static int ext4_da_writepages(struct address_space *mapping,
2832 struct writeback_control *wbc) 2909 struct writeback_control *wbc)
2833{ 2910{
@@ -2836,7 +2913,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2836 handle_t *handle = NULL; 2913 handle_t *handle = NULL;
2837 struct mpage_da_data mpd; 2914 struct mpage_da_data mpd;
2838 struct inode *inode = mapping->host; 2915 struct inode *inode = mapping->host;
2839 int no_nrwrite_index_update;
2840 int pages_written = 0; 2916 int pages_written = 0;
2841 long pages_skipped; 2917 long pages_skipped;
2842 unsigned int max_pages; 2918 unsigned int max_pages;
@@ -2916,12 +2992,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2916 mpd.wbc = wbc; 2992 mpd.wbc = wbc;
2917 mpd.inode = mapping->host; 2993 mpd.inode = mapping->host;
2918 2994
2919 /*
2920 * we don't want write_cache_pages to update
2921 * nr_to_write and writeback_index
2922 */
2923 no_nrwrite_index_update = wbc->no_nrwrite_index_update;
2924 wbc->no_nrwrite_index_update = 1;
2925 pages_skipped = wbc->pages_skipped; 2995 pages_skipped = wbc->pages_skipped;
2926 2996
2927retry: 2997retry:
@@ -2941,7 +3011,7 @@ retry:
2941 if (IS_ERR(handle)) { 3011 if (IS_ERR(handle)) {
2942 ret = PTR_ERR(handle); 3012 ret = PTR_ERR(handle);
2943 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 3013 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2944 "%ld pages, ino %lu; err %d\n", __func__, 3014 "%ld pages, ino %lu; err %d", __func__,
2945 wbc->nr_to_write, inode->i_ino, ret); 3015 wbc->nr_to_write, inode->i_ino, ret);
2946 goto out_writepages; 3016 goto out_writepages;
2947 } 3017 }
@@ -2963,8 +3033,7 @@ retry:
2963 mpd.io_done = 0; 3033 mpd.io_done = 0;
2964 mpd.pages_written = 0; 3034 mpd.pages_written = 0;
2965 mpd.retval = 0; 3035 mpd.retval = 0;
2966 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, 3036 ret = write_cache_pages_da(mapping, wbc, &mpd);
2967 &mpd);
2968 /* 3037 /*
2969 * If we have a contiguous extent of pages and we 3038 * If we have a contiguous extent of pages and we
2970 * haven't done the I/O yet, map the blocks and submit 3039 * haven't done the I/O yet, map the blocks and submit
@@ -3016,7 +3085,7 @@ retry:
3016 if (pages_skipped != wbc->pages_skipped) 3085 if (pages_skipped != wbc->pages_skipped)
3017 ext4_msg(inode->i_sb, KERN_CRIT, 3086 ext4_msg(inode->i_sb, KERN_CRIT,
3018 "This should not happen leaving %s " 3087 "This should not happen leaving %s "
3019 "with nr_to_write = %ld ret = %d\n", 3088 "with nr_to_write = %ld ret = %d",
3020 __func__, wbc->nr_to_write, ret); 3089 __func__, wbc->nr_to_write, ret);
3021 3090
3022 /* Update index */ 3091 /* Update index */
@@ -3030,8 +3099,6 @@ retry:
3030 mapping->writeback_index = index; 3099 mapping->writeback_index = index;
3031 3100
3032out_writepages: 3101out_writepages:
3033 if (!no_nrwrite_index_update)
3034 wbc->no_nrwrite_index_update = 0;
3035 wbc->nr_to_write -= nr_to_writebump; 3102 wbc->nr_to_write -= nr_to_writebump;
3036 wbc->range_start = range_start; 3103 wbc->range_start = range_start;
3037 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3104 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
@@ -3076,7 +3143,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3076 loff_t pos, unsigned len, unsigned flags, 3143 loff_t pos, unsigned len, unsigned flags,
3077 struct page **pagep, void **fsdata) 3144 struct page **pagep, void **fsdata)
3078{ 3145{
3079 int ret, retries = 0, quota_retries = 0; 3146 int ret, retries = 0;
3080 struct page *page; 3147 struct page *page;
3081 pgoff_t index; 3148 pgoff_t index;
3082 unsigned from, to; 3149 unsigned from, to;
@@ -3135,22 +3202,6 @@ retry:
3135 3202
3136 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3203 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3137 goto retry; 3204 goto retry;
3138
3139 if ((ret == -EDQUOT) &&
3140 EXT4_I(inode)->i_reserved_meta_blocks &&
3141 (quota_retries++ < 3)) {
3142 /*
3143 * Since we often over-estimate the number of meta
3144 * data blocks required, we may sometimes get a
3145 * spurios out of quota error even though there would
3146 * be enough space once we write the data blocks and
3147 * find out how many meta data blocks were _really_
3148 * required. So try forcing the inode write to see if
3149 * that helps.
3150 */
3151 write_inode_now(inode, (quota_retries == 3));
3152 goto retry;
3153 }
3154out: 3205out:
3155 return ret; 3206 return ret;
3156} 3207}
@@ -3546,46 +3597,18 @@ out:
3546 return ret; 3597 return ret;
3547} 3598}
3548 3599
3600/*
3601 * ext4_get_block used when preparing for a DIO write or buffer write.
3602 * We allocate an uinitialized extent if blocks haven't been allocated.
3603 * The extent will be converted to initialized after the IO is complete.
3604 */
3549static int ext4_get_block_write(struct inode *inode, sector_t iblock, 3605static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3550 struct buffer_head *bh_result, int create) 3606 struct buffer_head *bh_result, int create)
3551{ 3607{
3552 handle_t *handle = ext4_journal_current_handle();
3553 int ret = 0;
3554 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3555 int dio_credits;
3556 int started = 0;
3557
3558 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", 3608 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3559 inode->i_ino, create); 3609 inode->i_ino, create);
3560 /* 3610 return _ext4_get_block(inode, iblock, bh_result,
3561 * ext4_get_block in prepare for a DIO write or buffer write. 3611 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3562 * We allocate an uinitialized extent if blocks haven't been allocated.
3563 * The extent will be converted to initialized after IO complete.
3564 */
3565 create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
3566
3567 if (!handle) {
3568 if (max_blocks > DIO_MAX_BLOCKS)
3569 max_blocks = DIO_MAX_BLOCKS;
3570 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3571 handle = ext4_journal_start(inode, dio_credits);
3572 if (IS_ERR(handle)) {
3573 ret = PTR_ERR(handle);
3574 goto out;
3575 }
3576 started = 1;
3577 }
3578
3579 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3580 create);
3581 if (ret > 0) {
3582 bh_result->b_size = (ret << inode->i_blkbits);
3583 ret = 0;
3584 }
3585 if (started)
3586 ext4_journal_stop(handle);
3587out:
3588 return ret;
3589} 3612}
3590 3613
3591static void dump_completed_IO(struct inode * inode) 3614static void dump_completed_IO(struct inode * inode)
@@ -3973,7 +3996,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3973 struct file *file = iocb->ki_filp; 3996 struct file *file = iocb->ki_filp;
3974 struct inode *inode = file->f_mapping->host; 3997 struct inode *inode = file->f_mapping->host;
3975 3998
3976 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 3999 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3977 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 4000 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3978 4001
3979 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 4002 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -4302,10 +4325,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4302 4325
4303 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, 4326 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4304 count)) { 4327 count)) {
4305 ext4_error(inode->i_sb, "inode #%lu: " 4328 EXT4_ERROR_INODE(inode, "attempt to clear invalid "
4306 "attempt to clear blocks %llu len %lu, invalid", 4329 "blocks %llu len %lu",
4307 inode->i_ino, (unsigned long long) block_to_free, 4330 (unsigned long long) block_to_free, count);
4308 count);
4309 return 1; 4331 return 1;
4310 } 4332 }
4311 4333
@@ -4410,11 +4432,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4410 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 4432 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4411 ext4_handle_dirty_metadata(handle, inode, this_bh); 4433 ext4_handle_dirty_metadata(handle, inode, this_bh);
4412 else 4434 else
4413 ext4_error(inode->i_sb, 4435 EXT4_ERROR_INODE(inode,
4414 "circular indirect block detected, " 4436 "circular indirect block detected at "
4415 "inode=%lu, block=%llu", 4437 "block %llu",
4416 inode->i_ino, 4438 (unsigned long long) this_bh->b_blocknr);
4417 (unsigned long long) this_bh->b_blocknr);
4418 } 4439 }
4419} 4440}
4420 4441
@@ -4452,11 +4473,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4452 4473
4453 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), 4474 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4454 nr, 1)) { 4475 nr, 1)) {
4455 ext4_error(inode->i_sb, 4476 EXT4_ERROR_INODE(inode,
4456 "indirect mapped block in inode " 4477 "invalid indirect mapped "
4457 "#%lu invalid (level %d, blk #%lu)", 4478 "block %lu (level %d)",
4458 inode->i_ino, depth, 4479 (unsigned long) nr, depth);
4459 (unsigned long) nr);
4460 break; 4480 break;
4461 } 4481 }
4462 4482
@@ -4468,9 +4488,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4468 * (should be rare). 4488 * (should be rare).
4469 */ 4489 */
4470 if (!bh) { 4490 if (!bh) {
4471 ext4_error(inode->i_sb, 4491 EXT4_ERROR_INODE(inode,
4472 "Read failure, inode=%lu, block=%llu", 4492 "Read failure block=%llu",
4473 inode->i_ino, nr); 4493 (unsigned long long) nr);
4474 continue; 4494 continue;
4475 } 4495 }
4476 4496
@@ -4612,12 +4632,12 @@ void ext4_truncate(struct inode *inode)
4612 if (!ext4_can_truncate(inode)) 4632 if (!ext4_can_truncate(inode))
4613 return; 4633 return;
4614 4634
4615 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; 4635 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4616 4636
4617 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4637 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4618 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 4638 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4619 4639
4620 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 4640 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4621 ext4_ext_truncate(inode); 4641 ext4_ext_truncate(inode);
4622 return; 4642 return;
4623 } 4643 }
@@ -4785,8 +4805,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
4785 4805
4786 bh = sb_getblk(sb, block); 4806 bh = sb_getblk(sb, block);
4787 if (!bh) { 4807 if (!bh) {
4788 ext4_error(sb, "unable to read inode block - " 4808 EXT4_ERROR_INODE(inode, "unable to read inode block - "
4789 "inode=%lu, block=%llu", inode->i_ino, block); 4809 "block %llu", block);
4790 return -EIO; 4810 return -EIO;
4791 } 4811 }
4792 if (!buffer_uptodate(bh)) { 4812 if (!buffer_uptodate(bh)) {
@@ -4884,8 +4904,8 @@ make_io:
4884 submit_bh(READ_META, bh); 4904 submit_bh(READ_META, bh);
4885 wait_on_buffer(bh); 4905 wait_on_buffer(bh);
4886 if (!buffer_uptodate(bh)) { 4906 if (!buffer_uptodate(bh)) {
4887 ext4_error(sb, "unable to read inode block - inode=%lu," 4907 EXT4_ERROR_INODE(inode, "unable to read inode "
4888 " block=%llu", inode->i_ino, block); 4908 "block %llu", block);
4889 brelse(bh); 4909 brelse(bh);
4890 return -EIO; 4910 return -EIO;
4891 } 4911 }
@@ -5096,8 +5116,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5096 ret = 0; 5116 ret = 0;
5097 if (ei->i_file_acl && 5117 if (ei->i_file_acl &&
5098 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 5118 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
5099 ext4_error(sb, "bad extended attribute block %llu inode #%lu", 5119 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
5100 ei->i_file_acl, inode->i_ino); 5120 ei->i_file_acl);
5101 ret = -EIO; 5121 ret = -EIO;
5102 goto bad_inode; 5122 goto bad_inode;
5103 } else if (ei->i_flags & EXT4_EXTENTS_FL) { 5123 } else if (ei->i_flags & EXT4_EXTENTS_FL) {
@@ -5142,8 +5162,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5142 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5162 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
5143 } else { 5163 } else {
5144 ret = -EIO; 5164 ret = -EIO;
5145 ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu", 5165 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
5146 inode->i_mode, inode->i_ino);
5147 goto bad_inode; 5166 goto bad_inode;
5148 } 5167 }
5149 brelse(iloc.bh); 5168 brelse(iloc.bh);
@@ -5381,9 +5400,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5381 if (wbc->sync_mode == WB_SYNC_ALL) 5400 if (wbc->sync_mode == WB_SYNC_ALL)
5382 sync_dirty_buffer(iloc.bh); 5401 sync_dirty_buffer(iloc.bh);
5383 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5402 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5384 ext4_error(inode->i_sb, "IO error syncing inode, " 5403 EXT4_ERROR_INODE(inode,
5385 "inode=%lu, block=%llu", inode->i_ino, 5404 "IO error syncing inode (block=%llu)",
5386 (unsigned long long)iloc.bh->b_blocknr); 5405 (unsigned long long) iloc.bh->b_blocknr);
5387 err = -EIO; 5406 err = -EIO;
5388 } 5407 }
5389 brelse(iloc.bh); 5408 brelse(iloc.bh);
@@ -5455,7 +5474,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5455 } 5474 }
5456 5475
5457 if (attr->ia_valid & ATTR_SIZE) { 5476 if (attr->ia_valid & ATTR_SIZE) {
5458 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 5477 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
5459 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5478 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5460 5479
5461 if (attr->ia_size > sbi->s_bitmap_maxbytes) { 5480 if (attr->ia_size > sbi->s_bitmap_maxbytes) {
@@ -5468,7 +5487,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5468 if (S_ISREG(inode->i_mode) && 5487 if (S_ISREG(inode->i_mode) &&
5469 attr->ia_valid & ATTR_SIZE && 5488 attr->ia_valid & ATTR_SIZE &&
5470 (attr->ia_size < inode->i_size || 5489 (attr->ia_size < inode->i_size ||
5471 (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) { 5490 (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5472 handle_t *handle; 5491 handle_t *handle;
5473 5492
5474 handle = ext4_journal_start(inode, 3); 5493 handle = ext4_journal_start(inode, 3);
@@ -5500,7 +5519,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5500 } 5519 }
5501 } 5520 }
5502 /* ext4_truncate will clear the flag */ 5521 /* ext4_truncate will clear the flag */
5503 if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) 5522 if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5504 ext4_truncate(inode); 5523 ext4_truncate(inode);
5505 } 5524 }
5506 5525
@@ -5576,7 +5595,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5576 5595
5577static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5596static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5578{ 5597{
5579 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 5598 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5580 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 5599 return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
5581 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 5600 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
5582} 5601}
@@ -5911,9 +5930,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
5911 */ 5930 */
5912 5931
5913 if (val) 5932 if (val)
5914 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; 5933 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5915 else 5934 else
5916 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; 5935 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5917 ext4_set_aops(inode); 5936 ext4_set_aops(inode);
5918 5937
5919 jbd2_journal_unlock_updates(journal); 5938 jbd2_journal_unlock_updates(journal);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 016d0249294f..bf5ae883b1bd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -258,7 +258,7 @@ setversion_out:
258 if (me.moved_len > 0) 258 if (me.moved_len > 0)
259 file_remove_suid(donor_filp); 259 file_remove_suid(donor_filp);
260 260
261 if (copy_to_user((struct move_extent __user *)arg, 261 if (copy_to_user((struct move_extent __user *)arg,
262 &me, sizeof(me))) 262 &me, sizeof(me)))
263 err = -EFAULT; 263 err = -EFAULT;
264mext_out: 264mext_out:
@@ -373,7 +373,30 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
373 case EXT4_IOC32_SETRSVSZ: 373 case EXT4_IOC32_SETRSVSZ:
374 cmd = EXT4_IOC_SETRSVSZ; 374 cmd = EXT4_IOC_SETRSVSZ;
375 break; 375 break;
376 case EXT4_IOC_GROUP_ADD: 376 case EXT4_IOC32_GROUP_ADD: {
377 struct compat_ext4_new_group_input __user *uinput;
378 struct ext4_new_group_input input;
379 mm_segment_t old_fs;
380 int err;
381
382 uinput = compat_ptr(arg);
383 err = get_user(input.group, &uinput->group);
384 err |= get_user(input.block_bitmap, &uinput->block_bitmap);
385 err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
386 err |= get_user(input.inode_table, &uinput->inode_table);
387 err |= get_user(input.blocks_count, &uinput->blocks_count);
388 err |= get_user(input.reserved_blocks,
389 &uinput->reserved_blocks);
390 if (err)
391 return -EFAULT;
392 old_fs = get_fs();
393 set_fs(KERNEL_DS);
394 err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
395 (unsigned long) &input);
396 set_fs(old_fs);
397 return err;
398 }
399 case EXT4_IOC_MOVE_EXT:
377 break; 400 break;
378 default: 401 default:
379 return -ENOIOCTLCMD; 402 return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b423a364dca3..12b3bc026a68 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
658 } 658 }
659} 659}
660 660
661/*
662 * Cache the order of the largest free extent we have available in this block
663 * group.
664 */
665static void
666mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
667{
668 int i;
669 int bits;
670
671 grp->bb_largest_free_order = -1; /* uninit */
672
673 bits = sb->s_blocksize_bits + 1;
674 for (i = bits; i >= 0; i--) {
675 if (grp->bb_counters[i] > 0) {
676 grp->bb_largest_free_order = i;
677 break;
678 }
679 }
680}
681
661static noinline_for_stack 682static noinline_for_stack
662void ext4_mb_generate_buddy(struct super_block *sb, 683void ext4_mb_generate_buddy(struct super_block *sb,
663 void *buddy, void *bitmap, ext4_group_t group) 684 void *buddy, void *bitmap, ext4_group_t group)
@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
700 */ 721 */
701 grp->bb_free = free; 722 grp->bb_free = free;
702 } 723 }
724 mb_set_largest_free_order(sb, grp);
703 725
704 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 726 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
705 727
@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
725 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. 747 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
726 * So it can have information regarding groups_per_page which 748 * So it can have information regarding groups_per_page which
727 * is blocks_per_page/2 749 * is blocks_per_page/2
750 *
751 * Locking note: This routine takes the block group lock of all groups
752 * for this page; do not hold this lock when calling this routine!
728 */ 753 */
729 754
730static int ext4_mb_init_cache(struct page *page, char *incore) 755static int ext4_mb_init_cache(struct page *page, char *incore)
@@ -865,6 +890,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
865 BUG_ON(incore == NULL); 890 BUG_ON(incore == NULL);
866 mb_debug(1, "put buddy for group %u in page %lu/%x\n", 891 mb_debug(1, "put buddy for group %u in page %lu/%x\n",
867 group, page->index, i * blocksize); 892 group, page->index, i * blocksize);
893 trace_ext4_mb_buddy_bitmap_load(sb, group);
868 grinfo = ext4_get_group_info(sb, group); 894 grinfo = ext4_get_group_info(sb, group);
869 grinfo->bb_fragments = 0; 895 grinfo->bb_fragments = 0;
870 memset(grinfo->bb_counters, 0, 896 memset(grinfo->bb_counters, 0,
@@ -882,6 +908,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
882 BUG_ON(incore != NULL); 908 BUG_ON(incore != NULL);
883 mb_debug(1, "put bitmap for group %u in page %lu/%x\n", 909 mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
884 group, page->index, i * blocksize); 910 group, page->index, i * blocksize);
911 trace_ext4_mb_bitmap_load(sb, group);
885 912
886 /* see comments in ext4_mb_put_pa() */ 913 /* see comments in ext4_mb_put_pa() */
887 ext4_lock_group(sb, group); 914 ext4_lock_group(sb, group);
@@ -910,6 +937,11 @@ out:
910 return err; 937 return err;
911} 938}
912 939
940/*
941 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
942 * block group lock of all groups for this page; do not hold the BG lock when
943 * calling this routine!
944 */
913static noinline_for_stack 945static noinline_for_stack
914int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 946int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
915{ 947{
@@ -1004,6 +1036,11 @@ err:
1004 return ret; 1036 return ret;
1005} 1037}
1006 1038
1039/*
1040 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
1041 * block group lock of all groups for this page; do not hold the BG lock when
1042 * calling this routine!
1043 */
1007static noinline_for_stack int 1044static noinline_for_stack int
1008ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1045ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1009 struct ext4_buddy *e4b) 1046 struct ext4_buddy *e4b)
@@ -1150,7 +1187,7 @@ err:
1150 return ret; 1187 return ret;
1151} 1188}
1152 1189
1153static void ext4_mb_release_desc(struct ext4_buddy *e4b) 1190static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1154{ 1191{
1155 if (e4b->bd_bitmap_page) 1192 if (e4b->bd_bitmap_page)
1156 page_cache_release(e4b->bd_bitmap_page); 1193 page_cache_release(e4b->bd_bitmap_page);
@@ -1299,6 +1336,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1299 buddy = buddy2; 1336 buddy = buddy2;
1300 } while (1); 1337 } while (1);
1301 } 1338 }
1339 mb_set_largest_free_order(sb, e4b->bd_info);
1302 mb_check_buddy(e4b); 1340 mb_check_buddy(e4b);
1303} 1341}
1304 1342
@@ -1427,6 +1465,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1427 e4b->bd_info->bb_counters[ord]++; 1465 e4b->bd_info->bb_counters[ord]++;
1428 e4b->bd_info->bb_counters[ord]++; 1466 e4b->bd_info->bb_counters[ord]++;
1429 } 1467 }
1468 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1430 1469
1431 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); 1470 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1432 mb_check_buddy(e4b); 1471 mb_check_buddy(e4b);
@@ -1617,7 +1656,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1617 } 1656 }
1618 1657
1619 ext4_unlock_group(ac->ac_sb, group); 1658 ext4_unlock_group(ac->ac_sb, group);
1620 ext4_mb_release_desc(e4b); 1659 ext4_mb_unload_buddy(e4b);
1621 1660
1622 return 0; 1661 return 0;
1623} 1662}
@@ -1672,7 +1711,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1672 ext4_mb_use_best_found(ac, e4b); 1711 ext4_mb_use_best_found(ac, e4b);
1673 } 1712 }
1674 ext4_unlock_group(ac->ac_sb, group); 1713 ext4_unlock_group(ac->ac_sb, group);
1675 ext4_mb_release_desc(e4b); 1714 ext4_mb_unload_buddy(e4b);
1676 1715
1677 return 0; 1716 return 0;
1678} 1717}
@@ -1821,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1821 } 1860 }
1822} 1861}
1823 1862
1863/* This is now called BEFORE we load the buddy bitmap. */
1824static int ext4_mb_good_group(struct ext4_allocation_context *ac, 1864static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1825 ext4_group_t group, int cr) 1865 ext4_group_t group, int cr)
1826{ 1866{
1827 unsigned free, fragments; 1867 unsigned free, fragments;
1828 unsigned i, bits;
1829 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); 1868 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
1830 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1869 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1831 1870
1832 BUG_ON(cr < 0 || cr >= 4); 1871 BUG_ON(cr < 0 || cr >= 4);
1833 BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); 1872
1873 /* We only do this if the grp has never been initialized */
1874 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1875 int ret = ext4_mb_init_group(ac->ac_sb, group);
1876 if (ret)
1877 return 0;
1878 }
1834 1879
1835 free = grp->bb_free; 1880 free = grp->bb_free;
1836 fragments = grp->bb_fragments; 1881 fragments = grp->bb_fragments;
@@ -1843,17 +1888,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1843 case 0: 1888 case 0:
1844 BUG_ON(ac->ac_2order == 0); 1889 BUG_ON(ac->ac_2order == 0);
1845 1890
1891 if (grp->bb_largest_free_order < ac->ac_2order)
1892 return 0;
1893
1846 /* Avoid using the first bg of a flexgroup for data files */ 1894 /* Avoid using the first bg of a flexgroup for data files */
1847 if ((ac->ac_flags & EXT4_MB_HINT_DATA) && 1895 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
1848 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && 1896 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
1849 ((group % flex_size) == 0)) 1897 ((group % flex_size) == 0))
1850 return 0; 1898 return 0;
1851 1899
1852 bits = ac->ac_sb->s_blocksize_bits + 1; 1900 return 1;
1853 for (i = ac->ac_2order; i <= bits; i++)
1854 if (grp->bb_counters[i] > 0)
1855 return 1;
1856 break;
1857 case 1: 1901 case 1:
1858 if ((free / fragments) >= ac->ac_g_ex.fe_len) 1902 if ((free / fragments) >= ac->ac_g_ex.fe_len)
1859 return 1; 1903 return 1;
@@ -1964,7 +2008,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1964 sbi = EXT4_SB(sb); 2008 sbi = EXT4_SB(sb);
1965 ngroups = ext4_get_groups_count(sb); 2009 ngroups = ext4_get_groups_count(sb);
1966 /* non-extent files are limited to low blocks/groups */ 2010 /* non-extent files are limited to low blocks/groups */
1967 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL)) 2011 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
1968 ngroups = sbi->s_blockfile_groups; 2012 ngroups = sbi->s_blockfile_groups;
1969 2013
1970 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 2014 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
@@ -2024,15 +2068,11 @@ repeat:
2024 group = ac->ac_g_ex.fe_group; 2068 group = ac->ac_g_ex.fe_group;
2025 2069
2026 for (i = 0; i < ngroups; group++, i++) { 2070 for (i = 0; i < ngroups; group++, i++) {
2027 struct ext4_group_info *grp;
2028 struct ext4_group_desc *desc;
2029
2030 if (group == ngroups) 2071 if (group == ngroups)
2031 group = 0; 2072 group = 0;
2032 2073
2033 /* quick check to skip empty groups */ 2074 /* This now checks without needing the buddy page */
2034 grp = ext4_get_group_info(sb, group); 2075 if (!ext4_mb_good_group(ac, group, cr))
2035 if (grp->bb_free == 0)
2036 continue; 2076 continue;
2037 2077
2038 err = ext4_mb_load_buddy(sb, group, &e4b); 2078 err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2040,15 +2080,18 @@ repeat:
2040 goto out; 2080 goto out;
2041 2081
2042 ext4_lock_group(sb, group); 2082 ext4_lock_group(sb, group);
2083
2084 /*
2085 * We need to check again after locking the
2086 * block group
2087 */
2043 if (!ext4_mb_good_group(ac, group, cr)) { 2088 if (!ext4_mb_good_group(ac, group, cr)) {
2044 /* someone did allocation from this group */
2045 ext4_unlock_group(sb, group); 2089 ext4_unlock_group(sb, group);
2046 ext4_mb_release_desc(&e4b); 2090 ext4_mb_unload_buddy(&e4b);
2047 continue; 2091 continue;
2048 } 2092 }
2049 2093
2050 ac->ac_groups_scanned++; 2094 ac->ac_groups_scanned++;
2051 desc = ext4_get_group_desc(sb, group, NULL);
2052 if (cr == 0) 2095 if (cr == 0)
2053 ext4_mb_simple_scan_group(ac, &e4b); 2096 ext4_mb_simple_scan_group(ac, &e4b);
2054 else if (cr == 1 && 2097 else if (cr == 1 &&
@@ -2058,7 +2101,7 @@ repeat:
2058 ext4_mb_complex_scan_group(ac, &e4b); 2101 ext4_mb_complex_scan_group(ac, &e4b);
2059 2102
2060 ext4_unlock_group(sb, group); 2103 ext4_unlock_group(sb, group);
2061 ext4_mb_release_desc(&e4b); 2104 ext4_mb_unload_buddy(&e4b);
2062 2105
2063 if (ac->ac_status != AC_STATUS_CONTINUE) 2106 if (ac->ac_status != AC_STATUS_CONTINUE)
2064 break; 2107 break;
@@ -2148,7 +2191,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2148 ext4_lock_group(sb, group); 2191 ext4_lock_group(sb, group);
2149 memcpy(&sg, ext4_get_group_info(sb, group), i); 2192 memcpy(&sg, ext4_get_group_info(sb, group), i);
2150 ext4_unlock_group(sb, group); 2193 ext4_unlock_group(sb, group);
2151 ext4_mb_release_desc(&e4b); 2194 ext4_mb_unload_buddy(&e4b);
2152 2195
2153 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, 2196 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2154 sg.info.bb_fragments, sg.info.bb_first_free); 2197 sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2255,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2255 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2298 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2256 init_rwsem(&meta_group_info[i]->alloc_sem); 2299 init_rwsem(&meta_group_info[i]->alloc_sem);
2257 meta_group_info[i]->bb_free_root = RB_ROOT; 2300 meta_group_info[i]->bb_free_root = RB_ROOT;
2301 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
2258 2302
2259#ifdef DOUBLE_CHECK 2303#ifdef DOUBLE_CHECK
2260 { 2304 {
@@ -2536,6 +2580,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2536 entry->count, entry->group, entry); 2580 entry->count, entry->group, entry);
2537 2581
2538 if (test_opt(sb, DISCARD)) { 2582 if (test_opt(sb, DISCARD)) {
2583 int ret;
2539 ext4_fsblk_t discard_block; 2584 ext4_fsblk_t discard_block;
2540 2585
2541 discard_block = entry->start_blk + 2586 discard_block = entry->start_blk +
@@ -2543,7 +2588,12 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2543 trace_ext4_discard_blocks(sb, 2588 trace_ext4_discard_blocks(sb,
2544 (unsigned long long)discard_block, 2589 (unsigned long long)discard_block,
2545 entry->count); 2590 entry->count);
2546 sb_issue_discard(sb, discard_block, entry->count); 2591 ret = sb_issue_discard(sb, discard_block, entry->count);
2592 if (ret == EOPNOTSUPP) {
2593 ext4_warning(sb,
2594 "discard not supported, disabling");
2595 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
2596 }
2547 } 2597 }
2548 2598
2549 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2599 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2568,7 +2618,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2568 } 2618 }
2569 ext4_unlock_group(sb, entry->group); 2619 ext4_unlock_group(sb, entry->group);
2570 kmem_cache_free(ext4_free_ext_cachep, entry); 2620 kmem_cache_free(ext4_free_ext_cachep, entry);
2571 ext4_mb_release_desc(&e4b); 2621 ext4_mb_unload_buddy(&e4b);
2572 } 2622 }
2573 2623
2574 mb_debug(1, "freed %u blocks in %u structures\n", count, count2); 2624 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
@@ -2641,7 +2691,7 @@ int __init init_ext4_mballoc(void)
2641 2691
2642void exit_ext4_mballoc(void) 2692void exit_ext4_mballoc(void)
2643{ 2693{
2644 /* 2694 /*
2645 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2695 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2646 * before destroying the slab cache. 2696 * before destroying the slab cache.
2647 */ 2697 */
@@ -2981,7 +3031,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
2981 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { 3031 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
2982 atomic_inc(&sbi->s_bal_reqs); 3032 atomic_inc(&sbi->s_bal_reqs);
2983 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); 3033 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
2984 if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len) 3034 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
2985 atomic_inc(&sbi->s_bal_success); 3035 atomic_inc(&sbi->s_bal_success);
2986 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); 3036 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
2987 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 3037 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
@@ -3123,7 +3173,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3123 continue; 3173 continue;
3124 3174
3125 /* non-extent files can't have physical blocks past 2^32 */ 3175 /* non-extent files can't have physical blocks past 2^32 */
3126 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) && 3176 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3127 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) 3177 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
3128 continue; 3178 continue;
3129 3179
@@ -3280,7 +3330,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3280 spin_unlock(&pa->pa_lock); 3330 spin_unlock(&pa->pa_lock);
3281 3331
3282 grp_blk = pa->pa_pstart; 3332 grp_blk = pa->pa_pstart;
3283 /* 3333 /*
3284 * If doing group-based preallocation, pa_pstart may be in the 3334 * If doing group-based preallocation, pa_pstart may be in the
3285 * next group when pa is used up 3335 * next group when pa is used up
3286 */ 3336 */
@@ -3697,7 +3747,7 @@ out:
3697 ext4_unlock_group(sb, group); 3747 ext4_unlock_group(sb, group);
3698 if (ac) 3748 if (ac)
3699 kmem_cache_free(ext4_ac_cachep, ac); 3749 kmem_cache_free(ext4_ac_cachep, ac);
3700 ext4_mb_release_desc(&e4b); 3750 ext4_mb_unload_buddy(&e4b);
3701 put_bh(bitmap_bh); 3751 put_bh(bitmap_bh);
3702 return free; 3752 return free;
3703} 3753}
@@ -3801,7 +3851,7 @@ repeat:
3801 if (bitmap_bh == NULL) { 3851 if (bitmap_bh == NULL) {
3802 ext4_error(sb, "Error reading block bitmap for %u", 3852 ext4_error(sb, "Error reading block bitmap for %u",
3803 group); 3853 group);
3804 ext4_mb_release_desc(&e4b); 3854 ext4_mb_unload_buddy(&e4b);
3805 continue; 3855 continue;
3806 } 3856 }
3807 3857
@@ -3810,7 +3860,7 @@ repeat:
3810 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3860 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
3811 ext4_unlock_group(sb, group); 3861 ext4_unlock_group(sb, group);
3812 3862
3813 ext4_mb_release_desc(&e4b); 3863 ext4_mb_unload_buddy(&e4b);
3814 put_bh(bitmap_bh); 3864 put_bh(bitmap_bh);
3815 3865
3816 list_del(&pa->u.pa_tmp_list); 3866 list_del(&pa->u.pa_tmp_list);
@@ -4074,7 +4124,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4074 ext4_mb_release_group_pa(&e4b, pa, ac); 4124 ext4_mb_release_group_pa(&e4b, pa, ac);
4075 ext4_unlock_group(sb, group); 4125 ext4_unlock_group(sb, group);
4076 4126
4077 ext4_mb_release_desc(&e4b); 4127 ext4_mb_unload_buddy(&e4b);
4078 list_del(&pa->u.pa_tmp_list); 4128 list_del(&pa->u.pa_tmp_list);
4079 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4129 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4080 } 4130 }
@@ -4484,12 +4534,12 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4484 if (!bh) 4534 if (!bh)
4485 tbh = sb_find_get_block(inode->i_sb, 4535 tbh = sb_find_get_block(inode->i_sb,
4486 block + i); 4536 block + i);
4487 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4537 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4488 inode, tbh, block + i); 4538 inode, tbh, block + i);
4489 } 4539 }
4490 } 4540 }
4491 4541
4492 /* 4542 /*
4493 * We need to make sure we don't reuse the freed block until 4543 * We need to make sure we don't reuse the freed block until
4494 * after the transaction is committed, which we can do by 4544 * after the transaction is committed, which we can do by
4495 * treating the block as metadata, below. We make an 4545 * treating the block as metadata, below. We make an
@@ -4610,7 +4660,7 @@ do_more:
4610 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); 4660 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
4611 } 4661 }
4612 4662
4613 ext4_mb_release_desc(&e4b); 4663 ext4_mb_unload_buddy(&e4b);
4614 4664
4615 freed += count; 4665 freed += count;
4616 4666
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 34dcfc52ef44..6f3a27ec30bf 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode)
475 */ 475 */
476 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, 476 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
477 EXT4_FEATURE_INCOMPAT_EXTENTS) || 477 EXT4_FEATURE_INCOMPAT_EXTENTS) ||
478 (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 478 (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
479 return -EINVAL; 479 return -EINVAL;
480 480
481 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) 481 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index d1fc662cc311..3a6c92ac131c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -482,6 +482,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
482 int depth = ext_depth(orig_inode); 482 int depth = ext_depth(orig_inode);
483 int ret; 483 int ret;
484 484
485 start_ext.ee_block = end_ext.ee_block = 0;
485 o_start = o_end = oext = orig_path[depth].p_ext; 486 o_start = o_end = oext = orig_path[depth].p_ext;
486 oext_alen = ext4_ext_get_actual_len(oext); 487 oext_alen = ext4_ext_get_actual_len(oext);
487 start_ext.ee_len = end_ext.ee_len = 0; 488 start_ext.ee_len = end_ext.ee_len = 0;
@@ -529,7 +530,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
529 * new_ext |-------| 530 * new_ext |-------|
530 */ 531 */
531 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { 532 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
532 ext4_error(orig_inode->i_sb, 533 EXT4_ERROR_INODE(orig_inode,
533 "new_ext_end(%u) should be less than or equal to " 534 "new_ext_end(%u) should be less than or equal to "
534 "oext->ee_block(%u) + oext_alen(%d) - 1", 535 "oext->ee_block(%u) + oext_alen(%d) - 1",
535 new_ext_end, le32_to_cpu(oext->ee_block), 536 new_ext_end, le32_to_cpu(oext->ee_block),
@@ -692,12 +693,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
692 while (1) { 693 while (1) {
693 /* The extent for donor must be found. */ 694 /* The extent for donor must be found. */
694 if (!dext) { 695 if (!dext) {
695 ext4_error(donor_inode->i_sb, 696 EXT4_ERROR_INODE(donor_inode,
696 "The extent for donor must be found"); 697 "The extent for donor must be found");
697 *err = -EIO; 698 *err = -EIO;
698 goto out; 699 goto out;
699 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { 700 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
700 ext4_error(donor_inode->i_sb, 701 EXT4_ERROR_INODE(donor_inode,
701 "Donor offset(%u) and the first block of donor " 702 "Donor offset(%u) and the first block of donor "
702 "extent(%u) should be equal", 703 "extent(%u) should be equal",
703 donor_off, 704 donor_off,
@@ -976,11 +977,11 @@ mext_check_arguments(struct inode *orig_inode,
976 } 977 }
977 978
978 /* Ext4 move extent supports only extent based file */ 979 /* Ext4 move extent supports only extent based file */
979 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { 980 if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
980 ext4_debug("ext4 move extent: orig file is not extents " 981 ext4_debug("ext4 move extent: orig file is not extents "
981 "based file [ino:orig %lu]\n", orig_inode->i_ino); 982 "based file [ino:orig %lu]\n", orig_inode->i_ino);
982 return -EOPNOTSUPP; 983 return -EOPNOTSUPP;
983 } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) { 984 } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
984 ext4_debug("ext4 move extent: donor file is not extents " 985 ext4_debug("ext4 move extent: donor file is not extents "
985 "based file [ino:donor %lu]\n", donor_inode->i_ino); 986 "based file [ino:donor %lu]\n", donor_inode->i_ino);
986 return -EOPNOTSUPP; 987 return -EOPNOTSUPP;
@@ -1354,7 +1355,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1354 if (ret1 < 0) 1355 if (ret1 < 0)
1355 break; 1356 break;
1356 if (*moved_len > len) { 1357 if (*moved_len > len) {
1357 ext4_error(orig_inode->i_sb, 1358 EXT4_ERROR_INODE(orig_inode,
1358 "We replaced blocks too much! " 1359 "We replaced blocks too much! "
1359 "sum of replaced: %llu requested: %llu", 1360 "sum of replaced: %llu requested: %llu",
1360 *moved_len, len); 1361 *moved_len, len);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 0c070fabd108..a43e6617b351 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -187,7 +187,7 @@ unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
187 return blocksize; 187 return blocksize;
188 return (len & 65532) | ((len & 3) << 16); 188 return (len & 65532) | ((len & 3) << 16);
189} 189}
190 190
191__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) 191__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
192{ 192{
193 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) 193 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
@@ -197,7 +197,7 @@ __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
197 if (len == blocksize) { 197 if (len == blocksize) {
198 if (blocksize == 65536) 198 if (blocksize == 65536)
199 return cpu_to_le16(EXT4_MAX_REC_LEN); 199 return cpu_to_le16(EXT4_MAX_REC_LEN);
200 else 200 else
201 return cpu_to_le16(0); 201 return cpu_to_le16(0);
202 } 202 }
203 return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); 203 return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
@@ -349,7 +349,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
349 brelse(bh); 349 brelse(bh);
350 } 350 }
351 if (bcount) 351 if (bcount)
352 printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", 352 printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
353 levels ? "" : " ", names, space/bcount, 353 levels ? "" : " ", names, space/bcount,
354 (space/bcount)*100/blocksize); 354 (space/bcount)*100/blocksize);
355 return (struct stats) { names, space, bcount}; 355 return (struct stats) { names, space, bcount};
@@ -653,10 +653,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
653 int ret, err; 653 int ret, err;
654 __u32 hashval; 654 __u32 hashval;
655 655
656 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", 656 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
657 start_hash, start_minor_hash)); 657 start_hash, start_minor_hash));
658 dir = dir_file->f_path.dentry->d_inode; 658 dir = dir_file->f_path.dentry->d_inode;
659 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { 659 if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
660 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 660 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
661 if (hinfo.hash_version <= DX_HASH_TEA) 661 if (hinfo.hash_version <= DX_HASH_TEA)
662 hinfo.hash_version += 662 hinfo.hash_version +=
@@ -801,7 +801,7 @@ static void ext4_update_dx_flag(struct inode *inode)
801{ 801{
802 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 802 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
803 EXT4_FEATURE_COMPAT_DIR_INDEX)) 803 EXT4_FEATURE_COMPAT_DIR_INDEX))
804 EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL; 804 ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
805} 805}
806 806
807/* 807/*
@@ -943,8 +943,8 @@ restart:
943 wait_on_buffer(bh); 943 wait_on_buffer(bh);
944 if (!buffer_uptodate(bh)) { 944 if (!buffer_uptodate(bh)) {
945 /* read error, skip block & hope for the best */ 945 /* read error, skip block & hope for the best */
946 ext4_error(sb, "reading directory #%lu offset %lu", 946 EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
947 dir->i_ino, (unsigned long)block); 947 (unsigned long) block);
948 brelse(bh); 948 brelse(bh);
949 goto next; 949 goto next;
950 } 950 }
@@ -1066,15 +1066,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1066 __u32 ino = le32_to_cpu(de->inode); 1066 __u32 ino = le32_to_cpu(de->inode);
1067 brelse(bh); 1067 brelse(bh);
1068 if (!ext4_valid_inum(dir->i_sb, ino)) { 1068 if (!ext4_valid_inum(dir->i_sb, ino)) {
1069 ext4_error(dir->i_sb, "bad inode number: %u", ino); 1069 EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
1070 return ERR_PTR(-EIO); 1070 return ERR_PTR(-EIO);
1071 } 1071 }
1072 inode = ext4_iget(dir->i_sb, ino); 1072 inode = ext4_iget(dir->i_sb, ino);
1073 if (unlikely(IS_ERR(inode))) { 1073 if (unlikely(IS_ERR(inode))) {
1074 if (PTR_ERR(inode) == -ESTALE) { 1074 if (PTR_ERR(inode) == -ESTALE) {
1075 ext4_error(dir->i_sb, 1075 EXT4_ERROR_INODE(dir,
1076 "deleted inode referenced: %u", 1076 "deleted inode referenced: %u",
1077 ino); 1077 ino);
1078 return ERR_PTR(-EIO); 1078 return ERR_PTR(-EIO);
1079 } else { 1079 } else {
1080 return ERR_CAST(inode); 1080 return ERR_CAST(inode);
@@ -1104,8 +1104,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
1104 brelse(bh); 1104 brelse(bh);
1105 1105
1106 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { 1106 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1107 ext4_error(child->d_inode->i_sb, 1107 EXT4_ERROR_INODE(child->d_inode,
1108 "bad inode number: %u", ino); 1108 "bad parent inode number: %u", ino);
1109 return ERR_PTR(-EIO); 1109 return ERR_PTR(-EIO);
1110 } 1110 }
1111 1111
@@ -1141,7 +1141,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
1141 unsigned rec_len = 0; 1141 unsigned rec_len = 0;
1142 1142
1143 while (count--) { 1143 while (count--) {
1144 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) 1144 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
1145 (from + (map->offs<<2)); 1145 (from + (map->offs<<2));
1146 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1146 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1147 memcpy (to, de, rec_len); 1147 memcpy (to, de, rec_len);
@@ -1404,9 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1404 de = (struct ext4_dir_entry_2 *)((char *)fde + 1404 de = (struct ext4_dir_entry_2 *)((char *)fde +
1405 ext4_rec_len_from_disk(fde->rec_len, blocksize)); 1405 ext4_rec_len_from_disk(fde->rec_len, blocksize));
1406 if ((char *) de >= (((char *) root) + blocksize)) { 1406 if ((char *) de >= (((char *) root) + blocksize)) {
1407 ext4_error(dir->i_sb, 1407 EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
1408 "invalid rec_len for '..' in inode %lu",
1409 dir->i_ino);
1410 brelse(bh); 1408 brelse(bh);
1411 return -EIO; 1409 return -EIO;
1412 } 1410 }
@@ -1418,7 +1416,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1418 brelse(bh); 1416 brelse(bh);
1419 return retval; 1417 return retval;
1420 } 1418 }
1421 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL; 1419 ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
1422 data1 = bh2->b_data; 1420 data1 = bh2->b_data;
1423 1421
1424 memcpy (data1, de, len); 1422 memcpy (data1, de, len);
@@ -1491,7 +1489,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1491 retval = ext4_dx_add_entry(handle, dentry, inode); 1489 retval = ext4_dx_add_entry(handle, dentry, inode);
1492 if (!retval || (retval != ERR_BAD_DX_DIR)) 1490 if (!retval || (retval != ERR_BAD_DX_DIR))
1493 return retval; 1491 return retval;
1494 EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL; 1492 ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
1495 dx_fallback++; 1493 dx_fallback++;
1496 ext4_mark_inode_dirty(handle, dir); 1494 ext4_mark_inode_dirty(handle, dir);
1497 } 1495 }
@@ -1519,6 +1517,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1519 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1517 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1520 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1518 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1521 brelse(bh); 1519 brelse(bh);
1520 if (retval == 0)
1521 ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
1522 return retval; 1522 return retval;
1523} 1523}
1524 1524
@@ -1915,9 +1915,8 @@ static int empty_dir(struct inode *inode)
1915 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || 1915 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
1916 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { 1916 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
1917 if (err) 1917 if (err)
1918 ext4_error(inode->i_sb, 1918 EXT4_ERROR_INODE(inode,
1919 "error %d reading directory #%lu offset 0", 1919 "error %d reading directory lblock 0", err);
1920 err, inode->i_ino);
1921 else 1920 else
1922 ext4_warning(inode->i_sb, 1921 ext4_warning(inode->i_sb,
1923 "bad directory (dir #%lu) - no data block", 1922 "bad directory (dir #%lu) - no data block",
@@ -1941,17 +1940,17 @@ static int empty_dir(struct inode *inode)
1941 de = ext4_next_entry(de1, sb->s_blocksize); 1940 de = ext4_next_entry(de1, sb->s_blocksize);
1942 while (offset < inode->i_size) { 1941 while (offset < inode->i_size) {
1943 if (!bh || 1942 if (!bh ||
1944 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { 1943 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1944 unsigned int lblock;
1945 err = 0; 1945 err = 0;
1946 brelse(bh); 1946 brelse(bh);
1947 bh = ext4_bread(NULL, inode, 1947 lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
1948 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); 1948 bh = ext4_bread(NULL, inode, lblock, 0, &err);
1949 if (!bh) { 1949 if (!bh) {
1950 if (err) 1950 if (err)
1951 ext4_error(sb, 1951 EXT4_ERROR_INODE(inode,
1952 "error %d reading directory" 1952 "error %d reading directory "
1953 " #%lu offset %u", 1953 "lblock %u", err, lblock);
1954 err, inode->i_ino, offset);
1955 offset += sb->s_blocksize; 1954 offset += sb->s_blocksize;
1956 continue; 1955 continue;
1957 } 1956 }
@@ -2297,7 +2296,7 @@ retry:
2297 } 2296 }
2298 } else { 2297 } else {
2299 /* clear the extent format for fast symlink */ 2298 /* clear the extent format for fast symlink */
2300 EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL; 2299 ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
2301 inode->i_op = &ext4_fast_symlink_inode_operations; 2300 inode->i_op = &ext4_fast_symlink_inode_operations;
2302 memcpy((char *)&EXT4_I(inode)->i_data, symname, l); 2301 memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
2303 inode->i_size = l-1; 2302 inode->i_size = l-1;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 5692c48754a0..6df797eb9aeb 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -911,7 +911,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
911 percpu_counter_add(&sbi->s_freeinodes_counter, 911 percpu_counter_add(&sbi->s_freeinodes_counter,
912 EXT4_INODES_PER_GROUP(sb)); 912 EXT4_INODES_PER_GROUP(sb));
913 913
914 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { 914 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
915 sbi->s_log_groups_per_flex) {
915 ext4_group_t flex_group; 916 ext4_group_t flex_group;
916 flex_group = ext4_flex_group(sbi, input->group); 917 flex_group = ext4_flex_group(sbi, input->group);
917 atomic_add(input->free_blocks_count, 918 atomic_add(input->free_blocks_count,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e14d22c170d5..4e8983a9811b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -241,6 +241,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
241 if (sb->s_flags & MS_RDONLY) 241 if (sb->s_flags & MS_RDONLY)
242 return ERR_PTR(-EROFS); 242 return ERR_PTR(-EROFS);
243 243
244 vfs_check_frozen(sb, SB_FREEZE_WRITE);
244 /* Special case here: if the journal has aborted behind our 245 /* Special case here: if the journal has aborted behind our
245 * backs (eg. EIO in the commit thread), then we still need to 246 * backs (eg. EIO in the commit thread), then we still need to
246 * take the FS itself readonly cleanly. */ 247 * take the FS itself readonly cleanly. */
@@ -645,6 +646,8 @@ static void ext4_put_super(struct super_block *sb)
645 struct ext4_super_block *es = sbi->s_es; 646 struct ext4_super_block *es = sbi->s_es;
646 int i, err; 647 int i, err;
647 648
649 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
650
648 flush_workqueue(sbi->dio_unwritten_wq); 651 flush_workqueue(sbi->dio_unwritten_wq);
649 destroy_workqueue(sbi->dio_unwritten_wq); 652 destroy_workqueue(sbi->dio_unwritten_wq);
650 653
@@ -941,6 +944,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
941 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); 944 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
942 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) 945 if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
943 seq_puts(seq, ",journal_async_commit"); 946 seq_puts(seq, ",journal_async_commit");
947 else if (test_opt(sb, JOURNAL_CHECKSUM))
948 seq_puts(seq, ",journal_checksum");
944 if (test_opt(sb, NOBH)) 949 if (test_opt(sb, NOBH))
945 seq_puts(seq, ",nobh"); 950 seq_puts(seq, ",nobh");
946 if (test_opt(sb, I_VERSION)) 951 if (test_opt(sb, I_VERSION))
@@ -1059,7 +1064,7 @@ static int ext4_release_dquot(struct dquot *dquot);
1059static int ext4_mark_dquot_dirty(struct dquot *dquot); 1064static int ext4_mark_dquot_dirty(struct dquot *dquot);
1060static int ext4_write_info(struct super_block *sb, int type); 1065static int ext4_write_info(struct super_block *sb, int type);
1061static int ext4_quota_on(struct super_block *sb, int type, int format_id, 1066static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1062 char *path, int remount); 1067 char *path);
1063static int ext4_quota_on_mount(struct super_block *sb, int type); 1068static int ext4_quota_on_mount(struct super_block *sb, int type);
1064static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 1069static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
1065 size_t len, loff_t off); 1070 size_t len, loff_t off);
@@ -1081,12 +1086,12 @@ static const struct dquot_operations ext4_quota_operations = {
1081 1086
1082static const struct quotactl_ops ext4_qctl_operations = { 1087static const struct quotactl_ops ext4_qctl_operations = {
1083 .quota_on = ext4_quota_on, 1088 .quota_on = ext4_quota_on,
1084 .quota_off = vfs_quota_off, 1089 .quota_off = dquot_quota_off,
1085 .quota_sync = vfs_quota_sync, 1090 .quota_sync = dquot_quota_sync,
1086 .get_info = vfs_get_dqinfo, 1091 .get_info = dquot_get_dqinfo,
1087 .set_info = vfs_set_dqinfo, 1092 .set_info = dquot_set_dqinfo,
1088 .get_dqblk = vfs_get_dqblk, 1093 .get_dqblk = dquot_get_dqblk,
1089 .set_dqblk = vfs_set_dqblk 1094 .set_dqblk = dquot_set_dqblk
1090}; 1095};
1091#endif 1096#endif
1092 1097
@@ -2051,7 +2056,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2051 /* Turn quotas off */ 2056 /* Turn quotas off */
2052 for (i = 0; i < MAXQUOTAS; i++) { 2057 for (i = 0; i < MAXQUOTAS; i++) {
2053 if (sb_dqopt(sb)->files[i]) 2058 if (sb_dqopt(sb)->files[i])
2054 vfs_quota_off(sb, i, 0); 2059 dquot_quota_off(sb, i);
2055 } 2060 }
2056#endif 2061#endif
2057 sb->s_flags = s_flags; /* Restore MS_RDONLY status */ 2062 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -2213,7 +2218,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
2213struct ext4_attr { 2218struct ext4_attr {
2214 struct attribute attr; 2219 struct attribute attr;
2215 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); 2220 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
2216 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 2221 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
2217 const char *, size_t); 2222 const char *, size_t);
2218 int offset; 2223 int offset;
2219}; 2224};
@@ -2430,6 +2435,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2430 __releases(kernel_lock) 2435 __releases(kernel_lock)
2431 __acquires(kernel_lock) 2436 __acquires(kernel_lock)
2432{ 2437{
2438 char *orig_data = kstrdup(data, GFP_KERNEL);
2433 struct buffer_head *bh; 2439 struct buffer_head *bh;
2434 struct ext4_super_block *es = NULL; 2440 struct ext4_super_block *es = NULL;
2435 struct ext4_sb_info *sbi; 2441 struct ext4_sb_info *sbi;
@@ -2793,24 +2799,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2793 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 2799 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
2794 spin_lock_init(&sbi->s_next_gen_lock); 2800 spin_lock_init(&sbi->s_next_gen_lock);
2795 2801
2796 err = percpu_counter_init(&sbi->s_freeblocks_counter,
2797 ext4_count_free_blocks(sb));
2798 if (!err) {
2799 err = percpu_counter_init(&sbi->s_freeinodes_counter,
2800 ext4_count_free_inodes(sb));
2801 }
2802 if (!err) {
2803 err = percpu_counter_init(&sbi->s_dirs_counter,
2804 ext4_count_dirs(sb));
2805 }
2806 if (!err) {
2807 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2808 }
2809 if (err) {
2810 ext4_msg(sb, KERN_ERR, "insufficient memory");
2811 goto failed_mount3;
2812 }
2813
2814 sbi->s_stripe = ext4_get_stripe_size(sbi); 2802 sbi->s_stripe = ext4_get_stripe_size(sbi);
2815 sbi->s_max_writeback_mb_bump = 128; 2803 sbi->s_max_writeback_mb_bump = 128;
2816 2804
@@ -2910,6 +2898,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2910 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 2898 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
2911 2899
2912no_journal: 2900no_journal:
2901 err = percpu_counter_init(&sbi->s_freeblocks_counter,
2902 ext4_count_free_blocks(sb));
2903 if (!err)
2904 err = percpu_counter_init(&sbi->s_freeinodes_counter,
2905 ext4_count_free_inodes(sb));
2906 if (!err)
2907 err = percpu_counter_init(&sbi->s_dirs_counter,
2908 ext4_count_dirs(sb));
2909 if (!err)
2910 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2911 if (err) {
2912 ext4_msg(sb, KERN_ERR, "insufficient memory");
2913 goto failed_mount_wq;
2914 }
2913 if (test_opt(sb, NOBH)) { 2915 if (test_opt(sb, NOBH)) {
2914 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 2916 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
2915 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " 2917 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
@@ -3001,7 +3003,7 @@ no_journal:
3001 err = ext4_setup_system_zone(sb); 3003 err = ext4_setup_system_zone(sb);
3002 if (err) { 3004 if (err) {
3003 ext4_msg(sb, KERN_ERR, "failed to initialize system " 3005 ext4_msg(sb, KERN_ERR, "failed to initialize system "
3004 "zone (%d)\n", err); 3006 "zone (%d)", err);
3005 goto failed_mount4; 3007 goto failed_mount4;
3006 } 3008 }
3007 3009
@@ -3040,9 +3042,11 @@ no_journal:
3040 } else 3042 } else
3041 descr = "out journal"; 3043 descr = "out journal";
3042 3044
3043 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr); 3045 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
3046 "Opts: %s", descr, orig_data);
3044 3047
3045 lock_kernel(); 3048 lock_kernel();
3049 kfree(orig_data);
3046 return 0; 3050 return 0;
3047 3051
3048cantfind_ext4: 3052cantfind_ext4:
@@ -3059,6 +3063,10 @@ failed_mount_wq:
3059 jbd2_journal_destroy(sbi->s_journal); 3063 jbd2_journal_destroy(sbi->s_journal);
3060 sbi->s_journal = NULL; 3064 sbi->s_journal = NULL;
3061 } 3065 }
3066 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3067 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3068 percpu_counter_destroy(&sbi->s_dirs_counter);
3069 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3062failed_mount3: 3070failed_mount3:
3063 if (sbi->s_flex_groups) { 3071 if (sbi->s_flex_groups) {
3064 if (is_vmalloc_addr(sbi->s_flex_groups)) 3072 if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3066,10 +3074,6 @@ failed_mount3:
3066 else 3074 else
3067 kfree(sbi->s_flex_groups); 3075 kfree(sbi->s_flex_groups);
3068 } 3076 }
3069 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3070 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3071 percpu_counter_destroy(&sbi->s_dirs_counter);
3072 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3073failed_mount2: 3077failed_mount2:
3074 for (i = 0; i < db_count; i++) 3078 for (i = 0; i < db_count; i++)
3075 brelse(sbi->s_group_desc[i]); 3079 brelse(sbi->s_group_desc[i]);
@@ -3089,6 +3093,7 @@ out_fail:
3089 kfree(sbi->s_blockgroup_lock); 3093 kfree(sbi->s_blockgroup_lock);
3090 kfree(sbi); 3094 kfree(sbi);
3091 lock_kernel(); 3095 lock_kernel();
3096 kfree(orig_data);
3092 return ret; 3097 return ret;
3093} 3098}
3094 3099
@@ -3380,7 +3385,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3380 if (!(sb->s_flags & MS_RDONLY)) 3385 if (!(sb->s_flags & MS_RDONLY))
3381 es->s_wtime = cpu_to_le32(get_seconds()); 3386 es->s_wtime = cpu_to_le32(get_seconds());
3382 es->s_kbytes_written = 3387 es->s_kbytes_written =
3383 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 3388 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
3384 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 3389 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
3385 EXT4_SB(sb)->s_sectors_written_start) >> 1)); 3390 EXT4_SB(sb)->s_sectors_written_start) >> 1));
3386 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 3391 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
@@ -3485,8 +3490,10 @@ int ext4_force_commit(struct super_block *sb)
3485 return 0; 3490 return 0;
3486 3491
3487 journal = EXT4_SB(sb)->s_journal; 3492 journal = EXT4_SB(sb)->s_journal;
3488 if (journal) 3493 if (journal) {
3494 vfs_check_frozen(sb, SB_FREEZE_WRITE);
3489 ret = ext4_journal_force_commit(journal); 3495 ret = ext4_journal_force_commit(journal);
3496 }
3490 3497
3491 return ret; 3498 return ret;
3492} 3499}
@@ -3535,18 +3542,16 @@ static int ext4_freeze(struct super_block *sb)
3535 * the journal. 3542 * the journal.
3536 */ 3543 */
3537 error = jbd2_journal_flush(journal); 3544 error = jbd2_journal_flush(journal);
3538 if (error < 0) { 3545 if (error < 0)
3539 out: 3546 goto out;
3540 jbd2_journal_unlock_updates(journal);
3541 return error;
3542 }
3543 3547
3544 /* Journal blocked and flushed, clear needs_recovery flag. */ 3548 /* Journal blocked and flushed, clear needs_recovery flag. */
3545 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3549 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3546 error = ext4_commit_super(sb, 1); 3550 error = ext4_commit_super(sb, 1);
3547 if (error) 3551out:
3548 goto out; 3552 /* we rely on s_frozen to stop further updates */
3549 return 0; 3553 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3554 return error;
3550} 3555}
3551 3556
3552/* 3557/*
@@ -3563,7 +3568,6 @@ static int ext4_unfreeze(struct super_block *sb)
3563 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3568 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3564 ext4_commit_super(sb, 1); 3569 ext4_commit_super(sb, 1);
3565 unlock_super(sb); 3570 unlock_super(sb);
3566 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3567 return 0; 3571 return 0;
3568} 3572}
3569 3573
@@ -3574,12 +3578,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3574 ext4_fsblk_t n_blocks_count = 0; 3578 ext4_fsblk_t n_blocks_count = 0;
3575 unsigned long old_sb_flags; 3579 unsigned long old_sb_flags;
3576 struct ext4_mount_options old_opts; 3580 struct ext4_mount_options old_opts;
3581 int enable_quota = 0;
3577 ext4_group_t g; 3582 ext4_group_t g;
3578 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 3583 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3579 int err; 3584 int err;
3580#ifdef CONFIG_QUOTA 3585#ifdef CONFIG_QUOTA
3581 int i; 3586 int i;
3582#endif 3587#endif
3588 char *orig_data = kstrdup(data, GFP_KERNEL);
3583 3589
3584 lock_kernel(); 3590 lock_kernel();
3585 3591
@@ -3630,6 +3636,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3630 } 3636 }
3631 3637
3632 if (*flags & MS_RDONLY) { 3638 if (*flags & MS_RDONLY) {
3639 err = dquot_suspend(sb, -1);
3640 if (err < 0)
3641 goto restore_opts;
3642
3633 /* 3643 /*
3634 * First of all, the unconditional stuff we have to do 3644 * First of all, the unconditional stuff we have to do
3635 * to disable replay of the journal when we next remount 3645 * to disable replay of the journal when we next remount
@@ -3698,6 +3708,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3698 goto restore_opts; 3708 goto restore_opts;
3699 if (!ext4_setup_super(sb, es, 0)) 3709 if (!ext4_setup_super(sb, es, 0))
3700 sb->s_flags &= ~MS_RDONLY; 3710 sb->s_flags &= ~MS_RDONLY;
3711 enable_quota = 1;
3701 } 3712 }
3702 } 3713 }
3703 ext4_setup_system_zone(sb); 3714 ext4_setup_system_zone(sb);
@@ -3713,6 +3724,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3713#endif 3724#endif
3714 unlock_super(sb); 3725 unlock_super(sb);
3715 unlock_kernel(); 3726 unlock_kernel();
3727 if (enable_quota)
3728 dquot_resume(sb, -1);
3729
3730 ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
3731 kfree(orig_data);
3716 return 0; 3732 return 0;
3717 3733
3718restore_opts: 3734restore_opts:
@@ -3734,6 +3750,7 @@ restore_opts:
3734#endif 3750#endif
3735 unlock_super(sb); 3751 unlock_super(sb);
3736 unlock_kernel(); 3752 unlock_kernel();
3753 kfree(orig_data);
3737 return err; 3754 return err;
3738} 3755}
3739 3756
@@ -3906,24 +3923,21 @@ static int ext4_write_info(struct super_block *sb, int type)
3906 */ 3923 */
3907static int ext4_quota_on_mount(struct super_block *sb, int type) 3924static int ext4_quota_on_mount(struct super_block *sb, int type)
3908{ 3925{
3909 return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], 3926 return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
3910 EXT4_SB(sb)->s_jquota_fmt, type); 3927 EXT4_SB(sb)->s_jquota_fmt, type);
3911} 3928}
3912 3929
3913/* 3930/*
3914 * Standard function to be called on quota_on 3931 * Standard function to be called on quota_on
3915 */ 3932 */
3916static int ext4_quota_on(struct super_block *sb, int type, int format_id, 3933static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3917 char *name, int remount) 3934 char *name)
3918{ 3935{
3919 int err; 3936 int err;
3920 struct path path; 3937 struct path path;
3921 3938
3922 if (!test_opt(sb, QUOTA)) 3939 if (!test_opt(sb, QUOTA))
3923 return -EINVAL; 3940 return -EINVAL;
3924 /* When remounting, no checks are needed and in fact, name is NULL */
3925 if (remount)
3926 return vfs_quota_on(sb, type, format_id, name, remount);
3927 3941
3928 err = kern_path(name, LOOKUP_FOLLOW, &path); 3942 err = kern_path(name, LOOKUP_FOLLOW, &path);
3929 if (err) 3943 if (err)
@@ -3962,7 +3976,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3962 } 3976 }
3963 } 3977 }
3964 3978
3965 err = vfs_quota_on_path(sb, type, format_id, &path); 3979 err = dquot_quota_on_path(sb, type, format_id, &path);
3966 path_put(&path); 3980 path_put(&path);
3967 return err; 3981 return err;
3968} 3982}
@@ -4141,6 +4155,7 @@ static int __init init_ext4_fs(void)
4141{ 4155{
4142 int err; 4156 int err;
4143 4157
4158 ext4_check_flag_values();
4144 err = init_ext4_system_zone(); 4159 err = init_ext4_system_zone();
4145 if (err) 4160 if (err)
4146 return err; 4161 return err;
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 00740cb32be3..ed9354aff279 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
34 .readlink = generic_readlink, 34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light, 35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 36 .put_link = page_put_link,
37 .setattr = ext4_setattr,
37#ifdef CONFIG_EXT4_FS_XATTR 38#ifdef CONFIG_EXT4_FS_XATTR
38 .setxattr = generic_setxattr, 39 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr, 40 .getxattr = generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
45const struct inode_operations ext4_fast_symlink_inode_operations = { 46const struct inode_operations ext4_fast_symlink_inode_operations = {
46 .readlink = generic_readlink, 47 .readlink = generic_readlink,
47 .follow_link = ext4_follow_link, 48 .follow_link = ext4_follow_link,
49 .setattr = ext4_setattr,
48#ifdef CONFIG_EXT4_FS_XATTR 50#ifdef CONFIG_EXT4_FS_XATTR
49 .setxattr = generic_setxattr, 51 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr, 52 .getxattr = generic_getxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 2de0e9515089..04338009793a 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -228,9 +228,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
229 if (ext4_xattr_check_block(bh)) { 229 if (ext4_xattr_check_block(bh)) {
230bad_block: 230bad_block:
231 ext4_error(inode->i_sb, 231 EXT4_ERROR_INODE(inode, "bad block %llu",
232 "inode %lu: bad block %llu", inode->i_ino, 232 EXT4_I(inode)->i_file_acl);
233 EXT4_I(inode)->i_file_acl);
234 error = -EIO; 233 error = -EIO;
235 goto cleanup; 234 goto cleanup;
236 } 235 }
@@ -372,9 +371,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
372 ea_bdebug(bh, "b_count=%d, refcount=%d", 371 ea_bdebug(bh, "b_count=%d, refcount=%d",
373 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 372 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
374 if (ext4_xattr_check_block(bh)) { 373 if (ext4_xattr_check_block(bh)) {
375 ext4_error(inode->i_sb, 374 EXT4_ERROR_INODE(inode, "bad block %llu",
376 "inode %lu: bad block %llu", inode->i_ino, 375 EXT4_I(inode)->i_file_acl);
377 EXT4_I(inode)->i_file_acl);
378 error = -EIO; 376 error = -EIO;
379 goto cleanup; 377 goto cleanup;
380 } 378 }
@@ -666,8 +664,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
666 atomic_read(&(bs->bh->b_count)), 664 atomic_read(&(bs->bh->b_count)),
667 le32_to_cpu(BHDR(bs->bh)->h_refcount)); 665 le32_to_cpu(BHDR(bs->bh)->h_refcount));
668 if (ext4_xattr_check_block(bs->bh)) { 666 if (ext4_xattr_check_block(bs->bh)) {
669 ext4_error(sb, "inode %lu: bad block %llu", 667 EXT4_ERROR_INODE(inode, "bad block %llu",
670 inode->i_ino, EXT4_I(inode)->i_file_acl); 668 EXT4_I(inode)->i_file_acl);
671 error = -EIO; 669 error = -EIO;
672 goto cleanup; 670 goto cleanup;
673 } 671 }
@@ -820,7 +818,7 @@ inserted:
820 EXT4_I(inode)->i_block_group); 818 EXT4_I(inode)->i_block_group);
821 819
822 /* non-extent files can't have physical blocks past 2^32 */ 820 /* non-extent files can't have physical blocks past 2^32 */
823 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 821 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
824 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 822 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
825 823
826 block = ext4_new_meta_blocks(handle, inode, 824 block = ext4_new_meta_blocks(handle, inode,
@@ -828,7 +826,7 @@ inserted:
828 if (error) 826 if (error)
829 goto cleanup; 827 goto cleanup;
830 828
831 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 829 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
832 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); 830 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
833 831
834 ea_idebug(inode, "creating block %d", block); 832 ea_idebug(inode, "creating block %d", block);
@@ -880,8 +878,8 @@ cleanup_dquot:
880 goto cleanup; 878 goto cleanup;
881 879
882bad_block: 880bad_block:
883 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 881 EXT4_ERROR_INODE(inode, "bad block %llu",
884 inode->i_ino, EXT4_I(inode)->i_file_acl); 882 EXT4_I(inode)->i_file_acl);
885 goto cleanup; 883 goto cleanup;
886 884
887#undef header 885#undef header
@@ -1194,8 +1192,8 @@ retry:
1194 if (!bh) 1192 if (!bh)
1195 goto cleanup; 1193 goto cleanup;
1196 if (ext4_xattr_check_block(bh)) { 1194 if (ext4_xattr_check_block(bh)) {
1197 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 1195 EXT4_ERROR_INODE(inode, "bad block %llu",
1198 inode->i_ino, EXT4_I(inode)->i_file_acl); 1196 EXT4_I(inode)->i_file_acl);
1199 error = -EIO; 1197 error = -EIO;
1200 goto cleanup; 1198 goto cleanup;
1201 } 1199 }
@@ -1372,14 +1370,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
1372 goto cleanup; 1370 goto cleanup;
1373 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 1371 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1374 if (!bh) { 1372 if (!bh) {
1375 ext4_error(inode->i_sb, "inode %lu: block %llu read error", 1373 EXT4_ERROR_INODE(inode, "block %llu read error",
1376 inode->i_ino, EXT4_I(inode)->i_file_acl); 1374 EXT4_I(inode)->i_file_acl);
1377 goto cleanup; 1375 goto cleanup;
1378 } 1376 }
1379 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || 1377 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
1380 BHDR(bh)->h_blocks != cpu_to_le32(1)) { 1378 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1381 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 1379 EXT4_ERROR_INODE(inode, "bad block %llu",
1382 inode->i_ino, EXT4_I(inode)->i_file_acl); 1380 EXT4_I(inode)->i_file_acl);
1383 goto cleanup; 1381 goto cleanup;
1384 } 1382 }
1385 ext4_xattr_release_block(handle, inode, bh); 1383 ext4_xattr_release_block(handle, inode, bh);
@@ -1504,9 +1502,8 @@ again:
1504 } 1502 }
1505 bh = sb_bread(inode->i_sb, ce->e_block); 1503 bh = sb_bread(inode->i_sb, ce->e_block);
1506 if (!bh) { 1504 if (!bh) {
1507 ext4_error(inode->i_sb, 1505 EXT4_ERROR_INODE(inode, "block %lu read error",
1508 "inode %lu: block %lu read error", 1506 (unsigned long) ce->e_block);
1509 inode->i_ino, (unsigned long) ce->e_block);
1510 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= 1507 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
1511 EXT4_XATTR_REFCOUNT_MAX) { 1508 EXT4_XATTR_REFCOUNT_MAX) {
1512 ea_idebug(inode, "block %lu refcount %d>=%d", 1509 ea_idebug(inode, "block %lu refcount %d>=%d",
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 113f0a1e565d..ae8200f84e39 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -242,9 +242,10 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
242 while (*fclus < cluster) { 242 while (*fclus < cluster) {
243 /* prevent the infinite loop of cluster chain */ 243 /* prevent the infinite loop of cluster chain */
244 if (*fclus > limit) { 244 if (*fclus > limit) {
245 fat_fs_error(sb, "%s: detected the cluster chain loop" 245 fat_fs_error_ratelimit(sb,
246 " (i_pos %lld)", __func__, 246 "%s: detected the cluster chain loop"
247 MSDOS_I(inode)->i_pos); 247 " (i_pos %lld)", __func__,
248 MSDOS_I(inode)->i_pos);
248 nr = -EIO; 249 nr = -EIO;
249 goto out; 250 goto out;
250 } 251 }
@@ -253,9 +254,9 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
253 if (nr < 0) 254 if (nr < 0)
254 goto out; 255 goto out;
255 else if (nr == FAT_ENT_FREE) { 256 else if (nr == FAT_ENT_FREE) {
256 fat_fs_error(sb, "%s: invalid cluster chain" 257 fat_fs_error_ratelimit(sb, "%s: invalid cluster chain"
257 " (i_pos %lld)", __func__, 258 " (i_pos %lld)", __func__,
258 MSDOS_I(inode)->i_pos); 259 MSDOS_I(inode)->i_pos);
259 nr = -EIO; 260 nr = -EIO;
260 goto out; 261 goto out;
261 } else if (nr == FAT_ENT_EOF) { 262 } else if (nr == FAT_ENT_EOF) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 530b4ca01510..ee42b9e0b16a 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -19,6 +19,7 @@
19#include <linux/buffer_head.h> 19#include <linux/buffer_head.h>
20#include <linux/compat.h> 20#include <linux/compat.h>
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22#include <linux/kernel.h>
22#include "fat.h" 23#include "fat.h"
23 24
24/* 25/*
@@ -140,28 +141,22 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
140{ 141{
141 const wchar_t *ip; 142 const wchar_t *ip;
142 wchar_t ec; 143 wchar_t ec;
143 unsigned char *op, nc; 144 unsigned char *op;
144 int charlen; 145 int charlen;
145 int k;
146 146
147 ip = uni; 147 ip = uni;
148 op = ascii; 148 op = ascii;
149 149
150 while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) { 150 while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) {
151 ec = *ip++; 151 ec = *ip++;
152 if ( (charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) { 152 if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) {
153 op += charlen; 153 op += charlen;
154 len -= charlen; 154 len -= charlen;
155 } else { 155 } else {
156 if (uni_xlate == 1) { 156 if (uni_xlate == 1) {
157 *op = ':'; 157 *op++ = ':';
158 for (k = 4; k > 0; k--) { 158 op = pack_hex_byte(op, ec >> 8);
159 nc = ec & 0xF; 159 op = pack_hex_byte(op, ec);
160 op[k] = nc > 9 ? nc + ('a' - 10)
161 : nc + '0';
162 ec >>= 4;
163 }
164 op += 5;
165 len -= 5; 160 len -= 5;
166 } else { 161 } else {
167 *op++ = '?'; 162 *op++ = '?';
@@ -758,9 +753,10 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
758 return ret; 753 return ret;
759} 754}
760 755
761static int fat_dir_ioctl(struct inode *inode, struct file *filp, 756static long fat_dir_ioctl(struct file *filp, unsigned int cmd,
762 unsigned int cmd, unsigned long arg) 757 unsigned long arg)
763{ 758{
759 struct inode *inode = filp->f_path.dentry->d_inode;
764 struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg; 760 struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg;
765 int short_only, both; 761 int short_only, both;
766 762
@@ -774,7 +770,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp,
774 both = 1; 770 both = 1;
775 break; 771 break;
776 default: 772 default:
777 return fat_generic_ioctl(inode, filp, cmd, arg); 773 return fat_generic_ioctl(filp, cmd, arg);
778 } 774 }
779 775
780 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2]))) 776 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2])))
@@ -814,7 +810,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
814 both = 1; 810 both = 1;
815 break; 811 break;
816 default: 812 default:
817 return -ENOIOCTLCMD; 813 return fat_generic_ioctl(filp, cmd, (unsigned long)arg);
818 } 814 }
819 815
820 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2]))) 816 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2])))
@@ -836,7 +832,7 @@ const struct file_operations fat_dir_operations = {
836 .llseek = generic_file_llseek, 832 .llseek = generic_file_llseek,
837 .read = generic_read_dir, 833 .read = generic_read_dir,
838 .readdir = fat_readdir, 834 .readdir = fat_readdir,
839 .ioctl = fat_dir_ioctl, 835 .unlocked_ioctl = fat_dir_ioctl,
840#ifdef CONFIG_COMPAT 836#ifdef CONFIG_COMPAT
841 .compat_ioctl = fat_compat_dir_ioctl, 837 .compat_ioctl = fat_compat_dir_ioctl,
842#endif 838#endif
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e6efdfa0f6db..27ac25725954 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -6,6 +6,7 @@
6#include <linux/nls.h> 6#include <linux/nls.h>
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/mutex.h> 8#include <linux/mutex.h>
9#include <linux/ratelimit.h>
9#include <linux/msdos_fs.h> 10#include <linux/msdos_fs.h>
10 11
11/* 12/*
@@ -82,6 +83,8 @@ struct msdos_sb_info {
82 struct fatent_operations *fatent_ops; 83 struct fatent_operations *fatent_ops;
83 struct inode *fat_inode; 84 struct inode *fat_inode;
84 85
86 struct ratelimit_state ratelimit;
87
85 spinlock_t inode_hash_lock; 88 spinlock_t inode_hash_lock;
86 struct hlist_head inode_hashtable[FAT_HASH_SIZE]; 89 struct hlist_head inode_hashtable[FAT_HASH_SIZE];
87}; 90};
@@ -298,16 +301,16 @@ extern int fat_free_clusters(struct inode *inode, int cluster);
298extern int fat_count_free_clusters(struct super_block *sb); 301extern int fat_count_free_clusters(struct super_block *sb);
299 302
300/* fat/file.c */ 303/* fat/file.c */
301extern int fat_generic_ioctl(struct inode *inode, struct file *filp, 304extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
302 unsigned int cmd, unsigned long arg); 305 unsigned long arg);
303extern const struct file_operations fat_file_operations; 306extern const struct file_operations fat_file_operations;
304extern const struct inode_operations fat_file_inode_operations; 307extern const struct inode_operations fat_file_inode_operations;
305extern int fat_setattr(struct dentry * dentry, struct iattr * attr); 308extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
306extern void fat_truncate(struct inode *inode); 309extern int fat_setsize(struct inode *inode, loff_t offset);
310extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
307extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, 311extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
308 struct kstat *stat); 312 struct kstat *stat);
309extern int fat_file_fsync(struct file *file, struct dentry *dentry, 313extern int fat_file_fsync(struct file *file, int datasync);
310 int datasync);
311 314
312/* fat/inode.c */ 315/* fat/inode.c */
313extern void fat_attach(struct inode *inode, loff_t i_pos); 316extern void fat_attach(struct inode *inode, loff_t i_pos);
@@ -322,8 +325,13 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
322extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, 325extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
323 struct inode *i2); 326 struct inode *i2);
324/* fat/misc.c */ 327/* fat/misc.c */
325extern void fat_fs_error(struct super_block *s, const char *fmt, ...) 328extern void
326 __attribute__ ((format (printf, 2, 3))) __cold; 329__fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
330 __attribute__ ((format (printf, 3, 4))) __cold;
331#define fat_fs_error(s, fmt, args...) \
332 __fat_fs_error(s, 1, fmt , ## args)
333#define fat_fs_error_ratelimit(s, fmt, args...) \
334 __fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args)
327extern int fat_clusters_flush(struct super_block *sb); 335extern int fat_clusters_flush(struct super_block *sb);
328extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); 336extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
329extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, 337extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
diff --git a/fs/fat/file.c b/fs/fat/file.c
index e8c159de236b..990dfae022e5 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/capability.h> 9#include <linux/capability.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/compat.h>
11#include <linux/mount.h> 12#include <linux/mount.h>
12#include <linux/time.h> 13#include <linux/time.h>
13#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
@@ -114,9 +115,9 @@ out:
114 return err; 115 return err;
115} 116}
116 117
117int fat_generic_ioctl(struct inode *inode, struct file *filp, 118long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
118 unsigned int cmd, unsigned long arg)
119{ 119{
120 struct inode *inode = filp->f_path.dentry->d_inode;
120 u32 __user *user_attr = (u32 __user *)arg; 121 u32 __user *user_attr = (u32 __user *)arg;
121 122
122 switch (cmd) { 123 switch (cmd) {
@@ -129,6 +130,15 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
129 } 130 }
130} 131}
131 132
133#ifdef CONFIG_COMPAT
134static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd,
135 unsigned long arg)
136
137{
138 return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
139}
140#endif
141
132static int fat_file_release(struct inode *inode, struct file *filp) 142static int fat_file_release(struct inode *inode, struct file *filp)
133{ 143{
134 if ((filp->f_mode & FMODE_WRITE) && 144 if ((filp->f_mode & FMODE_WRITE) &&
@@ -139,12 +149,12 @@ static int fat_file_release(struct inode *inode, struct file *filp)
139 return 0; 149 return 0;
140} 150}
141 151
142int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync) 152int fat_file_fsync(struct file *filp, int datasync)
143{ 153{
144 struct inode *inode = dentry->d_inode; 154 struct inode *inode = filp->f_mapping->host;
145 int res, err; 155 int res, err;
146 156
147 res = simple_fsync(filp, dentry, datasync); 157 res = generic_file_fsync(filp, datasync);
148 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); 158 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
149 159
150 return res ? res : err; 160 return res ? res : err;
@@ -159,7 +169,10 @@ const struct file_operations fat_file_operations = {
159 .aio_write = generic_file_aio_write, 169 .aio_write = generic_file_aio_write,
160 .mmap = generic_file_mmap, 170 .mmap = generic_file_mmap,
161 .release = fat_file_release, 171 .release = fat_file_release,
162 .ioctl = fat_generic_ioctl, 172 .unlocked_ioctl = fat_generic_ioctl,
173#ifdef CONFIG_COMPAT
174 .compat_ioctl = fat_generic_compat_ioctl,
175#endif
163 .fsync = fat_file_fsync, 176 .fsync = fat_file_fsync,
164 .splice_read = generic_file_splice_read, 177 .splice_read = generic_file_splice_read,
165}; 178};
@@ -270,7 +283,7 @@ static int fat_free(struct inode *inode, int skip)
270 return fat_free_clusters(inode, free_start); 283 return fat_free_clusters(inode, free_start);
271} 284}
272 285
273void fat_truncate(struct inode *inode) 286void fat_truncate_blocks(struct inode *inode, loff_t offset)
274{ 287{
275 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); 288 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
276 const unsigned int cluster_size = sbi->cluster_size; 289 const unsigned int cluster_size = sbi->cluster_size;
@@ -280,10 +293,10 @@ void fat_truncate(struct inode *inode)
280 * This protects against truncating a file bigger than it was then 293 * This protects against truncating a file bigger than it was then
281 * trying to write into the hole. 294 * trying to write into the hole.
282 */ 295 */
283 if (MSDOS_I(inode)->mmu_private > inode->i_size) 296 if (MSDOS_I(inode)->mmu_private > offset)
284 MSDOS_I(inode)->mmu_private = inode->i_size; 297 MSDOS_I(inode)->mmu_private = offset;
285 298
286 nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits; 299 nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits;
287 300
288 fat_free(inode, nr_clusters); 301 fat_free(inode, nr_clusters);
289 fat_flush_inodes(inode->i_sb, inode, NULL); 302 fat_flush_inodes(inode->i_sb, inode, NULL);
@@ -351,6 +364,18 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
351 return 0; 364 return 0;
352} 365}
353 366
367int fat_setsize(struct inode *inode, loff_t offset)
368{
369 int error;
370
371 error = simple_setsize(inode, offset);
372 if (error)
373 return error;
374 fat_truncate_blocks(inode, offset);
375
376 return error;
377}
378
354#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) 379#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
355/* valid file mode bits */ 380/* valid file mode bits */
356#define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO) 381#define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO)
@@ -365,7 +390,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
365 /* 390 /*
366 * Expand the file. Since inode_setattr() updates ->i_size 391 * Expand the file. Since inode_setattr() updates ->i_size
367 * before calling the ->truncate(), but FAT needs to fill the 392 * before calling the ->truncate(), but FAT needs to fill the
368 * hole before it. 393 * hole before it. XXX: this is no longer true with new truncate
394 * sequence.
369 */ 395 */
370 if (attr->ia_valid & ATTR_SIZE) { 396 if (attr->ia_valid & ATTR_SIZE) {
371 if (attr->ia_size > inode->i_size) { 397 if (attr->ia_size > inode->i_size) {
@@ -414,15 +440,20 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
414 attr->ia_valid &= ~ATTR_MODE; 440 attr->ia_valid &= ~ATTR_MODE;
415 } 441 }
416 442
417 if (attr->ia_valid) 443 if (attr->ia_valid & ATTR_SIZE) {
418 error = inode_setattr(inode, attr); 444 error = fat_setsize(inode, attr->ia_size);
445 if (error)
446 goto out;
447 }
448
449 generic_setattr(inode, attr);
450 mark_inode_dirty(inode);
419out: 451out:
420 return error; 452 return error;
421} 453}
422EXPORT_SYMBOL_GPL(fat_setattr); 454EXPORT_SYMBOL_GPL(fat_setattr);
423 455
424const struct inode_operations fat_file_inode_operations = { 456const struct inode_operations fat_file_inode_operations = {
425 .truncate = fat_truncate,
426 .setattr = fat_setattr, 457 .setattr = fat_setattr,
427 .getattr = fat_getattr, 458 .getattr = fat_getattr,
428}; 459};
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0ce143bd7d56..7bf45aee56d7 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -142,14 +142,29 @@ static int fat_readpages(struct file *file, struct address_space *mapping,
142 return mpage_readpages(mapping, pages, nr_pages, fat_get_block); 142 return mpage_readpages(mapping, pages, nr_pages, fat_get_block);
143} 143}
144 144
145static void fat_write_failed(struct address_space *mapping, loff_t to)
146{
147 struct inode *inode = mapping->host;
148
149 if (to > inode->i_size) {
150 truncate_pagecache(inode, to, inode->i_size);
151 fat_truncate_blocks(inode, inode->i_size);
152 }
153}
154
145static int fat_write_begin(struct file *file, struct address_space *mapping, 155static int fat_write_begin(struct file *file, struct address_space *mapping,
146 loff_t pos, unsigned len, unsigned flags, 156 loff_t pos, unsigned len, unsigned flags,
147 struct page **pagep, void **fsdata) 157 struct page **pagep, void **fsdata)
148{ 158{
159 int err;
160
149 *pagep = NULL; 161 *pagep = NULL;
150 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 162 err = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
151 fat_get_block, 163 pagep, fsdata, fat_get_block,
152 &MSDOS_I(mapping->host)->mmu_private); 164 &MSDOS_I(mapping->host)->mmu_private);
165 if (err < 0)
166 fat_write_failed(mapping, pos + len);
167 return err;
153} 168}
154 169
155static int fat_write_end(struct file *file, struct address_space *mapping, 170static int fat_write_end(struct file *file, struct address_space *mapping,
@@ -159,6 +174,8 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
159 struct inode *inode = mapping->host; 174 struct inode *inode = mapping->host;
160 int err; 175 int err;
161 err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); 176 err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
177 if (err < len)
178 fat_write_failed(mapping, pos + len);
162 if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { 179 if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
163 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 180 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
164 MSDOS_I(inode)->i_attrs |= ATTR_ARCH; 181 MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
@@ -172,7 +189,9 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
172 loff_t offset, unsigned long nr_segs) 189 loff_t offset, unsigned long nr_segs)
173{ 190{
174 struct file *file = iocb->ki_filp; 191 struct file *file = iocb->ki_filp;
175 struct inode *inode = file->f_mapping->host; 192 struct address_space *mapping = file->f_mapping;
193 struct inode *inode = mapping->host;
194 ssize_t ret;
176 195
177 if (rw == WRITE) { 196 if (rw == WRITE) {
178 /* 197 /*
@@ -193,8 +212,12 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
193 * FAT need to use the DIO_LOCKING for avoiding the race 212 * FAT need to use the DIO_LOCKING for avoiding the race
194 * condition of fat_get_block() and ->truncate(). 213 * condition of fat_get_block() and ->truncate().
195 */ 214 */
196 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 215 ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
197 offset, nr_segs, fat_get_block, NULL); 216 iov, offset, nr_segs, fat_get_block, NULL);
217 if (ret < 0 && (rw & WRITE))
218 fat_write_failed(mapping, offset + iov_length(iov, nr_segs));
219
220 return ret;
198} 221}
199 222
200static sector_t _fat_bmap(struct address_space *mapping, sector_t block) 223static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
@@ -429,7 +452,7 @@ static void fat_delete_inode(struct inode *inode)
429{ 452{
430 truncate_inode_pages(&inode->i_data, 0); 453 truncate_inode_pages(&inode->i_data, 0);
431 inode->i_size = 0; 454 inode->i_size = 0;
432 fat_truncate(inode); 455 fat_truncate_blocks(inode, 0);
433 clear_inode(inode); 456 clear_inode(inode);
434} 457}
435 458
@@ -1250,6 +1273,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1250 sb->s_op = &fat_sops; 1273 sb->s_op = &fat_sops;
1251 sb->s_export_op = &fat_export_ops; 1274 sb->s_export_op = &fat_export_ops;
1252 sbi->dir_ops = fs_dir_inode_ops; 1275 sbi->dir_ops = fs_dir_inode_ops;
1276 ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
1277 DEFAULT_RATELIMIT_BURST);
1253 1278
1254 error = parse_options(data, isvfat, silent, &debug, &sbi->options); 1279 error = parse_options(data, isvfat, silent, &debug, &sbi->options);
1255 if (error) 1280 if (error)
@@ -1497,10 +1522,8 @@ out_fail:
1497 iput(fat_inode); 1522 iput(fat_inode);
1498 if (root_inode) 1523 if (root_inode)
1499 iput(root_inode); 1524 iput(root_inode);
1500 if (sbi->nls_io) 1525 unload_nls(sbi->nls_io);
1501 unload_nls(sbi->nls_io); 1526 unload_nls(sbi->nls_disk);
1502 if (sbi->nls_disk)
1503 unload_nls(sbi->nls_disk);
1504 if (sbi->options.iocharset != fat_default_iocharset) 1527 if (sbi->options.iocharset != fat_default_iocharset)
1505 kfree(sbi->options.iocharset); 1528 kfree(sbi->options.iocharset);
1506 sb->s_fs_info = NULL; 1529 sb->s_fs_info = NULL;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index d3da05f26465..1fa23f6ffba5 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -20,27 +20,29 @@
20 * In case the file system is remounted read-only, it can be made writable 20 * In case the file system is remounted read-only, it can be made writable
21 * again by remounting it. 21 * again by remounting it.
22 */ 22 */
23void fat_fs_error(struct super_block *s, const char *fmt, ...) 23void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
24{ 24{
25 struct fat_mount_options *opts = &MSDOS_SB(s)->options; 25 struct fat_mount_options *opts = &MSDOS_SB(s)->options;
26 va_list args; 26 va_list args;
27 27
28 printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id); 28 if (report) {
29 printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
29 30
30 printk(KERN_ERR " "); 31 printk(KERN_ERR " ");
31 va_start(args, fmt); 32 va_start(args, fmt);
32 vprintk(fmt, args); 33 vprintk(fmt, args);
33 va_end(args); 34 va_end(args);
34 printk("\n"); 35 printk("\n");
36 }
35 37
36 if (opts->errors == FAT_ERRORS_PANIC) 38 if (opts->errors == FAT_ERRORS_PANIC)
37 panic(" FAT fs panic from previous error\n"); 39 panic("FAT: fs panic from previous error\n");
38 else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) { 40 else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
39 s->s_flags |= MS_RDONLY; 41 s->s_flags |= MS_RDONLY;
40 printk(KERN_ERR " File system has been set read-only\n"); 42 printk(KERN_ERR "FAT: Filesystem has been set read-only\n");
41 } 43 }
42} 44}
43EXPORT_SYMBOL_GPL(fat_fs_error); 45EXPORT_SYMBOL_GPL(__fat_fs_error);
44 46
45/* Flushes the number of free clusters on FAT32 */ 47/* Flushes the number of free clusters on FAT32 */
46/* XXX: Need to write one per FSINFO block. Currently only writes 1 */ 48/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
diff --git a/fs/file_table.c b/fs/file_table.c
index 32d12b78bac8..5c7d10ead4ad 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -194,14 +194,6 @@ struct file *alloc_file(struct path *path, fmode_t mode,
194} 194}
195EXPORT_SYMBOL(alloc_file); 195EXPORT_SYMBOL(alloc_file);
196 196
197void fput(struct file *file)
198{
199 if (atomic_long_dec_and_test(&file->f_count))
200 __fput(file);
201}
202
203EXPORT_SYMBOL(fput);
204
205/** 197/**
206 * drop_file_write_access - give up ability to write to a file 198 * drop_file_write_access - give up ability to write to a file
207 * @file: the file to which we will stop writing 199 * @file: the file to which we will stop writing
@@ -227,10 +219,9 @@ void drop_file_write_access(struct file *file)
227} 219}
228EXPORT_SYMBOL_GPL(drop_file_write_access); 220EXPORT_SYMBOL_GPL(drop_file_write_access);
229 221
230/* __fput is called from task context when aio completion releases the last 222/* the real guts of fput() - releasing the last reference to file
231 * last use of a struct file *. Do not use otherwise.
232 */ 223 */
233void __fput(struct file *file) 224static void __fput(struct file *file)
234{ 225{
235 struct dentry *dentry = file->f_path.dentry; 226 struct dentry *dentry = file->f_path.dentry;
236 struct vfsmount *mnt = file->f_path.mnt; 227 struct vfsmount *mnt = file->f_path.mnt;
@@ -268,6 +259,14 @@ void __fput(struct file *file)
268 mntput(mnt); 259 mntput(mnt);
269} 260}
270 261
262void fput(struct file *file)
263{
264 if (atomic_long_dec_and_test(&file->f_count))
265 __fput(file);
266}
267
268EXPORT_SYMBOL(fput);
269
271struct file *fget(unsigned int fd) 270struct file *fget(unsigned int fd)
272{ 271{
273 struct file *file; 272 struct file *file;
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index aee049cb9f84..0ec7bb2c95c6 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -57,6 +57,8 @@ const struct inode_operations vxfs_dir_inode_ops = {
57}; 57};
58 58
59const struct file_operations vxfs_dir_operations = { 59const struct file_operations vxfs_dir_operations = {
60 .llseek = generic_file_llseek,
61 .read = generic_read_dir,
60 .readdir = vxfs_readdir, 62 .readdir = vxfs_readdir,
61}; 63};
62 64
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 408a7877b79d..1d1088f48bc2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -398,11 +398,11 @@ static void inode_wait_for_writeback(struct inode *inode)
398 wait_queue_head_t *wqh; 398 wait_queue_head_t *wqh;
399 399
400 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 400 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
401 do { 401 while (inode->i_state & I_SYNC) {
402 spin_unlock(&inode_lock); 402 spin_unlock(&inode_lock);
403 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 403 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
404 spin_lock(&inode_lock); 404 spin_lock(&inode_lock);
405 } while (inode->i_state & I_SYNC); 405 }
406} 406}
407 407
408/* 408/*
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 1e1f286dd70e..4a8eb31c5338 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -103,7 +103,7 @@ static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
103 /* banners (can't represent line 0 by pos 0 as that would involve 103 /* banners (can't represent line 0 by pos 0 as that would involve
104 * returning a NULL pointer) */ 104 * returning a NULL pointer) */
105 if (pos == 0) 105 if (pos == 0)
106 return (struct fscache_object *) ++(*_pos); 106 return (struct fscache_object *)(long)++(*_pos);
107 if (pos < 3) 107 if (pos < 3)
108 return (struct fscache_object *)pos; 108 return (struct fscache_object *)pos;
109 109
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index eb7e9423691f..9424796d6634 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -16,8 +16,12 @@
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/file.h> 17#include <linux/file.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/pipe_fs_i.h>
20#include <linux/swap.h>
21#include <linux/splice.h>
19 22
20MODULE_ALIAS_MISCDEV(FUSE_MINOR); 23MODULE_ALIAS_MISCDEV(FUSE_MINOR);
24MODULE_ALIAS("devname:fuse");
21 25
22static struct kmem_cache *fuse_req_cachep; 26static struct kmem_cache *fuse_req_cachep;
23 27
@@ -498,6 +502,9 @@ struct fuse_copy_state {
498 int write; 502 int write;
499 struct fuse_req *req; 503 struct fuse_req *req;
500 const struct iovec *iov; 504 const struct iovec *iov;
505 struct pipe_buffer *pipebufs;
506 struct pipe_buffer *currbuf;
507 struct pipe_inode_info *pipe;
501 unsigned long nr_segs; 508 unsigned long nr_segs;
502 unsigned long seglen; 509 unsigned long seglen;
503 unsigned long addr; 510 unsigned long addr;
@@ -505,16 +512,16 @@ struct fuse_copy_state {
505 void *mapaddr; 512 void *mapaddr;
506 void *buf; 513 void *buf;
507 unsigned len; 514 unsigned len;
515 unsigned move_pages:1;
508}; 516};
509 517
510static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, 518static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
511 int write, struct fuse_req *req, 519 int write,
512 const struct iovec *iov, unsigned long nr_segs) 520 const struct iovec *iov, unsigned long nr_segs)
513{ 521{
514 memset(cs, 0, sizeof(*cs)); 522 memset(cs, 0, sizeof(*cs));
515 cs->fc = fc; 523 cs->fc = fc;
516 cs->write = write; 524 cs->write = write;
517 cs->req = req;
518 cs->iov = iov; 525 cs->iov = iov;
519 cs->nr_segs = nr_segs; 526 cs->nr_segs = nr_segs;
520} 527}
@@ -522,7 +529,18 @@ static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
522/* Unmap and put previous page of userspace buffer */ 529/* Unmap and put previous page of userspace buffer */
523static void fuse_copy_finish(struct fuse_copy_state *cs) 530static void fuse_copy_finish(struct fuse_copy_state *cs)
524{ 531{
525 if (cs->mapaddr) { 532 if (cs->currbuf) {
533 struct pipe_buffer *buf = cs->currbuf;
534
535 if (!cs->write) {
536 buf->ops->unmap(cs->pipe, buf, cs->mapaddr);
537 } else {
538 kunmap_atomic(cs->mapaddr, KM_USER0);
539 buf->len = PAGE_SIZE - cs->len;
540 }
541 cs->currbuf = NULL;
542 cs->mapaddr = NULL;
543 } else if (cs->mapaddr) {
526 kunmap_atomic(cs->mapaddr, KM_USER0); 544 kunmap_atomic(cs->mapaddr, KM_USER0);
527 if (cs->write) { 545 if (cs->write) {
528 flush_dcache_page(cs->pg); 546 flush_dcache_page(cs->pg);
@@ -544,26 +562,61 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
544 562
545 unlock_request(cs->fc, cs->req); 563 unlock_request(cs->fc, cs->req);
546 fuse_copy_finish(cs); 564 fuse_copy_finish(cs);
547 if (!cs->seglen) { 565 if (cs->pipebufs) {
548 BUG_ON(!cs->nr_segs); 566 struct pipe_buffer *buf = cs->pipebufs;
549 cs->seglen = cs->iov[0].iov_len; 567
550 cs->addr = (unsigned long) cs->iov[0].iov_base; 568 if (!cs->write) {
551 cs->iov++; 569 err = buf->ops->confirm(cs->pipe, buf);
552 cs->nr_segs--; 570 if (err)
571 return err;
572
573 BUG_ON(!cs->nr_segs);
574 cs->currbuf = buf;
575 cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
576 cs->len = buf->len;
577 cs->buf = cs->mapaddr + buf->offset;
578 cs->pipebufs++;
579 cs->nr_segs--;
580 } else {
581 struct page *page;
582
583 if (cs->nr_segs == cs->pipe->buffers)
584 return -EIO;
585
586 page = alloc_page(GFP_HIGHUSER);
587 if (!page)
588 return -ENOMEM;
589
590 buf->page = page;
591 buf->offset = 0;
592 buf->len = 0;
593
594 cs->currbuf = buf;
595 cs->mapaddr = kmap_atomic(page, KM_USER0);
596 cs->buf = cs->mapaddr;
597 cs->len = PAGE_SIZE;
598 cs->pipebufs++;
599 cs->nr_segs++;
600 }
601 } else {
602 if (!cs->seglen) {
603 BUG_ON(!cs->nr_segs);
604 cs->seglen = cs->iov[0].iov_len;
605 cs->addr = (unsigned long) cs->iov[0].iov_base;
606 cs->iov++;
607 cs->nr_segs--;
608 }
609 err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg);
610 if (err < 0)
611 return err;
612 BUG_ON(err != 1);
613 offset = cs->addr % PAGE_SIZE;
614 cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
615 cs->buf = cs->mapaddr + offset;
616 cs->len = min(PAGE_SIZE - offset, cs->seglen);
617 cs->seglen -= cs->len;
618 cs->addr += cs->len;
553 } 619 }
554 down_read(&current->mm->mmap_sem);
555 err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
556 &cs->pg, NULL);
557 up_read(&current->mm->mmap_sem);
558 if (err < 0)
559 return err;
560 BUG_ON(err != 1);
561 offset = cs->addr % PAGE_SIZE;
562 cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
563 cs->buf = cs->mapaddr + offset;
564 cs->len = min(PAGE_SIZE - offset, cs->seglen);
565 cs->seglen -= cs->len;
566 cs->addr += cs->len;
567 620
568 return lock_request(cs->fc, cs->req); 621 return lock_request(cs->fc, cs->req);
569} 622}
@@ -585,23 +638,178 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
585 return ncpy; 638 return ncpy;
586} 639}
587 640
641static int fuse_check_page(struct page *page)
642{
643 if (page_mapcount(page) ||
644 page->mapping != NULL ||
645 page_count(page) != 1 ||
646 (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
647 ~(1 << PG_locked |
648 1 << PG_referenced |
649 1 << PG_uptodate |
650 1 << PG_lru |
651 1 << PG_active |
652 1 << PG_reclaim))) {
653 printk(KERN_WARNING "fuse: trying to steal weird page\n");
654 printk(KERN_WARNING " page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
655 return 1;
656 }
657 return 0;
658}
659
660static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
661{
662 int err;
663 struct page *oldpage = *pagep;
664 struct page *newpage;
665 struct pipe_buffer *buf = cs->pipebufs;
666 struct address_space *mapping;
667 pgoff_t index;
668
669 unlock_request(cs->fc, cs->req);
670 fuse_copy_finish(cs);
671
672 err = buf->ops->confirm(cs->pipe, buf);
673 if (err)
674 return err;
675
676 BUG_ON(!cs->nr_segs);
677 cs->currbuf = buf;
678 cs->len = buf->len;
679 cs->pipebufs++;
680 cs->nr_segs--;
681
682 if (cs->len != PAGE_SIZE)
683 goto out_fallback;
684
685 if (buf->ops->steal(cs->pipe, buf) != 0)
686 goto out_fallback;
687
688 newpage = buf->page;
689
690 if (WARN_ON(!PageUptodate(newpage)))
691 return -EIO;
692
693 ClearPageMappedToDisk(newpage);
694
695 if (fuse_check_page(newpage) != 0)
696 goto out_fallback_unlock;
697
698 mapping = oldpage->mapping;
699 index = oldpage->index;
700
701 /*
702 * This is a new and locked page, it shouldn't be mapped or
703 * have any special flags on it
704 */
705 if (WARN_ON(page_mapped(oldpage)))
706 goto out_fallback_unlock;
707 if (WARN_ON(page_has_private(oldpage)))
708 goto out_fallback_unlock;
709 if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
710 goto out_fallback_unlock;
711 if (WARN_ON(PageMlocked(oldpage)))
712 goto out_fallback_unlock;
713
714 remove_from_page_cache(oldpage);
715 page_cache_release(oldpage);
716
717 err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL);
718 if (err) {
719 printk(KERN_WARNING "fuse_try_move_page: failed to add page");
720 goto out_fallback_unlock;
721 }
722 page_cache_get(newpage);
723
724 if (!(buf->flags & PIPE_BUF_FLAG_LRU))
725 lru_cache_add_file(newpage);
726
727 err = 0;
728 spin_lock(&cs->fc->lock);
729 if (cs->req->aborted)
730 err = -ENOENT;
731 else
732 *pagep = newpage;
733 spin_unlock(&cs->fc->lock);
734
735 if (err) {
736 unlock_page(newpage);
737 page_cache_release(newpage);
738 return err;
739 }
740
741 unlock_page(oldpage);
742 page_cache_release(oldpage);
743 cs->len = 0;
744
745 return 0;
746
747out_fallback_unlock:
748 unlock_page(newpage);
749out_fallback:
750 cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
751 cs->buf = cs->mapaddr + buf->offset;
752
753 err = lock_request(cs->fc, cs->req);
754 if (err)
755 return err;
756
757 return 1;
758}
759
760static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
761 unsigned offset, unsigned count)
762{
763 struct pipe_buffer *buf;
764
765 if (cs->nr_segs == cs->pipe->buffers)
766 return -EIO;
767
768 unlock_request(cs->fc, cs->req);
769 fuse_copy_finish(cs);
770
771 buf = cs->pipebufs;
772 page_cache_get(page);
773 buf->page = page;
774 buf->offset = offset;
775 buf->len = count;
776
777 cs->pipebufs++;
778 cs->nr_segs++;
779 cs->len = 0;
780
781 return 0;
782}
783
588/* 784/*
589 * Copy a page in the request to/from the userspace buffer. Must be 785 * Copy a page in the request to/from the userspace buffer. Must be
590 * done atomically 786 * done atomically
591 */ 787 */
592static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page, 788static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
593 unsigned offset, unsigned count, int zeroing) 789 unsigned offset, unsigned count, int zeroing)
594{ 790{
791 int err;
792 struct page *page = *pagep;
793
595 if (page && zeroing && count < PAGE_SIZE) { 794 if (page && zeroing && count < PAGE_SIZE) {
596 void *mapaddr = kmap_atomic(page, KM_USER1); 795 void *mapaddr = kmap_atomic(page, KM_USER1);
597 memset(mapaddr, 0, PAGE_SIZE); 796 memset(mapaddr, 0, PAGE_SIZE);
598 kunmap_atomic(mapaddr, KM_USER1); 797 kunmap_atomic(mapaddr, KM_USER1);
599 } 798 }
600 while (count) { 799 while (count) {
601 if (!cs->len) { 800 if (cs->write && cs->pipebufs && page) {
602 int err = fuse_copy_fill(cs); 801 return fuse_ref_page(cs, page, offset, count);
603 if (err) 802 } else if (!cs->len) {
604 return err; 803 if (cs->move_pages && page &&
804 offset == 0 && count == PAGE_SIZE) {
805 err = fuse_try_move_page(cs, pagep);
806 if (err <= 0)
807 return err;
808 } else {
809 err = fuse_copy_fill(cs);
810 if (err)
811 return err;
812 }
605 } 813 }
606 if (page) { 814 if (page) {
607 void *mapaddr = kmap_atomic(page, KM_USER1); 815 void *mapaddr = kmap_atomic(page, KM_USER1);
@@ -626,8 +834,10 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
626 unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset); 834 unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
627 835
628 for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { 836 for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
629 struct page *page = req->pages[i]; 837 int err;
630 int err = fuse_copy_page(cs, page, offset, count, zeroing); 838
839 err = fuse_copy_page(cs, &req->pages[i], offset, count,
840 zeroing);
631 if (err) 841 if (err)
632 return err; 842 return err;
633 843
@@ -704,11 +914,10 @@ __acquires(&fc->lock)
704 * 914 *
705 * Called with fc->lock held, releases it 915 * Called with fc->lock held, releases it
706 */ 916 */
707static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, 917static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
708 const struct iovec *iov, unsigned long nr_segs) 918 size_t nbytes, struct fuse_req *req)
709__releases(&fc->lock) 919__releases(&fc->lock)
710{ 920{
711 struct fuse_copy_state cs;
712 struct fuse_in_header ih; 921 struct fuse_in_header ih;
713 struct fuse_interrupt_in arg; 922 struct fuse_interrupt_in arg;
714 unsigned reqsize = sizeof(ih) + sizeof(arg); 923 unsigned reqsize = sizeof(ih) + sizeof(arg);
@@ -724,14 +933,13 @@ __releases(&fc->lock)
724 arg.unique = req->in.h.unique; 933 arg.unique = req->in.h.unique;
725 934
726 spin_unlock(&fc->lock); 935 spin_unlock(&fc->lock);
727 if (iov_length(iov, nr_segs) < reqsize) 936 if (nbytes < reqsize)
728 return -EINVAL; 937 return -EINVAL;
729 938
730 fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs); 939 err = fuse_copy_one(cs, &ih, sizeof(ih));
731 err = fuse_copy_one(&cs, &ih, sizeof(ih));
732 if (!err) 940 if (!err)
733 err = fuse_copy_one(&cs, &arg, sizeof(arg)); 941 err = fuse_copy_one(cs, &arg, sizeof(arg));
734 fuse_copy_finish(&cs); 942 fuse_copy_finish(cs);
735 943
736 return err ? err : reqsize; 944 return err ? err : reqsize;
737} 945}
@@ -745,18 +953,13 @@ __releases(&fc->lock)
745 * request_end(). Otherwise add it to the processing list, and set 953 * request_end(). Otherwise add it to the processing list, and set
746 * the 'sent' flag. 954 * the 'sent' flag.
747 */ 955 */
748static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, 956static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
749 unsigned long nr_segs, loff_t pos) 957 struct fuse_copy_state *cs, size_t nbytes)
750{ 958{
751 int err; 959 int err;
752 struct fuse_req *req; 960 struct fuse_req *req;
753 struct fuse_in *in; 961 struct fuse_in *in;
754 struct fuse_copy_state cs;
755 unsigned reqsize; 962 unsigned reqsize;
756 struct file *file = iocb->ki_filp;
757 struct fuse_conn *fc = fuse_get_conn(file);
758 if (!fc)
759 return -EPERM;
760 963
761 restart: 964 restart:
762 spin_lock(&fc->lock); 965 spin_lock(&fc->lock);
@@ -776,7 +979,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
776 if (!list_empty(&fc->interrupts)) { 979 if (!list_empty(&fc->interrupts)) {
777 req = list_entry(fc->interrupts.next, struct fuse_req, 980 req = list_entry(fc->interrupts.next, struct fuse_req,
778 intr_entry); 981 intr_entry);
779 return fuse_read_interrupt(fc, req, iov, nr_segs); 982 return fuse_read_interrupt(fc, cs, nbytes, req);
780 } 983 }
781 984
782 req = list_entry(fc->pending.next, struct fuse_req, list); 985 req = list_entry(fc->pending.next, struct fuse_req, list);
@@ -786,7 +989,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
786 in = &req->in; 989 in = &req->in;
787 reqsize = in->h.len; 990 reqsize = in->h.len;
788 /* If request is too large, reply with an error and restart the read */ 991 /* If request is too large, reply with an error and restart the read */
789 if (iov_length(iov, nr_segs) < reqsize) { 992 if (nbytes < reqsize) {
790 req->out.h.error = -EIO; 993 req->out.h.error = -EIO;
791 /* SETXATTR is special, since it may contain too large data */ 994 /* SETXATTR is special, since it may contain too large data */
792 if (in->h.opcode == FUSE_SETXATTR) 995 if (in->h.opcode == FUSE_SETXATTR)
@@ -795,12 +998,12 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
795 goto restart; 998 goto restart;
796 } 999 }
797 spin_unlock(&fc->lock); 1000 spin_unlock(&fc->lock);
798 fuse_copy_init(&cs, fc, 1, req, iov, nr_segs); 1001 cs->req = req;
799 err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); 1002 err = fuse_copy_one(cs, &in->h, sizeof(in->h));
800 if (!err) 1003 if (!err)
801 err = fuse_copy_args(&cs, in->numargs, in->argpages, 1004 err = fuse_copy_args(cs, in->numargs, in->argpages,
802 (struct fuse_arg *) in->args, 0); 1005 (struct fuse_arg *) in->args, 0);
803 fuse_copy_finish(&cs); 1006 fuse_copy_finish(cs);
804 spin_lock(&fc->lock); 1007 spin_lock(&fc->lock);
805 req->locked = 0; 1008 req->locked = 0;
806 if (req->aborted) { 1009 if (req->aborted) {
@@ -828,6 +1031,110 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
828 return err; 1031 return err;
829} 1032}
830 1033
1034static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
1035 unsigned long nr_segs, loff_t pos)
1036{
1037 struct fuse_copy_state cs;
1038 struct file *file = iocb->ki_filp;
1039 struct fuse_conn *fc = fuse_get_conn(file);
1040 if (!fc)
1041 return -EPERM;
1042
1043 fuse_copy_init(&cs, fc, 1, iov, nr_segs);
1044
1045 return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
1046}
1047
1048static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe,
1049 struct pipe_buffer *buf)
1050{
1051 return 1;
1052}
1053
1054static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = {
1055 .can_merge = 0,
1056 .map = generic_pipe_buf_map,
1057 .unmap = generic_pipe_buf_unmap,
1058 .confirm = generic_pipe_buf_confirm,
1059 .release = generic_pipe_buf_release,
1060 .steal = fuse_dev_pipe_buf_steal,
1061 .get = generic_pipe_buf_get,
1062};
1063
1064static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1065 struct pipe_inode_info *pipe,
1066 size_t len, unsigned int flags)
1067{
1068 int ret;
1069 int page_nr = 0;
1070 int do_wakeup = 0;
1071 struct pipe_buffer *bufs;
1072 struct fuse_copy_state cs;
1073 struct fuse_conn *fc = fuse_get_conn(in);
1074 if (!fc)
1075 return -EPERM;
1076
1077 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
1078 if (!bufs)
1079 return -ENOMEM;
1080
1081 fuse_copy_init(&cs, fc, 1, NULL, 0);
1082 cs.pipebufs = bufs;
1083 cs.pipe = pipe;
1084 ret = fuse_dev_do_read(fc, in, &cs, len);
1085 if (ret < 0)
1086 goto out;
1087
1088 ret = 0;
1089 pipe_lock(pipe);
1090
1091 if (!pipe->readers) {
1092 send_sig(SIGPIPE, current, 0);
1093 if (!ret)
1094 ret = -EPIPE;
1095 goto out_unlock;
1096 }
1097
1098 if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
1099 ret = -EIO;
1100 goto out_unlock;
1101 }
1102
1103 while (page_nr < cs.nr_segs) {
1104 int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
1105 struct pipe_buffer *buf = pipe->bufs + newbuf;
1106
1107 buf->page = bufs[page_nr].page;
1108 buf->offset = bufs[page_nr].offset;
1109 buf->len = bufs[page_nr].len;
1110 buf->ops = &fuse_dev_pipe_buf_ops;
1111
1112 pipe->nrbufs++;
1113 page_nr++;
1114 ret += buf->len;
1115
1116 if (pipe->inode)
1117 do_wakeup = 1;
1118 }
1119
1120out_unlock:
1121 pipe_unlock(pipe);
1122
1123 if (do_wakeup) {
1124 smp_mb();
1125 if (waitqueue_active(&pipe->wait))
1126 wake_up_interruptible(&pipe->wait);
1127 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
1128 }
1129
1130out:
1131 for (; page_nr < cs.nr_segs; page_nr++)
1132 page_cache_release(bufs[page_nr].page);
1133
1134 kfree(bufs);
1135 return ret;
1136}
1137
831static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, 1138static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
832 struct fuse_copy_state *cs) 1139 struct fuse_copy_state *cs)
833{ 1140{
@@ -987,23 +1294,17 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
987 * it from the list and copy the rest of the buffer to the request. 1294 * it from the list and copy the rest of the buffer to the request.
988 * The request is finished by calling request_end() 1295 * The request is finished by calling request_end()
989 */ 1296 */
990static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, 1297static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
991 unsigned long nr_segs, loff_t pos) 1298 struct fuse_copy_state *cs, size_t nbytes)
992{ 1299{
993 int err; 1300 int err;
994 size_t nbytes = iov_length(iov, nr_segs);
995 struct fuse_req *req; 1301 struct fuse_req *req;
996 struct fuse_out_header oh; 1302 struct fuse_out_header oh;
997 struct fuse_copy_state cs;
998 struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
999 if (!fc)
1000 return -EPERM;
1001 1303
1002 fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs);
1003 if (nbytes < sizeof(struct fuse_out_header)) 1304 if (nbytes < sizeof(struct fuse_out_header))
1004 return -EINVAL; 1305 return -EINVAL;
1005 1306
1006 err = fuse_copy_one(&cs, &oh, sizeof(oh)); 1307 err = fuse_copy_one(cs, &oh, sizeof(oh));
1007 if (err) 1308 if (err)
1008 goto err_finish; 1309 goto err_finish;
1009 1310
@@ -1016,7 +1317,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1016 * and error contains notification code. 1317 * and error contains notification code.
1017 */ 1318 */
1018 if (!oh.unique) { 1319 if (!oh.unique) {
1019 err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs); 1320 err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
1020 return err ? err : nbytes; 1321 return err ? err : nbytes;
1021 } 1322 }
1022 1323
@@ -1035,7 +1336,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1035 1336
1036 if (req->aborted) { 1337 if (req->aborted) {
1037 spin_unlock(&fc->lock); 1338 spin_unlock(&fc->lock);
1038 fuse_copy_finish(&cs); 1339 fuse_copy_finish(cs);
1039 spin_lock(&fc->lock); 1340 spin_lock(&fc->lock);
1040 request_end(fc, req); 1341 request_end(fc, req);
1041 return -ENOENT; 1342 return -ENOENT;
@@ -1052,7 +1353,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1052 queue_interrupt(fc, req); 1353 queue_interrupt(fc, req);
1053 1354
1054 spin_unlock(&fc->lock); 1355 spin_unlock(&fc->lock);
1055 fuse_copy_finish(&cs); 1356 fuse_copy_finish(cs);
1056 return nbytes; 1357 return nbytes;
1057 } 1358 }
1058 1359
@@ -1060,11 +1361,13 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1060 list_move(&req->list, &fc->io); 1361 list_move(&req->list, &fc->io);
1061 req->out.h = oh; 1362 req->out.h = oh;
1062 req->locked = 1; 1363 req->locked = 1;
1063 cs.req = req; 1364 cs->req = req;
1365 if (!req->out.page_replace)
1366 cs->move_pages = 0;
1064 spin_unlock(&fc->lock); 1367 spin_unlock(&fc->lock);
1065 1368
1066 err = copy_out_args(&cs, &req->out, nbytes); 1369 err = copy_out_args(cs, &req->out, nbytes);
1067 fuse_copy_finish(&cs); 1370 fuse_copy_finish(cs);
1068 1371
1069 spin_lock(&fc->lock); 1372 spin_lock(&fc->lock);
1070 req->locked = 0; 1373 req->locked = 0;
@@ -1080,10 +1383,101 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1080 err_unlock: 1383 err_unlock:
1081 spin_unlock(&fc->lock); 1384 spin_unlock(&fc->lock);
1082 err_finish: 1385 err_finish:
1083 fuse_copy_finish(&cs); 1386 fuse_copy_finish(cs);
1084 return err; 1387 return err;
1085} 1388}
1086 1389
1390static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1391 unsigned long nr_segs, loff_t pos)
1392{
1393 struct fuse_copy_state cs;
1394 struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
1395 if (!fc)
1396 return -EPERM;
1397
1398 fuse_copy_init(&cs, fc, 0, iov, nr_segs);
1399
1400 return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs));
1401}
1402
1403static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
1404 struct file *out, loff_t *ppos,
1405 size_t len, unsigned int flags)
1406{
1407 unsigned nbuf;
1408 unsigned idx;
1409 struct pipe_buffer *bufs;
1410 struct fuse_copy_state cs;
1411 struct fuse_conn *fc;
1412 size_t rem;
1413 ssize_t ret;
1414
1415 fc = fuse_get_conn(out);
1416 if (!fc)
1417 return -EPERM;
1418
1419 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
1420 if (!bufs)
1421 return -ENOMEM;
1422
1423 pipe_lock(pipe);
1424 nbuf = 0;
1425 rem = 0;
1426 for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
1427 rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
1428
1429 ret = -EINVAL;
1430 if (rem < len) {
1431 pipe_unlock(pipe);
1432 goto out;
1433 }
1434
1435 rem = len;
1436 while (rem) {
1437 struct pipe_buffer *ibuf;
1438 struct pipe_buffer *obuf;
1439
1440 BUG_ON(nbuf >= pipe->buffers);
1441 BUG_ON(!pipe->nrbufs);
1442 ibuf = &pipe->bufs[pipe->curbuf];
1443 obuf = &bufs[nbuf];
1444
1445 if (rem >= ibuf->len) {
1446 *obuf = *ibuf;
1447 ibuf->ops = NULL;
1448 pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
1449 pipe->nrbufs--;
1450 } else {
1451 ibuf->ops->get(pipe, ibuf);
1452 *obuf = *ibuf;
1453 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1454 obuf->len = rem;
1455 ibuf->offset += obuf->len;
1456 ibuf->len -= obuf->len;
1457 }
1458 nbuf++;
1459 rem -= obuf->len;
1460 }
1461 pipe_unlock(pipe);
1462
1463 fuse_copy_init(&cs, fc, 0, NULL, nbuf);
1464 cs.pipebufs = bufs;
1465 cs.pipe = pipe;
1466
1467 if (flags & SPLICE_F_MOVE)
1468 cs.move_pages = 1;
1469
1470 ret = fuse_dev_do_write(fc, &cs, len);
1471
1472 for (idx = 0; idx < nbuf; idx++) {
1473 struct pipe_buffer *buf = &bufs[idx];
1474 buf->ops->release(pipe, buf);
1475 }
1476out:
1477 kfree(bufs);
1478 return ret;
1479}
1480
1087static unsigned fuse_dev_poll(struct file *file, poll_table *wait) 1481static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
1088{ 1482{
1089 unsigned mask = POLLOUT | POLLWRNORM; 1483 unsigned mask = POLLOUT | POLLWRNORM;
@@ -1225,8 +1619,10 @@ const struct file_operations fuse_dev_operations = {
1225 .llseek = no_llseek, 1619 .llseek = no_llseek,
1226 .read = do_sync_read, 1620 .read = do_sync_read,
1227 .aio_read = fuse_dev_read, 1621 .aio_read = fuse_dev_read,
1622 .splice_read = fuse_dev_splice_read,
1228 .write = do_sync_write, 1623 .write = do_sync_write,
1229 .aio_write = fuse_dev_write, 1624 .aio_write = fuse_dev_write,
1625 .splice_write = fuse_dev_splice_write,
1230 .poll = fuse_dev_poll, 1626 .poll = fuse_dev_poll,
1231 .release = fuse_dev_release, 1627 .release = fuse_dev_release,
1232 .fasync = fuse_dev_fasync, 1628 .fasync = fuse_dev_fasync,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 4787ae6c5c1c..3cdc5f78a406 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1156,10 +1156,9 @@ static int fuse_dir_release(struct inode *inode, struct file *file)
1156 return 0; 1156 return 0;
1157} 1157}
1158 1158
1159static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync) 1159static int fuse_dir_fsync(struct file *file, int datasync)
1160{ 1160{
1161 /* nfsd can call this with no file */ 1161 return fuse_fsync_common(file, datasync, 1);
1162 return file ? fuse_fsync_common(file, de, datasync, 1) : 0;
1163} 1162}
1164 1163
1165static bool update_mtime(unsigned ivalid) 1164static bool update_mtime(unsigned ivalid)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a9f5e137f1d3..ada0adeb3bb5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -351,10 +351,9 @@ static void fuse_sync_writes(struct inode *inode)
351 fuse_release_nowrite(inode); 351 fuse_release_nowrite(inode);
352} 352}
353 353
354int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, 354int fuse_fsync_common(struct file *file, int datasync, int isdir)
355 int isdir)
356{ 355{
357 struct inode *inode = de->d_inode; 356 struct inode *inode = file->f_mapping->host;
358 struct fuse_conn *fc = get_fuse_conn(inode); 357 struct fuse_conn *fc = get_fuse_conn(inode);
359 struct fuse_file *ff = file->private_data; 358 struct fuse_file *ff = file->private_data;
360 struct fuse_req *req; 359 struct fuse_req *req;
@@ -403,9 +402,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
403 return err; 402 return err;
404} 403}
405 404
406static int fuse_fsync(struct file *file, struct dentry *de, int datasync) 405static int fuse_fsync(struct file *file, int datasync)
407{ 406{
408 return fuse_fsync_common(file, de, datasync, 0); 407 return fuse_fsync_common(file, datasync, 0);
409} 408}
410 409
411void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, 410void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
@@ -517,17 +516,26 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
517 int i; 516 int i;
518 size_t count = req->misc.read.in.size; 517 size_t count = req->misc.read.in.size;
519 size_t num_read = req->out.args[0].size; 518 size_t num_read = req->out.args[0].size;
520 struct inode *inode = req->pages[0]->mapping->host; 519 struct address_space *mapping = NULL;
521 520
522 /* 521 for (i = 0; mapping == NULL && i < req->num_pages; i++)
523 * Short read means EOF. If file size is larger, truncate it 522 mapping = req->pages[i]->mapping;
524 */
525 if (!req->out.h.error && num_read < count) {
526 loff_t pos = page_offset(req->pages[0]) + num_read;
527 fuse_read_update_size(inode, pos, req->misc.read.attr_ver);
528 }
529 523
530 fuse_invalidate_attr(inode); /* atime changed */ 524 if (mapping) {
525 struct inode *inode = mapping->host;
526
527 /*
528 * Short read means EOF. If file size is larger, truncate it
529 */
530 if (!req->out.h.error && num_read < count) {
531 loff_t pos;
532
533 pos = page_offset(req->pages[0]) + num_read;
534 fuse_read_update_size(inode, pos,
535 req->misc.read.attr_ver);
536 }
537 fuse_invalidate_attr(inode); /* atime changed */
538 }
531 539
532 for (i = 0; i < req->num_pages; i++) { 540 for (i = 0; i < req->num_pages; i++) {
533 struct page *page = req->pages[i]; 541 struct page *page = req->pages[i];
@@ -536,6 +544,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
536 else 544 else
537 SetPageError(page); 545 SetPageError(page);
538 unlock_page(page); 546 unlock_page(page);
547 page_cache_release(page);
539 } 548 }
540 if (req->ff) 549 if (req->ff)
541 fuse_file_put(req->ff); 550 fuse_file_put(req->ff);
@@ -550,6 +559,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file)
550 559
551 req->out.argpages = 1; 560 req->out.argpages = 1;
552 req->out.page_zeroing = 1; 561 req->out.page_zeroing = 1;
562 req->out.page_replace = 1;
553 fuse_read_fill(req, file, pos, count, FUSE_READ); 563 fuse_read_fill(req, file, pos, count, FUSE_READ);
554 req->misc.read.attr_ver = fuse_get_attr_version(fc); 564 req->misc.read.attr_ver = fuse_get_attr_version(fc);
555 if (fc->async_read) { 565 if (fc->async_read) {
@@ -589,6 +599,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
589 return PTR_ERR(req); 599 return PTR_ERR(req);
590 } 600 }
591 } 601 }
602 page_cache_get(page);
592 req->pages[req->num_pages] = page; 603 req->pages[req->num_pages] = page;
593 req->num_pages++; 604 req->num_pages++;
594 return 0; 605 return 0;
@@ -994,10 +1005,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
994 nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); 1005 nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
995 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; 1006 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
996 npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); 1007 npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
997 down_read(&current->mm->mmap_sem); 1008 npages = get_user_pages_fast(user_addr, npages, !write, req->pages);
998 npages = get_user_pages(current, current->mm, user_addr, npages, !write,
999 0, req->pages, NULL);
1000 up_read(&current->mm->mmap_sem);
1001 if (npages < 0) 1009 if (npages < 0)
1002 return npages; 1010 return npages;
1003 1011
@@ -1580,9 +1588,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1580 while (iov_iter_count(&ii)) { 1588 while (iov_iter_count(&ii)) {
1581 struct page *page = pages[page_idx++]; 1589 struct page *page = pages[page_idx++];
1582 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); 1590 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
1583 void *kaddr, *map; 1591 void *kaddr;
1584 1592
1585 kaddr = map = kmap(page); 1593 kaddr = kmap(page);
1586 1594
1587 while (todo) { 1595 while (todo) {
1588 char __user *uaddr = ii.iov->iov_base + ii.iov_offset; 1596 char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 01cc462ff45d..8f309f04064e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -177,6 +177,9 @@ struct fuse_out {
177 /** Zero partially or not copied pages */ 177 /** Zero partially or not copied pages */
178 unsigned page_zeroing:1; 178 unsigned page_zeroing:1;
179 179
180 /** Pages may be replaced with new ones */
181 unsigned page_replace:1;
182
180 /** Number or arguments */ 183 /** Number or arguments */
181 unsigned numargs; 184 unsigned numargs;
182 185
@@ -568,8 +571,7 @@ void fuse_release_common(struct file *file, int opcode);
568/** 571/**
569 * Send FSYNC or FSYNCDIR request 572 * Send FSYNC or FSYNCDIR request
570 */ 573 */
571int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, 574int fuse_fsync_common(struct file *file, int datasync, int isdir);
572 int isdir);
573 575
574/** 576/**
575 * Notify poll wakeup 577 * Notify poll wakeup
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 9fb76b0a0485..48171f4c943d 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -236,10 +236,14 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name,
236 void *buffer, size_t size, int xtype) 236 void *buffer, size_t size, int xtype)
237{ 237{
238 struct inode *inode = dentry->d_inode; 238 struct inode *inode = dentry->d_inode;
239 struct gfs2_sbd *sdp = GFS2_SB(inode);
239 struct posix_acl *acl; 240 struct posix_acl *acl;
240 int type; 241 int type;
241 int error; 242 int error;
242 243
244 if (!sdp->sd_args.ar_posix_acl)
245 return -EOPNOTSUPP;
246
243 type = gfs2_acl_type(name); 247 type = gfs2_acl_type(name);
244 if (type < 0) 248 if (type < 0)
245 return type; 249 return type;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index a739a0a48067..9f8b52500d63 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -700,8 +700,14 @@ out:
700 return 0; 700 return 0;
701 701
702 page_cache_release(page); 702 page_cache_release(page);
703
704 /*
705 * XXX(hch): the call below should probably be replaced with
706 * a call to the gfs2-specific truncate blocks helper to actually
707 * release disk blocks..
708 */
703 if (pos + len > ip->i_inode.i_size) 709 if (pos + len > ip->i_inode.i_size)
704 vmtruncate(&ip->i_inode, ip->i_inode.i_size); 710 simple_setsize(&ip->i_inode, ip->i_inode.i_size);
705out_endtrans: 711out_endtrans:
706 gfs2_trans_end(sdp); 712 gfs2_trans_end(sdp);
707out_trans_fail: 713out_trans_fail:
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index e6dd2aec6f82..ed9a94f0ef15 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -218,6 +218,11 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
218 if (error) 218 if (error)
219 goto out_drop_write; 219 goto out_drop_write;
220 220
221 error = -EACCES;
222 if (!is_owner_or_cap(inode))
223 goto out;
224
225 error = 0;
221 flags = ip->i_diskflags; 226 flags = ip->i_diskflags;
222 new_flags = (flags & ~mask) | (reqflags & mask); 227 new_flags = (flags & ~mask) | (reqflags & mask);
223 if ((new_flags ^ flags) == 0) 228 if ((new_flags ^ flags) == 0)
@@ -275,8 +280,10 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
275{ 280{
276 struct inode *inode = filp->f_path.dentry->d_inode; 281 struct inode *inode = filp->f_path.dentry->d_inode;
277 u32 fsflags, gfsflags; 282 u32 fsflags, gfsflags;
283
278 if (get_user(fsflags, ptr)) 284 if (get_user(fsflags, ptr))
279 return -EFAULT; 285 return -EFAULT;
286
280 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); 287 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
281 if (!S_ISDIR(inode->i_mode)) { 288 if (!S_ISDIR(inode->i_mode)) {
282 if (gfsflags & GFS2_DIF_INHERIT_JDATA) 289 if (gfsflags & GFS2_DIF_INHERIT_JDATA)
@@ -547,9 +554,9 @@ static int gfs2_close(struct inode *inode, struct file *file)
547 * Returns: errno 554 * Returns: errno
548 */ 555 */
549 556
550static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) 557static int gfs2_fsync(struct file *file, int datasync)
551{ 558{
552 struct inode *inode = dentry->d_inode; 559 struct inode *inode = file->f_mapping->host;
553 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); 560 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
554 int ret = 0; 561 int ret = 0;
555 562
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 51d8061fa07a..b5612cbb62a5 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -242,34 +242,38 @@ fail:
242} 242}
243 243
244/** 244/**
245 * gfs2_unlinked_inode_lookup - Lookup an unlinked inode for reclamation 245 * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation
246 * and try to reclaim it by doing iput.
247 *
248 * This function assumes no rgrp locks are currently held.
249 *
246 * @sb: The super block 250 * @sb: The super block
247 * no_addr: The inode number 251 * no_addr: The inode number
248 * @@inode: A pointer to the inode found, if any
249 * 252 *
250 * Returns: 0 and *inode if no errors occurred. If an error occurs,
251 * the resulting *inode may or may not be NULL.
252 */ 253 */
253 254
254int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr, 255void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
255 struct inode **inode)
256{ 256{
257 struct gfs2_sbd *sdp; 257 struct gfs2_sbd *sdp;
258 struct gfs2_inode *ip; 258 struct gfs2_inode *ip;
259 struct gfs2_glock *io_gl; 259 struct gfs2_glock *io_gl;
260 int error; 260 int error;
261 struct gfs2_holder gh; 261 struct gfs2_holder gh;
262 struct inode *inode;
262 263
263 *inode = gfs2_iget_skip(sb, no_addr); 264 inode = gfs2_iget_skip(sb, no_addr);
264 265
265 if (!(*inode)) 266 if (!inode)
266 return -ENOBUFS; 267 return;
267 268
268 if (!((*inode)->i_state & I_NEW)) 269 /* If it's not a new inode, someone's using it, so leave it alone. */
269 return -ENOBUFS; 270 if (!(inode->i_state & I_NEW)) {
271 iput(inode);
272 return;
273 }
270 274
271 ip = GFS2_I(*inode); 275 ip = GFS2_I(inode);
272 sdp = GFS2_SB(*inode); 276 sdp = GFS2_SB(inode);
273 ip->i_no_formal_ino = -1; 277 ip->i_no_formal_ino = -1;
274 278
275 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); 279 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
@@ -284,15 +288,13 @@ int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr,
284 set_bit(GIF_INVALID, &ip->i_flags); 288 set_bit(GIF_INVALID, &ip->i_flags);
285 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT, 289 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
286 &ip->i_iopen_gh); 290 &ip->i_iopen_gh);
287 if (unlikely(error)) { 291 if (unlikely(error))
288 if (error == GLR_TRYFAILED)
289 error = 0;
290 goto fail_iopen; 292 goto fail_iopen;
291 } 293
292 ip->i_iopen_gh.gh_gl->gl_object = ip; 294 ip->i_iopen_gh.gh_gl->gl_object = ip;
293 gfs2_glock_put(io_gl); 295 gfs2_glock_put(io_gl);
294 296
295 (*inode)->i_mode = DT2IF(DT_UNKNOWN); 297 inode->i_mode = DT2IF(DT_UNKNOWN);
296 298
297 /* 299 /*
298 * We must read the inode in order to work out its type in 300 * We must read the inode in order to work out its type in
@@ -303,16 +305,17 @@ int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr,
303 */ 305 */
304 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY, 306 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
305 &gh); 307 &gh);
306 if (unlikely(error)) { 308 if (unlikely(error))
307 if (error == GLR_TRYFAILED)
308 error = 0;
309 goto fail_glock; 309 goto fail_glock;
310 } 310
311 /* Inode is now uptodate */ 311 /* Inode is now uptodate */
312 gfs2_glock_dq_uninit(&gh); 312 gfs2_glock_dq_uninit(&gh);
313 gfs2_set_iop(*inode); 313 gfs2_set_iop(inode);
314
315 /* The iput will cause it to be deleted. */
316 iput(inode);
317 return;
314 318
315 return 0;
316fail_glock: 319fail_glock:
317 gfs2_glock_dq(&ip->i_iopen_gh); 320 gfs2_glock_dq(&ip->i_iopen_gh);
318fail_iopen: 321fail_iopen:
@@ -321,7 +324,8 @@ fail_put:
321 ip->i_gl->gl_object = NULL; 324 ip->i_gl->gl_object = NULL;
322 gfs2_glock_put(ip->i_gl); 325 gfs2_glock_put(ip->i_gl);
323fail: 326fail:
324 return error; 327 iget_failed(inode);
328 return;
325} 329}
326 330
327static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) 331static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index e161461d4c57..300ada3f21de 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -84,8 +84,7 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
84extern void gfs2_set_iop(struct inode *inode); 84extern void gfs2_set_iop(struct inode *inode);
85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
86 u64 no_addr, u64 no_formal_ino); 86 u64 no_addr, u64 no_formal_ino);
87extern int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr, 87extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr);
88 struct inode **inode);
89extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); 88extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
90 89
91extern int gfs2_inode_refresh(struct gfs2_inode *ip); 90extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index b593f0e28f25..6a857e24f947 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -696,7 +696,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
696 * 696 *
697 */ 697 */
698 698
699void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) 699void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
700{ 700{
701 struct gfs2_ail *ai; 701 struct gfs2_ail *ai;
702 702
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index eb570b4ad443..0d007f920234 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -47,28 +47,21 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
47 sdp->sd_log_head = sdp->sd_log_tail = value; 47 sdp->sd_log_head = sdp->sd_log_tail = value;
48} 48}
49 49
50unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, 50extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
51 unsigned int ssize); 51 unsigned int ssize);
52 52
53int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); 53extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
54void gfs2_log_incr_head(struct gfs2_sbd *sdp); 54extern void gfs2_log_incr_head(struct gfs2_sbd *sdp);
55 55
56struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); 56extern struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
57struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, 57extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
58 struct buffer_head *real); 58 struct buffer_head *real);
59void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); 59extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
60extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
61extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
60 62
61static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl) 63extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
62{ 64extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
63 if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags)) 65extern int gfs2_logd(void *data);
64 __gfs2_log_flush(sbd, gl);
65}
66
67void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
68void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
69
70void gfs2_log_shutdown(struct gfs2_sbd *sdp);
71void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
72int gfs2_logd(void *data);
73 66
74#endif /* __LOG_DOT_H__ */ 67#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 4e64352d49de..98cdd05f3316 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1071,6 +1071,9 @@ int gfs2_permission(struct inode *inode, int mask)
1071 return error; 1071 return error;
1072} 1072}
1073 1073
1074/*
1075 * XXX: should be changed to have proper ordering by opencoding simple_setsize
1076 */
1074static int setattr_size(struct inode *inode, struct iattr *attr) 1077static int setattr_size(struct inode *inode, struct iattr *attr)
1075{ 1078{
1076 struct gfs2_inode *ip = GFS2_I(inode); 1079 struct gfs2_inode *ip = GFS2_I(inode);
@@ -1081,7 +1084,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
1081 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); 1084 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
1082 if (error) 1085 if (error)
1083 return error; 1086 return error;
1084 error = vmtruncate(inode, attr->ia_size); 1087 error = simple_setsize(inode, attr->ia_size);
1085 gfs2_trans_end(sdp); 1088 gfs2_trans_end(sdp);
1086 if (error) 1089 if (error)
1087 return error; 1090 return error;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 117fa4171f62..171a744f8e45 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1192,7 +1192,6 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
1192{ 1192{
1193 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1193 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1194 struct gfs2_alloc *al = ip->i_alloc; 1194 struct gfs2_alloc *al = ip->i_alloc;
1195 struct inode *inode;
1196 int error = 0; 1195 int error = 0;
1197 u64 last_unlinked = NO_BLOCK, unlinked; 1196 u64 last_unlinked = NO_BLOCK, unlinked;
1198 1197
@@ -1210,22 +1209,27 @@ try_again:
1210 if (error) 1209 if (error)
1211 return error; 1210 return error;
1212 1211
1212 /* Find an rgrp suitable for allocation. If it encounters any unlinked
1213 dinodes along the way, error will equal -EAGAIN and unlinked will
1214 contains it block address. We then need to look up that inode and
1215 try to free it, and try the allocation again. */
1213 error = get_local_rgrp(ip, &unlinked, &last_unlinked); 1216 error = get_local_rgrp(ip, &unlinked, &last_unlinked);
1214 if (error) { 1217 if (error) {
1215 if (ip != GFS2_I(sdp->sd_rindex)) 1218 if (ip != GFS2_I(sdp->sd_rindex))
1216 gfs2_glock_dq_uninit(&al->al_ri_gh); 1219 gfs2_glock_dq_uninit(&al->al_ri_gh);
1217 if (error != -EAGAIN) 1220 if (error != -EAGAIN)
1218 return error; 1221 return error;
1219 error = gfs2_unlinked_inode_lookup(ip->i_inode.i_sb, 1222
1220 unlinked, &inode); 1223 gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
1221 if (inode) 1224 /* regardless of whether or not gfs2_process_unlinked_inode
1222 iput(inode); 1225 was successful, we don't want to repeat it again. */
1226 last_unlinked = unlinked;
1223 gfs2_log_flush(sdp, NULL); 1227 gfs2_log_flush(sdp, NULL);
1224 if (error == GLR_TRYFAILED) 1228 error = 0;
1225 error = 0; 1229
1226 goto try_again; 1230 goto try_again;
1227 } 1231 }
1228 1232 /* no error, so we have the rgrp set in the inode's allocation. */
1229 al->al_file = file; 1233 al->al_file = file;
1230 al->al_line = line; 1234 al->al_line = line;
1231 1235
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 5f4023678251..764fd1bdca88 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -494,7 +494,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
494const struct file_operations hfsplus_dir_operations = { 494const struct file_operations hfsplus_dir_operations = {
495 .read = generic_read_dir, 495 .read = generic_read_dir,
496 .readdir = hfsplus_readdir, 496 .readdir = hfsplus_readdir,
497 .ioctl = hfsplus_ioctl, 497 .unlocked_ioctl = hfsplus_ioctl,
498 .llseek = generic_file_llseek, 498 .llseek = generic_file_llseek,
499 .release = hfsplus_dir_release, 499 .release = hfsplus_dir_release,
500}; 500};
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 5c10d803d9df..6505c30ad965 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -337,8 +337,7 @@ struct inode *hfsplus_new_inode(struct super_block *, int);
337void hfsplus_delete_inode(struct inode *); 337void hfsplus_delete_inode(struct inode *);
338 338
339/* ioctl.c */ 339/* ioctl.c */
340int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 340long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
341 unsigned long arg);
342int hfsplus_setxattr(struct dentry *dentry, const char *name, 341int hfsplus_setxattr(struct dentry *dentry, const char *name,
343 const void *value, size_t size, int flags); 342 const void *value, size_t size, int flags);
344ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, 343ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 1bcf597c0562..9bbb82924a22 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -285,7 +285,7 @@ static const struct file_operations hfsplus_file_operations = {
285 .fsync = file_fsync, 285 .fsync = file_fsync,
286 .open = hfsplus_file_open, 286 .open = hfsplus_file_open,
287 .release = hfsplus_file_release, 287 .release = hfsplus_file_release,
288 .ioctl = hfsplus_ioctl, 288 .unlocked_ioctl = hfsplus_ioctl,
289}; 289};
290 290
291struct inode *hfsplus_new_inode(struct super_block *sb, int mode) 291struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index f457d2ca51ab..ac405f099026 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,14 +17,16 @@
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/xattr.h> 19#include <linux/xattr.h>
20#include <linux/smp_lock.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include "hfsplus_fs.h" 22#include "hfsplus_fs.h"
22 23
23int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 24long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
24 unsigned long arg)
25{ 25{
26 struct inode *inode = filp->f_path.dentry->d_inode;
26 unsigned int flags; 27 unsigned int flags;
27 28
29 lock_kernel();
28 switch (cmd) { 30 switch (cmd) {
29 case HFSPLUS_IOC_EXT2_GETFLAGS: 31 case HFSPLUS_IOC_EXT2_GETFLAGS:
30 flags = 0; 32 flags = 0;
@@ -38,8 +40,10 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
38 case HFSPLUS_IOC_EXT2_SETFLAGS: { 40 case HFSPLUS_IOC_EXT2_SETFLAGS: {
39 int err = 0; 41 int err = 0;
40 err = mnt_want_write(filp->f_path.mnt); 42 err = mnt_want_write(filp->f_path.mnt);
41 if (err) 43 if (err) {
44 unlock_kernel();
42 return err; 45 return err;
46 }
43 47
44 if (!is_owner_or_cap(inode)) { 48 if (!is_owner_or_cap(inode)) {
45 err = -EACCES; 49 err = -EACCES;
@@ -85,9 +89,11 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
85 mark_inode_dirty(inode); 89 mark_inode_dirty(inode);
86setflags_out: 90setflags_out:
87 mnt_drop_write(filp->f_path.mnt); 91 mnt_drop_write(filp->f_path.mnt);
92 unlock_kernel();
88 return err; 93 return err;
89 } 94 }
90 default: 95 default:
96 unlock_kernel();
91 return -ENOTTY; 97 return -ENOTTY;
92 } 98 }
93} 99}
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 3a029d8f4cf1..87ac1891a185 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -411,9 +411,9 @@ int hostfs_file_open(struct inode *ino, struct file *file)
411 return 0; 411 return 0;
412} 412}
413 413
414int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync) 414int hostfs_fsync(struct file *file, int datasync)
415{ 415{
416 return fsync_file(HOSTFS_I(dentry->d_inode)->fd, datasync); 416 return fsync_file(HOSTFS_I(file->f_mapping->host)->fd, datasync);
417} 417}
418 418
419static const struct file_operations hostfs_file_fops = { 419static const struct file_operations hostfs_file_fops = {
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 3efabff00367..a9ae9bfa752f 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -19,9 +19,9 @@ static int hpfs_file_release(struct inode *inode, struct file *file)
19 return 0; 19 return 0;
20} 20}
21 21
22int hpfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) 22int hpfs_file_fsync(struct file *file, int datasync)
23{ 23{
24 /*return file_fsync(file, dentry);*/ 24 /*return file_fsync(file, datasync);*/
25 return 0; /* Don't fsync :-) */ 25 return 0; /* Don't fsync :-) */
26} 26}
27 27
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 97bf738cd5d6..75f9d4324851 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -268,7 +268,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *,
268 268
269/* file.c */ 269/* file.c */
270 270
271int hpfs_file_fsync(struct file *, struct dentry *, int); 271int hpfs_file_fsync(struct file *, int);
272extern const struct file_operations hpfs_file_ops; 272extern const struct file_operations hpfs_file_ops;
273extern const struct inode_operations hpfs_file_iops; 273extern const struct inode_operations hpfs_file_iops;
274extern const struct address_space_operations hpfs_aops; 274extern const struct address_space_operations hpfs_aops;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 2e4dfa8593da..826c3f9d29ac 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -587,7 +587,7 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
587 return err; 587 return err;
588} 588}
589 589
590static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync) 590static int hppfs_fsync(struct file *file, int datasync)
591{ 591{
592 return 0; 592 return 0;
593} 593}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a0bbd3d1b41a..a4e9a7ec3691 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -688,7 +688,7 @@ static void init_once(void *foo)
688const struct file_operations hugetlbfs_file_operations = { 688const struct file_operations hugetlbfs_file_operations = {
689 .read = hugetlbfs_read, 689 .read = hugetlbfs_read,
690 .mmap = hugetlbfs_file_mmap, 690 .mmap = hugetlbfs_file_mmap,
691 .fsync = simple_sync_file, 691 .fsync = noop_fsync,
692 .get_unmapped_area = hugetlb_get_unmapped_area, 692 .get_unmapped_area = hugetlb_get_unmapped_area,
693}; 693};
694 694
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index b9ab69b3a482..e0aca9a0ac68 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -272,6 +272,7 @@ static int isofs_readdir(struct file *filp,
272 272
273const struct file_operations isofs_dir_operations = 273const struct file_operations isofs_dir_operations =
274{ 274{
275 .llseek = generic_file_llseek,
275 .read = generic_read_dir, 276 .read = generic_read_dir,
276 .readdir = isofs_readdir, 277 .readdir = isofs_readdir,
277}; 278};
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index bfc70f57900f..e214d68620ac 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1311,7 +1311,6 @@ int jbd2_journal_stop(handle_t *handle)
1311 if (handle->h_sync) 1311 if (handle->h_sync)
1312 transaction->t_synchronous_commit = 1; 1312 transaction->t_synchronous_commit = 1;
1313 current->journal_info = NULL; 1313 current->journal_info = NULL;
1314 spin_lock(&journal->j_state_lock);
1315 spin_lock(&transaction->t_handle_lock); 1314 spin_lock(&transaction->t_handle_lock);
1316 transaction->t_outstanding_credits -= handle->h_buffer_credits; 1315 transaction->t_outstanding_credits -= handle->h_buffer_credits;
1317 transaction->t_updates--; 1316 transaction->t_updates--;
@@ -1340,8 +1339,7 @@ int jbd2_journal_stop(handle_t *handle)
1340 jbd_debug(2, "transaction too old, requesting commit for " 1339 jbd_debug(2, "transaction too old, requesting commit for "
1341 "handle %p\n", handle); 1340 "handle %p\n", handle);
1342 /* This is non-blocking */ 1341 /* This is non-blocking */
1343 __jbd2_log_start_commit(journal, transaction->t_tid); 1342 jbd2_log_start_commit(journal, transaction->t_tid);
1344 spin_unlock(&journal->j_state_lock);
1345 1343
1346 /* 1344 /*
1347 * Special case: JBD2_SYNC synchronous updates require us 1345 * Special case: JBD2_SYNC synchronous updates require us
@@ -1351,7 +1349,6 @@ int jbd2_journal_stop(handle_t *handle)
1351 err = jbd2_log_wait_commit(journal, tid); 1349 err = jbd2_log_wait_commit(journal, tid);
1352 } else { 1350 } else {
1353 spin_unlock(&transaction->t_handle_lock); 1351 spin_unlock(&transaction->t_handle_lock);
1354 spin_unlock(&journal->j_state_lock);
1355 } 1352 }
1356 1353
1357 lock_map_release(&handle->h_lockdep_map); 1354 lock_map_release(&handle->h_lockdep_map);
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index e7291c161a19..813497024437 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -26,9 +26,9 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
26 struct page **pagep, void **fsdata); 26 struct page **pagep, void **fsdata);
27static int jffs2_readpage (struct file *filp, struct page *pg); 27static int jffs2_readpage (struct file *filp, struct page *pg);
28 28
29int jffs2_fsync(struct file *filp, struct dentry *dentry, int datasync) 29int jffs2_fsync(struct file *filp, int datasync)
30{ 30{
31 struct inode *inode = dentry->d_inode; 31 struct inode *inode = filp->f_mapping->host;
32 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); 32 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
33 33
34 /* Trigger GC to flush any pending writes for this inode */ 34 /* Trigger GC to flush any pending writes for this inode */
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 86e0821fc989..8bc2c80ab159 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -169,13 +169,13 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
169 mutex_unlock(&f->sem); 169 mutex_unlock(&f->sem);
170 jffs2_complete_reservation(c); 170 jffs2_complete_reservation(c);
171 171
172 /* We have to do the vmtruncate() without f->sem held, since 172 /* We have to do the simple_setsize() without f->sem held, since
173 some pages may be locked and waiting for it in readpage(). 173 some pages may be locked and waiting for it in readpage().
174 We are protected from a simultaneous write() extending i_size 174 We are protected from a simultaneous write() extending i_size
175 back past iattr->ia_size, because do_truncate() holds the 175 back past iattr->ia_size, because do_truncate() holds the
176 generic inode semaphore. */ 176 generic inode semaphore. */
177 if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) { 177 if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) {
178 vmtruncate(inode, iattr->ia_size); 178 simple_setsize(inode, iattr->ia_size);
179 inode->i_blocks = (inode->i_size + 511) >> 9; 179 inode->i_blocks = (inode->i_size + 511) >> 9;
180 } 180 }
181 181
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 035a767f958b..4791aacf3084 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -158,7 +158,7 @@ extern const struct inode_operations jffs2_dir_inode_operations;
158extern const struct file_operations jffs2_file_operations; 158extern const struct file_operations jffs2_file_operations;
159extern const struct inode_operations jffs2_file_inode_operations; 159extern const struct inode_operations jffs2_file_inode_operations;
160extern const struct address_space_operations jffs2_file_address_operations; 160extern const struct address_space_operations jffs2_file_address_operations;
161int jffs2_fsync(struct file *, struct dentry *, int); 161int jffs2_fsync(struct file *, int);
162int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); 162int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg);
163 163
164/* ioctl.c */ 164/* ioctl.c */
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 85d9ec659225..127263cc8657 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -27,9 +27,9 @@
27#include "jfs_acl.h" 27#include "jfs_acl.h"
28#include "jfs_debug.h" 28#include "jfs_debug.h"
29 29
30int jfs_fsync(struct file *file, struct dentry *dentry, int datasync) 30int jfs_fsync(struct file *file, int datasync)
31{ 31{
32 struct inode *inode = dentry->d_inode; 32 struct inode *inode = file->f_mapping->host;
33 int rc = 0; 33 int rc = 0;
34 34
35 if (!(inode->i_state & I_DIRTY) || 35 if (!(inode->i_state & I_DIRTY) ||
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 9e6bda30a6e8..11042b1f44b5 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -21,7 +21,7 @@
21struct fid; 21struct fid;
22 22
23extern struct inode *ialloc(struct inode *, umode_t); 23extern struct inode *ialloc(struct inode *, umode_t);
24extern int jfs_fsync(struct file *, struct dentry *, int); 24extern int jfs_fsync(struct file *, int);
25extern long jfs_ioctl(struct file *, unsigned int, unsigned long); 25extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); 26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
27extern struct inode *jfs_iget(struct super_block *, unsigned long); 27extern struct inode *jfs_iget(struct super_block *, unsigned long);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index b66832ac33ac..b38f96bef829 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -179,6 +179,8 @@ static void jfs_put_super(struct super_block *sb)
179 179
180 jfs_info("In jfs_put_super"); 180 jfs_info("In jfs_put_super");
181 181
182 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
183
182 lock_kernel(); 184 lock_kernel();
183 185
184 rc = jfs_umount(sb); 186 rc = jfs_umount(sb);
@@ -396,10 +398,20 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
396 398
397 JFS_SBI(sb)->flag = flag; 399 JFS_SBI(sb)->flag = flag;
398 ret = jfs_mount_rw(sb, 1); 400 ret = jfs_mount_rw(sb, 1);
401
402 /* mark the fs r/w for quota activity */
403 sb->s_flags &= ~MS_RDONLY;
404
399 unlock_kernel(); 405 unlock_kernel();
406 dquot_resume(sb, -1);
400 return ret; 407 return ret;
401 } 408 }
402 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { 409 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
410 rc = dquot_suspend(sb, -1);
411 if (rc < 0) {
412 unlock_kernel();
413 return rc;
414 }
403 rc = jfs_umount_rw(sb); 415 rc = jfs_umount_rw(sb);
404 JFS_SBI(sb)->flag = flag; 416 JFS_SBI(sb)->flag = flag;
405 unlock_kernel(); 417 unlock_kernel();
@@ -469,6 +481,10 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
469 */ 481 */
470 sb->s_op = &jfs_super_operations; 482 sb->s_op = &jfs_super_operations;
471 sb->s_export_op = &jfs_export_operations; 483 sb->s_export_op = &jfs_export_operations;
484#ifdef CONFIG_QUOTA
485 sb->dq_op = &dquot_operations;
486 sb->s_qcop = &dquot_quotactl_ops;
487#endif
472 488
473 /* 489 /*
474 * Initialize direct-mapping inode/address-space 490 * Initialize direct-mapping inode/address-space
diff --git a/fs/libfs.c b/fs/libfs.c
index 232bea425b09..09e1016eb774 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -8,6 +8,7 @@
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/vfs.h> 10#include <linux/vfs.h>
11#include <linux/quotaops.h>
11#include <linux/mutex.h> 12#include <linux/mutex.h>
12#include <linux/exportfs.h> 13#include <linux/exportfs.h>
13#include <linux/writeback.h> 14#include <linux/writeback.h>
@@ -58,11 +59,6 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na
58 return NULL; 59 return NULL;
59} 60}
60 61
61int simple_sync_file(struct file * file, struct dentry *dentry, int datasync)
62{
63 return 0;
64}
65
66int dcache_dir_open(struct inode *inode, struct file *file) 62int dcache_dir_open(struct inode *inode, struct file *file)
67{ 63{
68 static struct qstr cursor_name = {.len = 1, .name = "."}; 64 static struct qstr cursor_name = {.len = 1, .name = "."};
@@ -190,7 +186,7 @@ const struct file_operations simple_dir_operations = {
190 .llseek = dcache_dir_lseek, 186 .llseek = dcache_dir_lseek,
191 .read = generic_read_dir, 187 .read = generic_read_dir,
192 .readdir = dcache_readdir, 188 .readdir = dcache_readdir,
193 .fsync = simple_sync_file, 189 .fsync = noop_fsync,
194}; 190};
195 191
196const struct inode_operations simple_dir_inode_operations = { 192const struct inode_operations simple_dir_inode_operations = {
@@ -330,6 +326,81 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
330 return 0; 326 return 0;
331} 327}
332 328
329/**
330 * simple_setsize - handle core mm and vfs requirements for file size change
331 * @inode: inode
332 * @newsize: new file size
333 *
334 * Returns 0 on success, -error on failure.
335 *
336 * simple_setsize must be called with inode_mutex held.
337 *
338 * simple_setsize will check that the requested new size is OK (see
339 * inode_newsize_ok), and then will perform the necessary i_size update
340 * and pagecache truncation (if necessary). It will be typically be called
341 * from the filesystem's setattr function when ATTR_SIZE is passed in.
342 *
343 * The inode itself must have correct permissions and attributes to allow
344 * i_size to be changed, this function then just checks that the new size
345 * requested is valid.
346 *
347 * In the case of simple in-memory filesystems with inodes stored solely
348 * in the inode cache, and file data in the pagecache, nothing more needs
349 * to be done to satisfy a truncate request. Filesystems with on-disk
350 * blocks for example will need to free them in the case of truncate, in
351 * that case it may be easier not to use simple_setsize (but each of its
352 * components will likely be required at some point to update pagecache
353 * and inode etc).
354 */
355int simple_setsize(struct inode *inode, loff_t newsize)
356{
357 loff_t oldsize;
358 int error;
359
360 error = inode_newsize_ok(inode, newsize);
361 if (error)
362 return error;
363
364 oldsize = inode->i_size;
365 i_size_write(inode, newsize);
366 truncate_pagecache(inode, oldsize, newsize);
367
368 return error;
369}
370EXPORT_SYMBOL(simple_setsize);
371
372/**
373 * simple_setattr - setattr for simple in-memory filesystem
374 * @dentry: dentry
375 * @iattr: iattr structure
376 *
377 * Returns 0 on success, -error on failure.
378 *
379 * simple_setattr implements setattr for an in-memory filesystem which
380 * does not store its own file data or metadata (eg. uses the page cache
381 * and inode cache as its data store).
382 */
383int simple_setattr(struct dentry *dentry, struct iattr *iattr)
384{
385 struct inode *inode = dentry->d_inode;
386 int error;
387
388 error = inode_change_ok(inode, iattr);
389 if (error)
390 return error;
391
392 if (iattr->ia_valid & ATTR_SIZE) {
393 error = simple_setsize(inode, iattr->ia_size);
394 if (error)
395 return error;
396 }
397
398 generic_setattr(inode, iattr);
399
400 return error;
401}
402EXPORT_SYMBOL(simple_setattr);
403
333int simple_readpage(struct file *file, struct page *page) 404int simple_readpage(struct file *file, struct page *page)
334{ 405{
335 clear_highpage(page); 406 clear_highpage(page);
@@ -851,13 +922,22 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
851} 922}
852EXPORT_SYMBOL_GPL(generic_fh_to_parent); 923EXPORT_SYMBOL_GPL(generic_fh_to_parent);
853 924
854int simple_fsync(struct file *file, struct dentry *dentry, int datasync) 925/**
926 * generic_file_fsync - generic fsync implementation for simple filesystems
927 * @file: file to synchronize
928 * @datasync: only synchronize essential metadata if true
929 *
930 * This is a generic implementation of the fsync method for simple
931 * filesystems which track all non-inode metadata in the buffers list
932 * hanging off the address_space structure.
933 */
934int generic_file_fsync(struct file *file, int datasync)
855{ 935{
856 struct writeback_control wbc = { 936 struct writeback_control wbc = {
857 .sync_mode = WB_SYNC_ALL, 937 .sync_mode = WB_SYNC_ALL,
858 .nr_to_write = 0, /* metadata-only; caller takes care of data */ 938 .nr_to_write = 0, /* metadata-only; caller takes care of data */
859 }; 939 };
860 struct inode *inode = dentry->d_inode; 940 struct inode *inode = file->f_mapping->host;
861 int err; 941 int err;
862 int ret; 942 int ret;
863 943
@@ -872,7 +952,15 @@ int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
872 ret = err; 952 ret = err;
873 return ret; 953 return ret;
874} 954}
875EXPORT_SYMBOL(simple_fsync); 955EXPORT_SYMBOL(generic_file_fsync);
956
957/*
958 * No-op implementation of ->fsync for in-memory filesystems.
959 */
960int noop_fsync(struct file *file, int datasync)
961{
962 return 0;
963}
876 964
877EXPORT_SYMBOL(dcache_dir_close); 965EXPORT_SYMBOL(dcache_dir_close);
878EXPORT_SYMBOL(dcache_dir_lseek); 966EXPORT_SYMBOL(dcache_dir_lseek);
@@ -895,7 +983,7 @@ EXPORT_SYMBOL(simple_release_fs);
895EXPORT_SYMBOL(simple_rename); 983EXPORT_SYMBOL(simple_rename);
896EXPORT_SYMBOL(simple_rmdir); 984EXPORT_SYMBOL(simple_rmdir);
897EXPORT_SYMBOL(simple_statfs); 985EXPORT_SYMBOL(simple_statfs);
898EXPORT_SYMBOL(simple_sync_file); 986EXPORT_SYMBOL(noop_fsync);
899EXPORT_SYMBOL(simple_unlink); 987EXPORT_SYMBOL(simple_unlink);
900EXPORT_SYMBOL(simple_read_from_buffer); 988EXPORT_SYMBOL(simple_read_from_buffer);
901EXPORT_SYMBOL(simple_write_to_buffer); 989EXPORT_SYMBOL(simple_write_to_buffer);
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 0de524071870..abe1cafbd4c2 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -219,9 +219,9 @@ int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
219 } 219 }
220} 220}
221 221
222int logfs_fsync(struct file *file, struct dentry *dentry, int datasync) 222int logfs_fsync(struct file *file, int datasync)
223{ 223{
224 struct super_block *sb = dentry->d_inode->i_sb; 224 struct super_block *sb = file->f_mapping->host->i_sb;
225 225
226 logfs_write_anchor(sb); 226 logfs_write_anchor(sb);
227 return 0; 227 return 0;
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 1a9db84f8d8f..c838c4d72111 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -506,7 +506,7 @@ extern const struct address_space_operations logfs_reg_aops;
506int logfs_readpage(struct file *file, struct page *page); 506int logfs_readpage(struct file *file, struct page *page);
507int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, 507int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
508 unsigned long arg); 508 unsigned long arg);
509int logfs_fsync(struct file *file, struct dentry *dentry, int datasync); 509int logfs_fsync(struct file *file, int datasync);
510 510
511/* gc.c */ 511/* gc.c */
512u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec); 512u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 6198731d7fcd..91969589131c 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -22,7 +22,7 @@ const struct file_operations minix_dir_operations = {
22 .llseek = generic_file_llseek, 22 .llseek = generic_file_llseek,
23 .read = generic_read_dir, 23 .read = generic_read_dir,
24 .readdir = minix_readdir, 24 .readdir = minix_readdir,
25 .fsync = simple_fsync, 25 .fsync = generic_file_fsync,
26}; 26};
27 27
28static inline void dir_put_page(struct page *page) 28static inline void dir_put_page(struct page *page)
@@ -72,11 +72,8 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
72{ 72{
73 struct address_space *mapping = dir->i_mapping; 73 struct address_space *mapping = dir->i_mapping;
74 struct page *page = read_mapping_page(mapping, n, NULL); 74 struct page *page = read_mapping_page(mapping, n, NULL);
75 if (!IS_ERR(page)) { 75 if (!IS_ERR(page))
76 kmap(page); 76 kmap(page);
77 if (!PageUptodate(page))
78 goto fail;
79 }
80 return page; 77 return page;
81 78
82fail: 79fail:
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 3eec3e607a87..d5320ff23faf 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -19,7 +19,7 @@ const struct file_operations minix_file_operations = {
19 .write = do_sync_write, 19 .write = do_sync_write,
20 .aio_write = generic_file_aio_write, 20 .aio_write = generic_file_aio_write,
21 .mmap = generic_file_mmap, 21 .mmap = generic_file_mmap,
22 .fsync = simple_fsync, 22 .fsync = generic_file_fsync,
23 .splice_read = generic_file_splice_read, 23 .splice_read = generic_file_splice_read,
24}; 24};
25 25
diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c
index f23010969369..13487ad16894 100644
--- a/fs/minix/itree_v2.c
+++ b/fs/minix/itree_v2.c
@@ -20,6 +20,9 @@ static inline block_t *i_data(struct inode *inode)
20 return (block_t *)minix_i(inode)->u.i2_data; 20 return (block_t *)minix_i(inode)->u.i2_data;
21} 21}
22 22
23#define DIRCOUNT 7
24#define INDIRCOUNT(sb) (1 << ((sb)->s_blocksize_bits - 2))
25
23static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) 26static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
24{ 27{
25 int n = 0; 28 int n = 0;
@@ -34,21 +37,21 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
34 printk("MINIX-fs: block_to_path: " 37 printk("MINIX-fs: block_to_path: "
35 "block %ld too big on dev %s\n", 38 "block %ld too big on dev %s\n",
36 block, bdevname(sb->s_bdev, b)); 39 block, bdevname(sb->s_bdev, b));
37 } else if (block < 7) { 40 } else if (block < DIRCOUNT) {
38 offsets[n++] = block; 41 offsets[n++] = block;
39 } else if ((block -= 7) < 256) { 42 } else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) {
40 offsets[n++] = 7; 43 offsets[n++] = DIRCOUNT;
41 offsets[n++] = block; 44 offsets[n++] = block;
42 } else if ((block -= 256) < 256*256) { 45 } else if ((block -= INDIRCOUNT(sb)) < INDIRCOUNT(sb) * INDIRCOUNT(sb)) {
43 offsets[n++] = 8; 46 offsets[n++] = DIRCOUNT + 1;
44 offsets[n++] = block>>8; 47 offsets[n++] = block / INDIRCOUNT(sb);
45 offsets[n++] = block & 255; 48 offsets[n++] = block % INDIRCOUNT(sb);
46 } else { 49 } else {
47 block -= 256*256; 50 block -= INDIRCOUNT(sb) * INDIRCOUNT(sb);
48 offsets[n++] = 9; 51 offsets[n++] = DIRCOUNT + 2;
49 offsets[n++] = block>>16; 52 offsets[n++] = (block / INDIRCOUNT(sb)) / INDIRCOUNT(sb);
50 offsets[n++] = (block>>8) & 255; 53 offsets[n++] = (block / INDIRCOUNT(sb)) % INDIRCOUNT(sb);
51 offsets[n++] = block & 255; 54 offsets[n++] = block % INDIRCOUNT(sb);
52 } 55 }
53 return n; 56 return n;
54} 57}
diff --git a/fs/namei.c b/fs/namei.c
index 48e1f60520ea..868d0cb9d473 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1621,6 +1621,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1621 case LAST_DOTDOT: 1621 case LAST_DOTDOT:
1622 follow_dotdot(nd); 1622 follow_dotdot(nd);
1623 dir = nd->path.dentry; 1623 dir = nd->path.dentry;
1624 case LAST_DOT:
1624 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) { 1625 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
1625 if (!dir->d_op->d_revalidate(dir, nd)) { 1626 if (!dir->d_op->d_revalidate(dir, nd)) {
1626 error = -ESTALE; 1627 error = -ESTALE;
@@ -1628,7 +1629,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1628 } 1629 }
1629 } 1630 }
1630 /* fallthrough */ 1631 /* fallthrough */
1631 case LAST_DOT:
1632 case LAST_ROOT: 1632 case LAST_ROOT:
1633 if (open_flag & O_CREAT) 1633 if (open_flag & O_CREAT)
1634 goto exit; 1634 goto exit;
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 7edfcd4d5e52..9578cbe0cd58 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -49,9 +49,10 @@ extern int ncp_symlink(struct inode *, struct dentry *, const char *);
49 49
50const struct file_operations ncp_dir_operations = 50const struct file_operations ncp_dir_operations =
51{ 51{
52 .llseek = generic_file_llseek,
52 .read = generic_read_dir, 53 .read = generic_read_dir,
53 .readdir = ncp_readdir, 54 .readdir = ncp_readdir,
54 .ioctl = ncp_ioctl, 55 .unlocked_ioctl = ncp_ioctl,
55#ifdef CONFIG_COMPAT 56#ifdef CONFIG_COMPAT
56 .compat_ioctl = ncp_compat_ioctl, 57 .compat_ioctl = ncp_compat_ioctl,
57#endif 58#endif
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 1daabb90e0a5..3639cc5cbdae 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -22,7 +22,7 @@
22#include <linux/ncp_fs.h> 22#include <linux/ncp_fs.h>
23#include "ncplib_kernel.h" 23#include "ncplib_kernel.h"
24 24
25static int ncp_fsync(struct file *file, struct dentry *dentry, int datasync) 25static int ncp_fsync(struct file *file, int datasync)
26{ 26{
27 return 0; 27 return 0;
28} 28}
@@ -295,7 +295,7 @@ const struct file_operations ncp_file_operations =
295 .llseek = ncp_remote_llseek, 295 .llseek = ncp_remote_llseek,
296 .read = ncp_file_read, 296 .read = ncp_file_read,
297 .write = ncp_file_write, 297 .write = ncp_file_write,
298 .ioctl = ncp_ioctl, 298 .unlocked_ioctl = ncp_ioctl,
299#ifdef CONFIG_COMPAT 299#ifdef CONFIG_COMPAT
300 .compat_ioctl = ncp_compat_ioctl, 300 .compat_ioctl = ncp_compat_ioctl,
301#endif 301#endif
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 60a5e2864ea8..023c03d02070 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -20,6 +20,7 @@
20#include <linux/smp_lock.h> 20#include <linux/smp_lock.h>
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/smp_lock.h>
23 24
24#include <linux/ncp_fs.h> 25#include <linux/ncp_fs.h>
25 26
@@ -261,9 +262,9 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
261} 262}
262#endif /* CONFIG_NCPFS_NLS */ 263#endif /* CONFIG_NCPFS_NLS */
263 264
264static int __ncp_ioctl(struct inode *inode, struct file *filp, 265static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
265 unsigned int cmd, unsigned long arg)
266{ 266{
267 struct inode *inode = filp->f_dentry->d_inode;
267 struct ncp_server *server = NCP_SERVER(inode); 268 struct ncp_server *server = NCP_SERVER(inode);
268 int result; 269 int result;
269 struct ncp_ioctl_request request; 270 struct ncp_ioctl_request request;
@@ -841,11 +842,11 @@ static int ncp_ioctl_need_write(unsigned int cmd)
841 } 842 }
842} 843}
843 844
844int ncp_ioctl(struct inode *inode, struct file *filp, 845long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
845 unsigned int cmd, unsigned long arg)
846{ 846{
847 int ret; 847 long ret;
848 848
849 lock_kernel();
849 if (ncp_ioctl_need_write(cmd)) { 850 if (ncp_ioctl_need_write(cmd)) {
850 /* 851 /*
851 * inside the ioctl(), any failures which 852 * inside the ioctl(), any failures which
@@ -853,24 +854,28 @@ int ncp_ioctl(struct inode *inode, struct file *filp,
853 * -EACCESS, so it seems consistent to keep 854 * -EACCESS, so it seems consistent to keep
854 * that here. 855 * that here.
855 */ 856 */
856 if (mnt_want_write(filp->f_path.mnt)) 857 if (mnt_want_write(filp->f_path.mnt)) {
857 return -EACCES; 858 ret = -EACCES;
859 goto out;
860 }
858 } 861 }
859 ret = __ncp_ioctl(inode, filp, cmd, arg); 862 ret = __ncp_ioctl(filp, cmd, arg);
860 if (ncp_ioctl_need_write(cmd)) 863 if (ncp_ioctl_need_write(cmd))
861 mnt_drop_write(filp->f_path.mnt); 864 mnt_drop_write(filp->f_path.mnt);
865
866out:
867 unlock_kernel();
862 return ret; 868 return ret;
863} 869}
864 870
865#ifdef CONFIG_COMPAT 871#ifdef CONFIG_COMPAT
866long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 872long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
867{ 873{
868 struct inode *inode = file->f_path.dentry->d_inode; 874 long ret;
869 int ret;
870 875
871 lock_kernel(); 876 lock_kernel();
872 arg = (unsigned long) compat_ptr(arg); 877 arg = (unsigned long) compat_ptr(arg);
873 ret = ncp_ioctl(inode, file, cmd, arg); 878 ret = ncp_ioctl(file, cmd, arg);
874 unlock_kernel(); 879 unlock_kernel();
875 return ret; 880 return ret;
876} 881}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ee9a179ebdf3..782b431ef91c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -53,7 +53,7 @@ static int nfs_link(struct dentry *, struct inode *, struct dentry *);
53static int nfs_mknod(struct inode *, struct dentry *, int, dev_t); 53static int nfs_mknod(struct inode *, struct dentry *, int, dev_t);
54static int nfs_rename(struct inode *, struct dentry *, 54static int nfs_rename(struct inode *, struct dentry *,
55 struct inode *, struct dentry *); 55 struct inode *, struct dentry *);
56static int nfs_fsync_dir(struct file *, struct dentry *, int); 56static int nfs_fsync_dir(struct file *, int);
57static loff_t nfs_llseek_dir(struct file *, loff_t, int); 57static loff_t nfs_llseek_dir(struct file *, loff_t, int);
58 58
59const struct file_operations nfs_dir_operations = { 59const struct file_operations nfs_dir_operations = {
@@ -641,8 +641,10 @@ out:
641 * All directory operations under NFS are synchronous, so fsync() 641 * All directory operations under NFS are synchronous, so fsync()
642 * is a dummy operation. 642 * is a dummy operation.
643 */ 643 */
644static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) 644static int nfs_fsync_dir(struct file *filp, int datasync)
645{ 645{
646 struct dentry *dentry = filp->f_path.dentry;
647
646 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", 648 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
647 dentry->d_parent->d_name.name, dentry->d_name.name, 649 dentry->d_parent->d_name.name, dentry->d_name.name,
648 datasync); 650 datasync);
@@ -1741,6 +1743,7 @@ remove_lru_entry:
1741 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); 1743 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
1742 smp_mb__after_clear_bit(); 1744 smp_mb__after_clear_bit();
1743 } 1745 }
1746 spin_unlock(&inode->i_lock);
1744 } 1747 }
1745 spin_unlock(&nfs_access_lru_lock); 1748 spin_unlock(&nfs_access_lru_lock);
1746 nfs_access_free_list(&head); 1749 nfs_access_free_list(&head);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index cac96bcc91e4..36a5e74f51b4 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -53,7 +53,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
53static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, 53static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
54 unsigned long nr_segs, loff_t pos); 54 unsigned long nr_segs, loff_t pos);
55static int nfs_file_flush(struct file *, fl_owner_t id); 55static int nfs_file_flush(struct file *, fl_owner_t id);
56static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync); 56static int nfs_file_fsync(struct file *, int datasync);
57static int nfs_check_flags(int flags); 57static int nfs_check_flags(int flags);
58static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); 58static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
59static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); 59static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
@@ -322,8 +322,9 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
322 * whether any write errors occurred for this process. 322 * whether any write errors occurred for this process.
323 */ 323 */
324static int 324static int
325nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) 325nfs_file_fsync(struct file *file, int datasync)
326{ 326{
327 struct dentry *dentry = file->f_path.dentry;
327 struct nfs_open_context *ctx = nfs_file_open_context(file); 328 struct nfs_open_context *ctx = nfs_file_open_context(file);
328 struct inode *inode = dentry->d_inode; 329 struct inode *inode = dentry->d_inode;
329 330
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2f8b1157daa2..04214fc5c304 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1060,7 +1060,7 @@ static int nfs_parse_mount_options(char *raw,
1060 goto out_nomem; 1060 goto out_nomem;
1061 rc = strict_strtoul(string, 10, &option); 1061 rc = strict_strtoul(string, 10, &option);
1062 kfree(string); 1062 kfree(string);
1063 if (rc != 0 || option > USHORT_MAX) 1063 if (rc != 0 || option > USHRT_MAX)
1064 goto out_invalid_value; 1064 goto out_invalid_value;
1065 mnt->nfs_server.port = option; 1065 mnt->nfs_server.port = option;
1066 break; 1066 break;
@@ -1181,7 +1181,7 @@ static int nfs_parse_mount_options(char *raw,
1181 goto out_nomem; 1181 goto out_nomem;
1182 rc = strict_strtoul(string, 10, &option); 1182 rc = strict_strtoul(string, 10, &option);
1183 kfree(string); 1183 kfree(string);
1184 if (rc != 0 || option > USHORT_MAX) 1184 if (rc != 0 || option > USHRT_MAX)
1185 goto out_invalid_value; 1185 goto out_invalid_value;
1186 mnt->mount_server.port = option; 1186 mnt->mount_server.port = option;
1187 break; 1187 break;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3aea3ca98ab7..91679e2631ee 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1386,7 +1386,7 @@ static int nfs_commit_inode(struct inode *inode, int how)
1386 int res = 0; 1386 int res = 0;
1387 1387
1388 if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) 1388 if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
1389 goto out; 1389 goto out_mark_dirty;
1390 spin_lock(&inode->i_lock); 1390 spin_lock(&inode->i_lock);
1391 res = nfs_scan_commit(inode, &head, 0, 0); 1391 res = nfs_scan_commit(inode, &head, 0, 0);
1392 spin_unlock(&inode->i_lock); 1392 spin_unlock(&inode->i_lock);
@@ -1398,9 +1398,18 @@ static int nfs_commit_inode(struct inode *inode, int how)
1398 wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, 1398 wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
1399 nfs_wait_bit_killable, 1399 nfs_wait_bit_killable,
1400 TASK_KILLABLE); 1400 TASK_KILLABLE);
1401 else
1402 goto out_mark_dirty;
1401 } else 1403 } else
1402 nfs_commit_clear_lock(NFS_I(inode)); 1404 nfs_commit_clear_lock(NFS_I(inode));
1403out: 1405 return res;
1406 /* Note: If we exit without ensuring that the commit is complete,
1407 * we must mark the inode as dirty. Otherwise, future calls to
1408 * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
1409 * that the data is on the disk.
1410 */
1411out_mark_dirty:
1412 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1404 return res; 1413 return res;
1405} 1414}
1406 1415
@@ -1509,14 +1518,17 @@ int nfs_wb_page(struct inode *inode, struct page *page)
1509 }; 1518 };
1510 int ret; 1519 int ret;
1511 1520
1512 while(PagePrivate(page)) { 1521 for (;;) {
1513 wait_on_page_writeback(page); 1522 wait_on_page_writeback(page);
1514 if (clear_page_dirty_for_io(page)) { 1523 if (clear_page_dirty_for_io(page)) {
1515 ret = nfs_writepage_locked(page, &wbc); 1524 ret = nfs_writepage_locked(page, &wbc);
1516 if (ret < 0) 1525 if (ret < 0)
1517 goto out_error; 1526 goto out_error;
1527 continue;
1518 } 1528 }
1519 ret = sync_inode(inode, &wbc); 1529 if (!PagePrivate(page))
1530 break;
1531 ret = nfs_commit_inode(inode, FLUSH_SYNC);
1520 if (ret < 0) 1532 if (ret < 0)
1521 goto out_error; 1533 goto out_error;
1522 } 1534 }
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index bc3194ea01f5..508941c23af7 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -998,7 +998,7 @@ static ssize_t __write_ports_addxprt(char *buf)
998 if (sscanf(buf, "%15s %4u", transport, &port) != 2) 998 if (sscanf(buf, "%15s %4u", transport, &port) != 2)
999 return -EINVAL; 999 return -EINVAL;
1000 1000
1001 if (port < 1 || port > USHORT_MAX) 1001 if (port < 1 || port > USHRT_MAX)
1002 return -EINVAL; 1002 return -EINVAL;
1003 1003
1004 err = nfsd_create_serv(); 1004 err = nfsd_create_serv();
@@ -1040,7 +1040,7 @@ static ssize_t __write_ports_delxprt(char *buf)
1040 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2) 1040 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2)
1041 return -EINVAL; 1041 return -EINVAL;
1042 1042
1043 if (port < 1 || port > USHORT_MAX || nfsd_serv == NULL) 1043 if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)
1044 return -EINVAL; 1044 return -EINVAL;
1045 1045
1046 xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port); 1046 xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 30292df443ce..c9a30d7ff6fc 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -27,7 +27,7 @@
27#include "nilfs.h" 27#include "nilfs.h"
28#include "segment.h" 28#include "segment.h"
29 29
30int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync) 30int nilfs_sync_file(struct file *file, int datasync)
31{ 31{
32 /* 32 /*
33 * Called from fsync() system call 33 * Called from fsync() system call
@@ -37,7 +37,7 @@ int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
37 * This function should be implemented when the writeback function 37 * This function should be implemented when the writeback function
38 * will be implemented. 38 * will be implemented.
39 */ 39 */
40 struct inode *inode = dentry->d_inode; 40 struct inode *inode = file->f_mapping->host;
41 int err; 41 int err;
42 42
43 if (!nilfs_inode_dirty(inode)) 43 if (!nilfs_inode_dirty(inode))
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 8723e5bfd071..47d6d7928122 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -228,7 +228,7 @@ extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
228 struct page *, struct inode *); 228 struct page *, struct inode *);
229 229
230/* file.c */ 230/* file.c */
231extern int nilfs_sync_file(struct file *, struct dentry *, int); 231extern int nilfs_sync_file(struct file *, int);
232 232
233/* ioctl.c */ 233/* ioctl.c */
234long nilfs_ioctl(struct file *, unsigned int, unsigned long); 234long nilfs_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index fe44d3feee4a..0f48e7c5d9e1 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1527,10 +1527,9 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp)
1527 * this problem for now. We do write the $BITMAP attribute if it is present 1527 * this problem for now. We do write the $BITMAP attribute if it is present
1528 * which is the important one for a directory so things are not too bad. 1528 * which is the important one for a directory so things are not too bad.
1529 */ 1529 */
1530static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry, 1530static int ntfs_dir_fsync(struct file *filp, int datasync)
1531 int datasync)
1532{ 1531{
1533 struct inode *bmp_vi, *vi = dentry->d_inode; 1532 struct inode *bmp_vi, *vi = filp->f_mapping->host;
1534 int err, ret; 1533 int err, ret;
1535 ntfs_attr na; 1534 ntfs_attr na;
1536 1535
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 8804f093ba75..113ebd9f25a4 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -98,9 +98,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
98 * the page at all. For a more detailed explanation see ntfs_truncate() in 98 * the page at all. For a more detailed explanation see ntfs_truncate() in
99 * fs/ntfs/inode.c. 99 * fs/ntfs/inode.c.
100 * 100 *
101 * @cached_page and @lru_pvec are just optimizations for dealing with multiple
102 * pages.
103 *
104 * Return 0 on success and -errno on error. In the case that an error is 101 * Return 0 on success and -errno on error. In the case that an error is
105 * encountered it is possible that the initialized size will already have been 102 * encountered it is possible that the initialized size will already have been
106 * incremented some way towards @new_init_size but it is guaranteed that if 103 * incremented some way towards @new_init_size but it is guaranteed that if
@@ -110,8 +107,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
110 * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be 107 * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be
111 * held by the caller. 108 * held by the caller.
112 */ 109 */
113static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size, 110static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
114 struct page **cached_page, struct pagevec *lru_pvec)
115{ 111{
116 s64 old_init_size; 112 s64 old_init_size;
117 loff_t old_i_size; 113 loff_t old_i_size;
@@ -403,18 +399,13 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
403 * Obtain @nr_pages locked page cache pages from the mapping @mapping and 399 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
404 * starting at index @index. 400 * starting at index @index.
405 * 401 *
406 * If a page is newly created, increment its refcount and add it to the 402 * If a page is newly created, add it to lru list
407 * caller's lru-buffering pagevec @lru_pvec.
408 *
409 * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages
410 * are obtained at once instead of just one page and that 0 is returned on
411 * success and -errno on error.
412 * 403 *
413 * Note, the page locks are obtained in ascending page index order. 404 * Note, the page locks are obtained in ascending page index order.
414 */ 405 */
415static inline int __ntfs_grab_cache_pages(struct address_space *mapping, 406static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
416 pgoff_t index, const unsigned nr_pages, struct page **pages, 407 pgoff_t index, const unsigned nr_pages, struct page **pages,
417 struct page **cached_page, struct pagevec *lru_pvec) 408 struct page **cached_page)
418{ 409{
419 int err, nr; 410 int err, nr;
420 411
@@ -430,7 +421,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
430 goto err_out; 421 goto err_out;
431 } 422 }
432 } 423 }
433 err = add_to_page_cache(*cached_page, mapping, index, 424 err = add_to_page_cache_lru(*cached_page, mapping, index,
434 GFP_KERNEL); 425 GFP_KERNEL);
435 if (unlikely(err)) { 426 if (unlikely(err)) {
436 if (err == -EEXIST) 427 if (err == -EEXIST)
@@ -438,9 +429,6 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
438 goto err_out; 429 goto err_out;
439 } 430 }
440 pages[nr] = *cached_page; 431 pages[nr] = *cached_page;
441 page_cache_get(*cached_page);
442 if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
443 __pagevec_lru_add_file(lru_pvec);
444 *cached_page = NULL; 432 *cached_page = NULL;
445 } 433 }
446 index++; 434 index++;
@@ -1800,7 +1788,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1800 ssize_t status, written; 1788 ssize_t status, written;
1801 unsigned nr_pages; 1789 unsigned nr_pages;
1802 int err; 1790 int err;
1803 struct pagevec lru_pvec;
1804 1791
1805 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " 1792 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
1806 "pos 0x%llx, count 0x%lx.", 1793 "pos 0x%llx, count 0x%lx.",
@@ -1912,7 +1899,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1912 } 1899 }
1913 } 1900 }
1914 } 1901 }
1915 pagevec_init(&lru_pvec, 0);
1916 written = 0; 1902 written = 0;
1917 /* 1903 /*
1918 * If the write starts beyond the initialized size, extend it up to the 1904 * If the write starts beyond the initialized size, extend it up to the
@@ -1925,8 +1911,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1925 ll = ni->initialized_size; 1911 ll = ni->initialized_size;
1926 read_unlock_irqrestore(&ni->size_lock, flags); 1912 read_unlock_irqrestore(&ni->size_lock, flags);
1927 if (pos > ll) { 1913 if (pos > ll) {
1928 err = ntfs_attr_extend_initialized(ni, pos, &cached_page, 1914 err = ntfs_attr_extend_initialized(ni, pos);
1929 &lru_pvec);
1930 if (err < 0) { 1915 if (err < 0) {
1931 ntfs_error(vol->sb, "Cannot perform write to inode " 1916 ntfs_error(vol->sb, "Cannot perform write to inode "
1932 "0x%lx, attribute type 0x%x, because " 1917 "0x%lx, attribute type 0x%x, because "
@@ -2012,7 +1997,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
2012 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes); 1997 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
2013 /* Get and lock @do_pages starting at index @start_idx. */ 1998 /* Get and lock @do_pages starting at index @start_idx. */
2014 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, 1999 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
2015 pages, &cached_page, &lru_pvec); 2000 pages, &cached_page);
2016 if (unlikely(status)) 2001 if (unlikely(status))
2017 break; 2002 break;
2018 /* 2003 /*
@@ -2077,7 +2062,6 @@ err_out:
2077 *ppos = pos; 2062 *ppos = pos;
2078 if (cached_page) 2063 if (cached_page)
2079 page_cache_release(cached_page); 2064 page_cache_release(cached_page);
2080 pagevec_lru_add_file(&lru_pvec);
2081 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", 2065 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
2082 written ? "written" : "status", (unsigned long)written, 2066 written ? "written" : "status", (unsigned long)written,
2083 (long)status); 2067 (long)status);
@@ -2149,7 +2133,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2149/** 2133/**
2150 * ntfs_file_fsync - sync a file to disk 2134 * ntfs_file_fsync - sync a file to disk
2151 * @filp: file to be synced 2135 * @filp: file to be synced
2152 * @dentry: dentry describing the file to sync
2153 * @datasync: if non-zero only flush user data and not metadata 2136 * @datasync: if non-zero only flush user data and not metadata
2154 * 2137 *
2155 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync 2138 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync
@@ -2165,19 +2148,15 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2165 * Also, if @datasync is true, we do not wait on the inode to be written out 2148 * Also, if @datasync is true, we do not wait on the inode to be written out
2166 * but we always wait on the page cache pages to be written out. 2149 * but we always wait on the page cache pages to be written out.
2167 * 2150 *
2168 * Note: In the past @filp could be NULL so we ignore it as we don't need it
2169 * anyway.
2170 *
2171 * Locking: Caller must hold i_mutex on the inode. 2151 * Locking: Caller must hold i_mutex on the inode.
2172 * 2152 *
2173 * TODO: We should probably also write all attribute/index inodes associated 2153 * TODO: We should probably also write all attribute/index inodes associated
2174 * with this inode but since we have no simple way of getting to them we ignore 2154 * with this inode but since we have no simple way of getting to them we ignore
2175 * this problem for now. 2155 * this problem for now.
2176 */ 2156 */
2177static int ntfs_file_fsync(struct file *filp, struct dentry *dentry, 2157static int ntfs_file_fsync(struct file *filp, int datasync)
2178 int datasync)
2179{ 2158{
2180 struct inode *vi = dentry->d_inode; 2159 struct inode *vi = filp->f_mapping->host;
2181 int err, ret = 0; 2160 int err, ret = 0;
2182 2161
2183 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); 2162 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index b7428c5d0d3b..ec6d12339593 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -403,7 +403,7 @@ void ocfs2_block_check_compute(void *data, size_t blocksize,
403 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no 403 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
404 * larger than 16 bits. 404 * larger than 16 bits.
405 */ 405 */
406 BUG_ON(ecc > USHORT_MAX); 406 BUG_ON(ecc > USHRT_MAX);
407 407
408 bc->bc_crc32e = cpu_to_le32(crc); 408 bc->bc_crc32e = cpu_to_le32(crc);
409 bc->bc_ecc = cpu_to_le16((u16)ecc); 409 bc->bc_ecc = cpu_to_le16((u16)ecc);
@@ -508,7 +508,7 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
508 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no 508 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
509 * larger than 16 bits. 509 * larger than 16 bits.
510 */ 510 */
511 BUG_ON(ecc > USHORT_MAX); 511 BUG_ON(ecc > USHRT_MAX);
512 512
513 bc->bc_crc32e = cpu_to_le32(crc); 513 bc->bc_crc32e = cpu_to_le32(crc);
514 bc->bc_ecc = cpu_to_le16((u16)ecc); 514 bc->bc_ecc = cpu_to_le16((u16)ecc);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 97e54b9e654b..6a13ea64c447 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -175,13 +175,12 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file)
175 return 0; 175 return 0;
176} 176}
177 177
178static int ocfs2_sync_file(struct file *file, 178static int ocfs2_sync_file(struct file *file, int datasync)
179 struct dentry *dentry,
180 int datasync)
181{ 179{
182 int err = 0; 180 int err = 0;
183 journal_t *journal; 181 journal_t *journal;
184 struct inode *inode = dentry->d_inode; 182 struct dentry *dentry = file->f_path.dentry;
183 struct inode *inode = file->f_mapping->host;
185 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 184 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
186 185
187 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 186 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
@@ -1053,7 +1052,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1053 } 1052 }
1054 1053
1055 /* 1054 /*
1056 * This will intentionally not wind up calling vmtruncate(), 1055 * This will intentionally not wind up calling simple_setsize(),
1057 * since all the work for a size change has been done above. 1056 * since all the work for a size change has been done above.
1058 * Otherwise, we could get into problems with truncate as 1057 * Otherwise, we could get into problems with truncate as
1059 * ip_alloc_sem is used there to protect against i_size 1058 * ip_alloc_sem is used there to protect against i_size
@@ -2119,9 +2118,13 @@ relock:
2119 * direct write may have instantiated a few 2118 * direct write may have instantiated a few
2120 * blocks outside i_size. Trim these off again. 2119 * blocks outside i_size. Trim these off again.
2121 * Don't need i_size_read because we hold i_mutex. 2120 * Don't need i_size_read because we hold i_mutex.
2121 *
2122 * XXX(hch): this looks buggy because ocfs2 did not
2123 * actually implement ->truncate. Take a look at
2124 * the new truncate sequence and update this accordingly
2122 */ 2125 */
2123 if (*ppos + count > inode->i_size) 2126 if (*ppos + count > inode->i_size)
2124 vmtruncate(inode, inode->i_size); 2127 simple_setsize(inode, inode->i_size);
2125 ret = written; 2128 ret = written;
2126 goto out_dio; 2129 goto out_dio;
2127 } 2130 }
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2c26ce251cb3..0eaa929a4dbf 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -879,13 +879,15 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
879 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 879 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
880 continue; 880 continue;
881 if (unsuspend) 881 if (unsuspend)
882 status = vfs_quota_enable( 882 status = dquot_resume(sb, type);
883 sb_dqopt(sb)->files[type], 883 else {
884 type, QFMT_OCFS2, 884 struct ocfs2_mem_dqinfo *oinfo;
885 DQUOT_SUSPENDED); 885
886 else 886 /* Cancel periodic syncing before suspending */
887 status = vfs_quota_disable(sb, type, 887 oinfo = sb_dqinfo(sb, type)->dqi_priv;
888 DQUOT_SUSPENDED); 888 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
889 status = dquot_suspend(sb, type);
890 }
889 if (status < 0) 891 if (status < 0)
890 break; 892 break;
891 } 893 }
@@ -916,8 +918,8 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb)
916 status = -ENOENT; 918 status = -ENOENT;
917 goto out_quota_off; 919 goto out_quota_off;
918 } 920 }
919 status = vfs_quota_enable(inode[type], type, QFMT_OCFS2, 921 status = dquot_enable(inode[type], type, QFMT_OCFS2,
920 DQUOT_USAGE_ENABLED); 922 DQUOT_USAGE_ENABLED);
921 if (status < 0) 923 if (status < 0)
922 goto out_quota_off; 924 goto out_quota_off;
923 } 925 }
@@ -952,8 +954,8 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
952 /* Turn off quotas. This will remove all dquot structures from 954 /* Turn off quotas. This will remove all dquot structures from
953 * memory and so they will be automatically synced to global 955 * memory and so they will be automatically synced to global
954 * quota files */ 956 * quota files */
955 vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED | 957 dquot_disable(sb, type, DQUOT_USAGE_ENABLED |
956 DQUOT_LIMITS_ENABLED); 958 DQUOT_LIMITS_ENABLED);
957 if (!inode) 959 if (!inode)
958 continue; 960 continue;
959 iput(inode); 961 iput(inode);
@@ -962,7 +964,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
962 964
963/* Handle quota on quotactl */ 965/* Handle quota on quotactl */
964static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, 966static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
965 char *path, int remount) 967 char *path)
966{ 968{
967 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 969 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
968 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 970 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -970,30 +972,24 @@ static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
970 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 972 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
971 return -EINVAL; 973 return -EINVAL;
972 974
973 if (remount) 975 return dquot_enable(sb_dqopt(sb)->files[type], type,
974 return 0; /* Just ignore it has been handled in 976 format_id, DQUOT_LIMITS_ENABLED);
975 * ocfs2_remount() */
976 return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
977 format_id, DQUOT_LIMITS_ENABLED);
978} 977}
979 978
980/* Handle quota off quotactl */ 979/* Handle quota off quotactl */
981static int ocfs2_quota_off(struct super_block *sb, int type, int remount) 980static int ocfs2_quota_off(struct super_block *sb, int type)
982{ 981{
983 if (remount) 982 return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
984 return 0; /* Ignore now and handle later in
985 * ocfs2_remount() */
986 return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
987} 983}
988 984
989static const struct quotactl_ops ocfs2_quotactl_ops = { 985static const struct quotactl_ops ocfs2_quotactl_ops = {
990 .quota_on = ocfs2_quota_on, 986 .quota_on = ocfs2_quota_on,
991 .quota_off = ocfs2_quota_off, 987 .quota_off = ocfs2_quota_off,
992 .quota_sync = vfs_quota_sync, 988 .quota_sync = dquot_quota_sync,
993 .get_info = vfs_get_dqinfo, 989 .get_info = dquot_get_dqinfo,
994 .set_info = vfs_set_dqinfo, 990 .set_info = dquot_set_dqinfo,
995 .get_dqblk = vfs_get_dqblk, 991 .get_dqblk = dquot_get_dqblk,
996 .set_dqblk = vfs_set_dqblk, 992 .set_dqblk = dquot_set_dqblk,
997}; 993};
998 994
999static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) 995static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 399487c09364..6e7a3291bbe8 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -329,7 +329,7 @@ const struct file_operations omfs_file_operations = {
329 .aio_read = generic_file_aio_read, 329 .aio_read = generic_file_aio_read,
330 .aio_write = generic_file_aio_write, 330 .aio_write = generic_file_aio_write,
331 .mmap = generic_file_mmap, 331 .mmap = generic_file_mmap,
332 .fsync = simple_fsync, 332 .fsync = generic_file_fsync,
333 .splice_read = generic_file_splice_read, 333 .splice_read = generic_file_splice_read,
334}; 334};
335 335
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 3ceca05b668c..648c9d8f3357 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -26,6 +26,7 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/stringify.h> 28#include <linux/stringify.h>
29#include <linux/kernel.h>
29#include "ldm.h" 30#include "ldm.h"
30#include "check.h" 31#include "check.h"
31#include "msdos.h" 32#include "msdos.h"
@@ -77,17 +78,16 @@ static int ldm_parse_hexbyte (const u8 *src)
77 int h; 78 int h;
78 79
79 /* high part */ 80 /* high part */
80 if ((x = src[0] - '0') <= '9'-'0') h = x; 81 x = h = hex_to_bin(src[0]);
81 else if ((x = src[0] - 'a') <= 'f'-'a') h = x+10; 82 if (h < 0)
82 else if ((x = src[0] - 'A') <= 'F'-'A') h = x+10; 83 return -1;
83 else return -1;
84 h <<= 4;
85 84
86 /* low part */ 85 /* low part */
87 if ((x = src[1] - '0') <= '9'-'0') return h | x; 86 h = hex_to_bin(src[1]);
88 if ((x = src[1] - 'a') <= 'f'-'a') return h | (x+10); 87 if (h < 0)
89 if ((x = src[1] - 'A') <= 'F'-'A') return h | (x+10); 88 return -1;
90 return -1; 89
90 return (x << 4) + h;
91} 91}
92 92
93/** 93/**
diff --git a/fs/pipe.c b/fs/pipe.c
index bdd3f96054b9..541d6626f9d9 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -230,6 +230,7 @@ void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
230 230
231 return kmap(buf->page); 231 return kmap(buf->page);
232} 232}
233EXPORT_SYMBOL(generic_pipe_buf_map);
233 234
234/** 235/**
235 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer 236 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
@@ -249,6 +250,7 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
249 } else 250 } else
250 kunmap(buf->page); 251 kunmap(buf->page);
251} 252}
253EXPORT_SYMBOL(generic_pipe_buf_unmap);
252 254
253/** 255/**
254 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer 256 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
@@ -279,6 +281,7 @@ int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
279 281
280 return 1; 282 return 1;
281} 283}
284EXPORT_SYMBOL(generic_pipe_buf_steal);
282 285
283/** 286/**
284 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer 287 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
@@ -294,6 +297,7 @@ void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
294{ 297{
295 page_cache_get(buf->page); 298 page_cache_get(buf->page);
296} 299}
300EXPORT_SYMBOL(generic_pipe_buf_get);
297 301
298/** 302/**
299 * generic_pipe_buf_confirm - verify contents of the pipe buffer 303 * generic_pipe_buf_confirm - verify contents of the pipe buffer
@@ -309,6 +313,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info,
309{ 313{
310 return 0; 314 return 0;
311} 315}
316EXPORT_SYMBOL(generic_pipe_buf_confirm);
312 317
313/** 318/**
314 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer 319 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
@@ -323,6 +328,7 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe,
323{ 328{
324 page_cache_release(buf->page); 329 page_cache_release(buf->page);
325} 330}
331EXPORT_SYMBOL(generic_pipe_buf_release);
326 332
327static const struct pipe_buf_operations anon_pipe_buf_ops = { 333static const struct pipe_buf_operations anon_pipe_buf_ops = {
328 .can_merge = 1, 334 .can_merge = 1,
@@ -1172,16 +1178,20 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
1172 nr_pages = (arg + PAGE_SIZE - 1) >> PAGE_SHIFT; 1178 nr_pages = (arg + PAGE_SIZE - 1) >> PAGE_SHIFT;
1173 nr_pages = roundup_pow_of_two(nr_pages); 1179 nr_pages = roundup_pow_of_two(nr_pages);
1174 1180
1175 if (!capable(CAP_SYS_ADMIN) && nr_pages > pipe_max_pages) 1181 if (!capable(CAP_SYS_ADMIN) && nr_pages > pipe_max_pages) {
1176 return -EPERM; 1182 ret = -EPERM;
1183 goto out;
1184 }
1177 1185
1178 /* 1186 /*
1179 * The pipe needs to be at least 2 pages large to 1187 * The pipe needs to be at least 2 pages large to
1180 * guarantee POSIX behaviour. 1188 * guarantee POSIX behaviour.
1181 */ 1189 */
1182 if (nr_pages < 2) 1190 if (arg < 2) {
1183 return -EINVAL; 1191 ret = -EINVAL;
1184 ret = pipe_set_size(pipe, nr_pages); 1192 goto out;
1193 }
1194 ret = pipe_set_size(pipe, arg);
1185 break; 1195 break;
1186 } 1196 }
1187 case F_GETPIPE_SZ: 1197 case F_GETPIPE_SZ:
@@ -1192,6 +1202,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
1192 break; 1202 break;
1193 } 1203 }
1194 1204
1205out:
1195 mutex_unlock(&pipe->inode->i_mutex); 1206 mutex_unlock(&pipe->inode->i_mutex);
1196 return ret; 1207 return ret;
1197} 1208}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 885ab5513ac5..9b58d38bc911 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -267,7 +267,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
267 shpending = p->signal->shared_pending.signal; 267 shpending = p->signal->shared_pending.signal;
268 blocked = p->blocked; 268 blocked = p->blocked;
269 collect_sigign_sigcatch(p, &ignored, &caught); 269 collect_sigign_sigcatch(p, &ignored, &caught);
270 num_threads = atomic_read(&p->signal->count); 270 num_threads = get_nr_threads(p);
271 rcu_read_lock(); /* FIXME: is this correct? */ 271 rcu_read_lock(); /* FIXME: is this correct? */
272 qsize = atomic_read(&__task_cred(p)->user->sigpending); 272 qsize = atomic_read(&__task_cred(p)->user->sigpending);
273 rcu_read_unlock(); 273 rcu_read_unlock();
@@ -410,7 +410,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
410 tty_nr = new_encode_dev(tty_devnum(sig->tty)); 410 tty_nr = new_encode_dev(tty_devnum(sig->tty));
411 } 411 }
412 412
413 num_threads = atomic_read(&sig->count); 413 num_threads = get_nr_threads(task);
414 collect_sigign_sigcatch(task, &sigign, &sigcatch); 414 collect_sigign_sigcatch(task, &sigign, &sigcatch);
415 415
416 cmin_flt = sig->cmin_flt; 416 cmin_flt = sig->cmin_flt;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c7f9f23449dc..acb7ef80ea4f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -166,18 +166,6 @@ static int get_fs_path(struct task_struct *task, struct path *path, bool root)
166 return result; 166 return result;
167} 167}
168 168
169static int get_nr_threads(struct task_struct *tsk)
170{
171 unsigned long flags;
172 int count = 0;
173
174 if (lock_task_sighand(tsk, &flags)) {
175 count = atomic_read(&tsk->signal->count);
176 unlock_task_sighand(tsk, &flags);
177 }
178 return count;
179}
180
181static int proc_cwd_link(struct inode *inode, struct path *path) 169static int proc_cwd_link(struct inode *inode, struct path *path)
182{ 170{
183 struct task_struct *task = get_proc_task(inode); 171 struct task_struct *task = get_proc_task(inode);
@@ -2444,7 +2432,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2444 const struct pid_entry *p = ptr; 2432 const struct pid_entry *p = ptr;
2445 struct inode *inode; 2433 struct inode *inode;
2446 struct proc_inode *ei; 2434 struct proc_inode *ei;
2447 struct dentry *error = ERR_PTR(-EINVAL); 2435 struct dentry *error;
2448 2436
2449 /* Allocate the inode */ 2437 /* Allocate the inode */
2450 error = ERR_PTR(-ENOMEM); 2438 error = ERR_PTR(-ENOMEM);
@@ -2794,7 +2782,7 @@ out:
2794 2782
2795struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 2783struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
2796{ 2784{
2797 struct dentry *result = ERR_PTR(-ENOENT); 2785 struct dentry *result;
2798 struct task_struct *task; 2786 struct task_struct *task;
2799 unsigned tgid; 2787 unsigned tgid;
2800 struct pid_namespace *ns; 2788 struct pid_namespace *ns;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 43c127490606..2791907744ed 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -343,21 +343,6 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
343/* 343/*
344 * Return an inode number between PROC_DYNAMIC_FIRST and 344 * Return an inode number between PROC_DYNAMIC_FIRST and
345 * 0xffffffff, or zero on failure. 345 * 0xffffffff, or zero on failure.
346 *
347 * Current inode allocations in the proc-fs (hex-numbers):
348 *
349 * 00000000 reserved
350 * 00000001-00000fff static entries (goners)
351 * 001 root-ino
352 *
353 * 00001000-00001fff unused
354 * 0001xxxx-7fffxxxx pid-dir entries for pid 1-7fff
355 * 80000000-efffffff unused
356 * f0000000-ffffffff dynamic entries
357 *
358 * Goal:
359 * Once we split the thing into several virtual filesystems,
360 * we will get rid of magical ranges (and this comment, BTW).
361 */ 346 */
362static unsigned int get_inode_number(void) 347static unsigned int get_inode_number(void)
363{ 348{
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index c837a77351be..6f37c391468d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -588,7 +588,7 @@ static struct kcore_list kcore_text;
588 */ 588 */
589static void __init proc_kcore_text_init(void) 589static void __init proc_kcore_text_init(void)
590{ 590{
591 kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT); 591 kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
592} 592}
593#else 593#else
594static void __init proc_kcore_text_init(void) 594static void __init proc_kcore_text_init(void)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 757c069f2a65..4258384ed22d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -110,7 +110,6 @@ void __init proc_root_init(void)
110 if (err) 110 if (err)
111 return; 111 return;
112 proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); 112 proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
113 err = PTR_ERR(proc_mnt);
114 if (IS_ERR(proc_mnt)) { 113 if (IS_ERR(proc_mnt)) {
115 unregister_filesystem(&proc_fs_type); 114 unregister_filesystem(&proc_fs_type);
116 return; 115 return;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 47f5b145f56e..aea1d3f1ffb5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -634,6 +634,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
634 return err; 634 return err;
635} 635}
636 636
637#ifdef CONFIG_HUGETLB_PAGE
637static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) 638static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
638{ 639{
639 u64 pme = 0; 640 u64 pme = 0;
@@ -664,6 +665,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
664 665
665 return err; 666 return err;
666} 667}
668#endif /* HUGETLB_PAGE */
667 669
668/* 670/*
669 * /proc/pid/pagemap - an array mapping virtual pages to pfns 671 * /proc/pid/pagemap - an array mapping virtual pages to pfns
@@ -733,7 +735,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
733 735
734 pagemap_walk.pmd_entry = pagemap_pte_range; 736 pagemap_walk.pmd_entry = pagemap_pte_range;
735 pagemap_walk.pte_hole = pagemap_pte_hole; 737 pagemap_walk.pte_hole = pagemap_pte_hole;
738#ifdef CONFIG_HUGETLB_PAGE
736 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; 739 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
740#endif
737 pagemap_walk.mm = mm; 741 pagemap_walk.mm = mm;
738 pagemap_walk.private = &pm; 742 pagemap_walk.private = &pm;
739 743
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6f30c3d5bcbf..6e8fc62b40a8 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -77,9 +77,10 @@ out:
77 77
78const struct file_operations qnx4_dir_operations = 78const struct file_operations qnx4_dir_operations =
79{ 79{
80 .llseek = generic_file_llseek,
80 .read = generic_read_dir, 81 .read = generic_read_dir,
81 .readdir = qnx4_readdir, 82 .readdir = qnx4_readdir,
82 .fsync = simple_fsync, 83 .fsync = generic_file_fsync,
83}; 84};
84 85
85const struct inode_operations qnx4_dir_inode_operations = 86const struct inode_operations qnx4_dir_inode_operations =
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 655a4c52b8c3..12c233da1b6b 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -228,10 +228,6 @@ static struct hlist_head *dquot_hash;
228 228
229struct dqstats dqstats; 229struct dqstats dqstats;
230EXPORT_SYMBOL(dqstats); 230EXPORT_SYMBOL(dqstats);
231#ifdef CONFIG_SMP
232struct dqstats *dqstats_pcpu;
233EXPORT_SYMBOL(dqstats_pcpu);
234#endif
235 231
236static qsize_t inode_get_rsv_space(struct inode *inode); 232static qsize_t inode_get_rsv_space(struct inode *inode);
237static void __dquot_initialize(struct inode *inode, int type); 233static void __dquot_initialize(struct inode *inode, int type);
@@ -584,7 +580,7 @@ out:
584} 580}
585EXPORT_SYMBOL(dquot_scan_active); 581EXPORT_SYMBOL(dquot_scan_active);
586 582
587int vfs_quota_sync(struct super_block *sb, int type, int wait) 583int dquot_quota_sync(struct super_block *sb, int type, int wait)
588{ 584{
589 struct list_head *dirty; 585 struct list_head *dirty;
590 struct dquot *dquot; 586 struct dquot *dquot;
@@ -656,7 +652,7 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
656 652
657 return 0; 653 return 0;
658} 654}
659EXPORT_SYMBOL(vfs_quota_sync); 655EXPORT_SYMBOL(dquot_quota_sync);
660 656
661/* Free unused dquots from cache */ 657/* Free unused dquots from cache */
662static void prune_dqcache(int count) 658static void prune_dqcache(int count)
@@ -676,27 +672,10 @@ static void prune_dqcache(int count)
676 } 672 }
677} 673}
678 674
679static int dqstats_read(unsigned int type)
680{
681 int count = 0;
682#ifdef CONFIG_SMP
683 int cpu;
684 for_each_possible_cpu(cpu)
685 count += per_cpu_ptr(dqstats_pcpu, cpu)->stat[type];
686 /* Statistics reading is racy, but absolute accuracy isn't required */
687 if (count < 0)
688 count = 0;
689#else
690 count = dqstats.stat[type];
691#endif
692 return count;
693}
694
695/* 675/*
696 * This is called from kswapd when we think we need some 676 * This is called from kswapd when we think we need some
697 * more memory 677 * more memory
698 */ 678 */
699
700static int shrink_dqcache_memory(int nr, gfp_t gfp_mask) 679static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
701{ 680{
702 if (nr) { 681 if (nr) {
@@ -704,7 +683,9 @@ static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
704 prune_dqcache(nr); 683 prune_dqcache(nr);
705 spin_unlock(&dq_list_lock); 684 spin_unlock(&dq_list_lock);
706 } 685 }
707 return (dqstats_read(DQST_FREE_DQUOTS)/100) * sysctl_vfs_cache_pressure; 686 return ((unsigned)
687 percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])
688 /100) * sysctl_vfs_cache_pressure;
708} 689}
709 690
710static struct shrinker dqcache_shrinker = { 691static struct shrinker dqcache_shrinker = {
@@ -1514,11 +1495,13 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
1514/* 1495/*
1515 * This operation can block, but only after everything is updated 1496 * This operation can block, but only after everything is updated
1516 */ 1497 */
1517int __dquot_alloc_space(struct inode *inode, qsize_t number, 1498int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
1518 int warn, int reserve)
1519{ 1499{
1520 int cnt, ret = 0; 1500 int cnt, ret = 0;
1521 char warntype[MAXQUOTAS]; 1501 char warntype[MAXQUOTAS];
1502 int warn = flags & DQUOT_SPACE_WARN;
1503 int reserve = flags & DQUOT_SPACE_RESERVE;
1504 int nofail = flags & DQUOT_SPACE_NOFAIL;
1522 1505
1523 /* 1506 /*
1524 * First test before acquiring mutex - solves deadlocks when we 1507 * First test before acquiring mutex - solves deadlocks when we
@@ -1539,7 +1522,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
1539 continue; 1522 continue;
1540 ret = check_bdq(inode->i_dquot[cnt], number, !warn, 1523 ret = check_bdq(inode->i_dquot[cnt], number, !warn,
1541 warntype+cnt); 1524 warntype+cnt);
1542 if (ret) { 1525 if (ret && !nofail) {
1543 spin_unlock(&dq_data_lock); 1526 spin_unlock(&dq_data_lock);
1544 goto out_flush_warn; 1527 goto out_flush_warn;
1545 } 1528 }
@@ -1638,10 +1621,11 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
1638/* 1621/*
1639 * This operation can block, but only after everything is updated 1622 * This operation can block, but only after everything is updated
1640 */ 1623 */
1641void __dquot_free_space(struct inode *inode, qsize_t number, int reserve) 1624void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
1642{ 1625{
1643 unsigned int cnt; 1626 unsigned int cnt;
1644 char warntype[MAXQUOTAS]; 1627 char warntype[MAXQUOTAS];
1628 int reserve = flags & DQUOT_SPACE_RESERVE;
1645 1629
1646 /* First test before acquiring mutex - solves deadlocks when we 1630 /* First test before acquiring mutex - solves deadlocks when we
1647 * re-enter the quota code and are already holding the mutex */ 1631 * re-enter the quota code and are already holding the mutex */
@@ -1812,7 +1796,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1812 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) 1796 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid)
1813 transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA); 1797 transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA);
1814 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) 1798 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)
1815 transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_uid, GRPQUOTA); 1799 transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_gid, GRPQUOTA);
1816 1800
1817 ret = __dquot_transfer(inode, transfer_to); 1801 ret = __dquot_transfer(inode, transfer_to);
1818 dqput_all(transfer_to); 1802 dqput_all(transfer_to);
@@ -1847,6 +1831,7 @@ const struct dquot_operations dquot_operations = {
1847 .alloc_dquot = dquot_alloc, 1831 .alloc_dquot = dquot_alloc,
1848 .destroy_dquot = dquot_destroy, 1832 .destroy_dquot = dquot_destroy,
1849}; 1833};
1834EXPORT_SYMBOL(dquot_operations);
1850 1835
1851/* 1836/*
1852 * Generic helper for ->open on filesystems supporting disk quotas. 1837 * Generic helper for ->open on filesystems supporting disk quotas.
@@ -1865,7 +1850,7 @@ EXPORT_SYMBOL(dquot_file_open);
1865/* 1850/*
1866 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) 1851 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
1867 */ 1852 */
1868int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags) 1853int dquot_disable(struct super_block *sb, int type, unsigned int flags)
1869{ 1854{
1870 int cnt, ret = 0; 1855 int cnt, ret = 0;
1871 struct quota_info *dqopt = sb_dqopt(sb); 1856 struct quota_info *dqopt = sb_dqopt(sb);
@@ -1995,14 +1980,15 @@ put_inodes:
1995 } 1980 }
1996 return ret; 1981 return ret;
1997} 1982}
1998EXPORT_SYMBOL(vfs_quota_disable); 1983EXPORT_SYMBOL(dquot_disable);
1999 1984
2000int vfs_quota_off(struct super_block *sb, int type, int remount) 1985int dquot_quota_off(struct super_block *sb, int type)
2001{ 1986{
2002 return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED : 1987 return dquot_disable(sb, type,
2003 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED)); 1988 DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
2004} 1989}
2005EXPORT_SYMBOL(vfs_quota_off); 1990EXPORT_SYMBOL(dquot_quota_off);
1991
2006/* 1992/*
2007 * Turn quotas on on a device 1993 * Turn quotas on on a device
2008 */ 1994 */
@@ -2120,36 +2106,43 @@ out_fmt:
2120} 2106}
2121 2107
2122/* Reenable quotas on remount RW */ 2108/* Reenable quotas on remount RW */
2123static int vfs_quota_on_remount(struct super_block *sb, int type) 2109int dquot_resume(struct super_block *sb, int type)
2124{ 2110{
2125 struct quota_info *dqopt = sb_dqopt(sb); 2111 struct quota_info *dqopt = sb_dqopt(sb);
2126 struct inode *inode; 2112 struct inode *inode;
2127 int ret; 2113 int ret = 0, cnt;
2128 unsigned int flags; 2114 unsigned int flags;
2129 2115
2130 mutex_lock(&dqopt->dqonoff_mutex); 2116 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
2131 if (!sb_has_quota_suspended(sb, type)) { 2117 if (type != -1 && cnt != type)
2118 continue;
2119
2120 mutex_lock(&dqopt->dqonoff_mutex);
2121 if (!sb_has_quota_suspended(sb, cnt)) {
2122 mutex_unlock(&dqopt->dqonoff_mutex);
2123 continue;
2124 }
2125 inode = dqopt->files[cnt];
2126 dqopt->files[cnt] = NULL;
2127 spin_lock(&dq_state_lock);
2128 flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
2129 DQUOT_LIMITS_ENABLED,
2130 cnt);
2131 dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt);
2132 spin_unlock(&dq_state_lock);
2132 mutex_unlock(&dqopt->dqonoff_mutex); 2133 mutex_unlock(&dqopt->dqonoff_mutex);
2133 return 0;
2134 }
2135 inode = dqopt->files[type];
2136 dqopt->files[type] = NULL;
2137 spin_lock(&dq_state_lock);
2138 flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
2139 DQUOT_LIMITS_ENABLED, type);
2140 dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
2141 spin_unlock(&dq_state_lock);
2142 mutex_unlock(&dqopt->dqonoff_mutex);
2143 2134
2144 flags = dquot_generic_flag(flags, type); 2135 flags = dquot_generic_flag(flags, cnt);
2145 ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id, 2136 ret = vfs_load_quota_inode(inode, cnt,
2146 flags); 2137 dqopt->info[cnt].dqi_fmt_id, flags);
2147 iput(inode); 2138 iput(inode);
2139 }
2148 2140
2149 return ret; 2141 return ret;
2150} 2142}
2143EXPORT_SYMBOL(dquot_resume);
2151 2144
2152int vfs_quota_on_path(struct super_block *sb, int type, int format_id, 2145int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
2153 struct path *path) 2146 struct path *path)
2154{ 2147{
2155 int error = security_quota_on(path->dentry); 2148 int error = security_quota_on(path->dentry);
@@ -2164,40 +2157,36 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
2164 DQUOT_LIMITS_ENABLED); 2157 DQUOT_LIMITS_ENABLED);
2165 return error; 2158 return error;
2166} 2159}
2167EXPORT_SYMBOL(vfs_quota_on_path); 2160EXPORT_SYMBOL(dquot_quota_on_path);
2168 2161
2169int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, 2162int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name)
2170 int remount)
2171{ 2163{
2172 struct path path; 2164 struct path path;
2173 int error; 2165 int error;
2174 2166
2175 if (remount)
2176 return vfs_quota_on_remount(sb, type);
2177
2178 error = kern_path(name, LOOKUP_FOLLOW, &path); 2167 error = kern_path(name, LOOKUP_FOLLOW, &path);
2179 if (!error) { 2168 if (!error) {
2180 error = vfs_quota_on_path(sb, type, format_id, &path); 2169 error = dquot_quota_on_path(sb, type, format_id, &path);
2181 path_put(&path); 2170 path_put(&path);
2182 } 2171 }
2183 return error; 2172 return error;
2184} 2173}
2185EXPORT_SYMBOL(vfs_quota_on); 2174EXPORT_SYMBOL(dquot_quota_on);
2186 2175
2187/* 2176/*
2188 * More powerful function for turning on quotas allowing setting 2177 * More powerful function for turning on quotas allowing setting
2189 * of individual quota flags 2178 * of individual quota flags
2190 */ 2179 */
2191int vfs_quota_enable(struct inode *inode, int type, int format_id, 2180int dquot_enable(struct inode *inode, int type, int format_id,
2192 unsigned int flags) 2181 unsigned int flags)
2193{ 2182{
2194 int ret = 0; 2183 int ret = 0;
2195 struct super_block *sb = inode->i_sb; 2184 struct super_block *sb = inode->i_sb;
2196 struct quota_info *dqopt = sb_dqopt(sb); 2185 struct quota_info *dqopt = sb_dqopt(sb);
2197 2186
2198 /* Just unsuspend quotas? */ 2187 /* Just unsuspend quotas? */
2199 if (flags & DQUOT_SUSPENDED) 2188 BUG_ON(flags & DQUOT_SUSPENDED);
2200 return vfs_quota_on_remount(sb, type); 2189
2201 if (!flags) 2190 if (!flags)
2202 return 0; 2191 return 0;
2203 /* Just updating flags needed? */ 2192 /* Just updating flags needed? */
@@ -2229,13 +2218,13 @@ out_lock:
2229load_quota: 2218load_quota:
2230 return vfs_load_quota_inode(inode, type, format_id, flags); 2219 return vfs_load_quota_inode(inode, type, format_id, flags);
2231} 2220}
2232EXPORT_SYMBOL(vfs_quota_enable); 2221EXPORT_SYMBOL(dquot_enable);
2233 2222
2234/* 2223/*
2235 * This function is used when filesystem needs to initialize quotas 2224 * This function is used when filesystem needs to initialize quotas
2236 * during mount time. 2225 * during mount time.
2237 */ 2226 */
2238int vfs_quota_on_mount(struct super_block *sb, char *qf_name, 2227int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
2239 int format_id, int type) 2228 int format_id, int type)
2240{ 2229{
2241 struct dentry *dentry; 2230 struct dentry *dentry;
@@ -2261,24 +2250,7 @@ out:
2261 dput(dentry); 2250 dput(dentry);
2262 return error; 2251 return error;
2263} 2252}
2264EXPORT_SYMBOL(vfs_quota_on_mount); 2253EXPORT_SYMBOL(dquot_quota_on_mount);
2265
2266/* Wrapper to turn on quotas when remounting rw */
2267int vfs_dq_quota_on_remount(struct super_block *sb)
2268{
2269 int cnt;
2270 int ret = 0, err;
2271
2272 if (!sb->s_qcop || !sb->s_qcop->quota_on)
2273 return -ENOSYS;
2274 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
2275 err = sb->s_qcop->quota_on(sb, cnt, 0, NULL, 1);
2276 if (err < 0 && !ret)
2277 ret = err;
2278 }
2279 return ret;
2280}
2281EXPORT_SYMBOL(vfs_dq_quota_on_remount);
2282 2254
2283static inline qsize_t qbtos(qsize_t blocks) 2255static inline qsize_t qbtos(qsize_t blocks)
2284{ 2256{
@@ -2313,8 +2285,8 @@ static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2313 spin_unlock(&dq_data_lock); 2285 spin_unlock(&dq_data_lock);
2314} 2286}
2315 2287
2316int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, 2288int dquot_get_dqblk(struct super_block *sb, int type, qid_t id,
2317 struct fs_disk_quota *di) 2289 struct fs_disk_quota *di)
2318{ 2290{
2319 struct dquot *dquot; 2291 struct dquot *dquot;
2320 2292
@@ -2326,7 +2298,7 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
2326 2298
2327 return 0; 2299 return 0;
2328} 2300}
2329EXPORT_SYMBOL(vfs_get_dqblk); 2301EXPORT_SYMBOL(dquot_get_dqblk);
2330 2302
2331#define VFS_FS_DQ_MASK \ 2303#define VFS_FS_DQ_MASK \
2332 (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \ 2304 (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \
@@ -2425,7 +2397,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2425 return 0; 2397 return 0;
2426} 2398}
2427 2399
2428int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, 2400int dquot_set_dqblk(struct super_block *sb, int type, qid_t id,
2429 struct fs_disk_quota *di) 2401 struct fs_disk_quota *di)
2430{ 2402{
2431 struct dquot *dquot; 2403 struct dquot *dquot;
@@ -2441,10 +2413,10 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
2441out: 2413out:
2442 return rc; 2414 return rc;
2443} 2415}
2444EXPORT_SYMBOL(vfs_set_dqblk); 2416EXPORT_SYMBOL(dquot_set_dqblk);
2445 2417
2446/* Generic routine for getting common part of quota file information */ 2418/* Generic routine for getting common part of quota file information */
2447int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) 2419int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2448{ 2420{
2449 struct mem_dqinfo *mi; 2421 struct mem_dqinfo *mi;
2450 2422
@@ -2463,10 +2435,10 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2463 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2435 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
2464 return 0; 2436 return 0;
2465} 2437}
2466EXPORT_SYMBOL(vfs_get_dqinfo); 2438EXPORT_SYMBOL(dquot_get_dqinfo);
2467 2439
2468/* Generic routine for setting common part of quota file information */ 2440/* Generic routine for setting common part of quota file information */
2469int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) 2441int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2470{ 2442{
2471 struct mem_dqinfo *mi; 2443 struct mem_dqinfo *mi;
2472 int err = 0; 2444 int err = 0;
@@ -2493,27 +2465,27 @@ out:
2493 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2465 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
2494 return err; 2466 return err;
2495} 2467}
2496EXPORT_SYMBOL(vfs_set_dqinfo); 2468EXPORT_SYMBOL(dquot_set_dqinfo);
2497 2469
2498const struct quotactl_ops vfs_quotactl_ops = { 2470const struct quotactl_ops dquot_quotactl_ops = {
2499 .quota_on = vfs_quota_on, 2471 .quota_on = dquot_quota_on,
2500 .quota_off = vfs_quota_off, 2472 .quota_off = dquot_quota_off,
2501 .quota_sync = vfs_quota_sync, 2473 .quota_sync = dquot_quota_sync,
2502 .get_info = vfs_get_dqinfo, 2474 .get_info = dquot_get_dqinfo,
2503 .set_info = vfs_set_dqinfo, 2475 .set_info = dquot_set_dqinfo,
2504 .get_dqblk = vfs_get_dqblk, 2476 .get_dqblk = dquot_get_dqblk,
2505 .set_dqblk = vfs_set_dqblk 2477 .set_dqblk = dquot_set_dqblk
2506}; 2478};
2507 2479EXPORT_SYMBOL(dquot_quotactl_ops);
2508 2480
2509static int do_proc_dqstats(struct ctl_table *table, int write, 2481static int do_proc_dqstats(struct ctl_table *table, int write,
2510 void __user *buffer, size_t *lenp, loff_t *ppos) 2482 void __user *buffer, size_t *lenp, loff_t *ppos)
2511{ 2483{
2512#ifdef CONFIG_SMP
2513 /* Update global table */
2514 unsigned int type = (int *)table->data - dqstats.stat; 2484 unsigned int type = (int *)table->data - dqstats.stat;
2515 dqstats.stat[type] = dqstats_read(type); 2485
2516#endif 2486 /* Update global table */
2487 dqstats.stat[type] =
2488 percpu_counter_sum_positive(&dqstats.counter[type]);
2517 return proc_dointvec(table, write, buffer, lenp, ppos); 2489 return proc_dointvec(table, write, buffer, lenp, ppos);
2518} 2490}
2519 2491
@@ -2606,7 +2578,7 @@ static ctl_table sys_table[] = {
2606 2578
2607static int __init dquot_init(void) 2579static int __init dquot_init(void)
2608{ 2580{
2609 int i; 2581 int i, ret;
2610 unsigned long nr_hash, order; 2582 unsigned long nr_hash, order;
2611 2583
2612 printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); 2584 printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);
@@ -2624,12 +2596,11 @@ static int __init dquot_init(void)
2624 if (!dquot_hash) 2596 if (!dquot_hash)
2625 panic("Cannot create dquot hash table"); 2597 panic("Cannot create dquot hash table");
2626 2598
2627#ifdef CONFIG_SMP 2599 for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
2628 dqstats_pcpu = alloc_percpu(struct dqstats); 2600 ret = percpu_counter_init(&dqstats.counter[i], 0);
2629 if (!dqstats_pcpu) 2601 if (ret)
2630 panic("Cannot create dquot stats table"); 2602 panic("Cannot create dquot stat counters");
2631#endif 2603 }
2632 memset(&dqstats, 0, sizeof(struct dqstats));
2633 2604
2634 /* Find power-of-two hlist_heads which can fit into allocation */ 2605 /* Find power-of-two hlist_heads which can fit into allocation */
2635 nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); 2606 nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index ce3dfd066f59..b299961e1edb 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -73,7 +73,7 @@ static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
73 if (IS_ERR(pathname)) 73 if (IS_ERR(pathname))
74 return PTR_ERR(pathname); 74 return PTR_ERR(pathname);
75 if (sb->s_qcop->quota_on) 75 if (sb->s_qcop->quota_on)
76 ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); 76 ret = sb->s_qcop->quota_on(sb, type, id, pathname);
77 putname(pathname); 77 putname(pathname);
78 return ret; 78 return ret;
79} 79}
@@ -260,7 +260,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
260 case Q_QUOTAOFF: 260 case Q_QUOTAOFF:
261 if (!sb->s_qcop->quota_off) 261 if (!sb->s_qcop->quota_off)
262 return -ENOSYS; 262 return -ENOSYS;
263 return sb->s_qcop->quota_off(sb, type, 0); 263 return sb->s_qcop->quota_off(sb, type);
264 case Q_GETFMT: 264 case Q_GETFMT:
265 return quota_getfmt(sb, type, addr); 265 return quota_getfmt(sb, type, addr);
266 case Q_GETINFO: 266 case Q_GETINFO:
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 78f613cb9c76..4884ac5ae9be 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -43,12 +43,13 @@ const struct file_operations ramfs_file_operations = {
43 .write = do_sync_write, 43 .write = do_sync_write,
44 .aio_write = generic_file_aio_write, 44 .aio_write = generic_file_aio_write,
45 .mmap = generic_file_mmap, 45 .mmap = generic_file_mmap,
46 .fsync = simple_sync_file, 46 .fsync = noop_fsync,
47 .splice_read = generic_file_splice_read, 47 .splice_read = generic_file_splice_read,
48 .splice_write = generic_file_splice_write, 48 .splice_write = generic_file_splice_write,
49 .llseek = generic_file_llseek, 49 .llseek = generic_file_llseek,
50}; 50};
51 51
52const struct inode_operations ramfs_file_inode_operations = { 52const struct inode_operations ramfs_file_inode_operations = {
53 .setattr = simple_setattr,
53 .getattr = simple_getattr, 54 .getattr = simple_getattr,
54}; 55};
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 5ea4ad81a429..d532c20fc179 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -42,7 +42,7 @@ const struct file_operations ramfs_file_operations = {
42 .aio_read = generic_file_aio_read, 42 .aio_read = generic_file_aio_read,
43 .write = do_sync_write, 43 .write = do_sync_write,
44 .aio_write = generic_file_aio_write, 44 .aio_write = generic_file_aio_write,
45 .fsync = simple_sync_file, 45 .fsync = noop_fsync,
46 .splice_read = generic_file_splice_read, 46 .splice_read = generic_file_splice_read,
47 .splice_write = generic_file_splice_write, 47 .splice_write = generic_file_splice_write,
48 .llseek = generic_file_llseek, 48 .llseek = generic_file_llseek,
@@ -146,7 +146,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
146 return ret; 146 return ret;
147 } 147 }
148 148
149 ret = vmtruncate(inode, newsize); 149 ret = simple_setsize(inode, newsize);
150 150
151 return ret; 151 return ret;
152} 152}
@@ -169,7 +169,8 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
169 169
170 /* pick out size-changing events */ 170 /* pick out size-changing events */
171 if (ia->ia_valid & ATTR_SIZE) { 171 if (ia->ia_valid & ATTR_SIZE) {
172 loff_t size = i_size_read(inode); 172 loff_t size = inode->i_size;
173
173 if (ia->ia_size != size) { 174 if (ia->ia_size != size) {
174 ret = ramfs_nommu_resize(inode, ia->ia_size, size); 175 ret = ramfs_nommu_resize(inode, ia->ia_size, size);
175 if (ret < 0 || ia->ia_valid == ATTR_SIZE) 176 if (ret < 0 || ia->ia_valid == ATTR_SIZE)
@@ -182,7 +183,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
182 } 183 }
183 } 184 }
184 185
185 ret = inode_setattr(inode, ia); 186 generic_setattr(inode, ia);
186 out: 187 out:
187 ia->ia_valid = old_ia_valid; 188 ia->ia_valid = old_ia_valid;
188 return ret; 189 return ret;
diff --git a/fs/read_write.c b/fs/read_write.c
index 113386d6fd2d..9c0485236e68 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -97,6 +97,23 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
97} 97}
98EXPORT_SYMBOL(generic_file_llseek); 98EXPORT_SYMBOL(generic_file_llseek);
99 99
100/**
101 * noop_llseek - No Operation Performed llseek implementation
102 * @file: file structure to seek on
103 * @offset: file offset to seek to
104 * @origin: type of seek
105 *
106 * This is an implementation of ->llseek useable for the rare special case when
107 * userspace expects the seek to succeed but the (device) file is actually not
108 * able to perform the seek. In this case you use noop_llseek() instead of
109 * falling back to the default implementation of ->llseek.
110 */
111loff_t noop_llseek(struct file *file, loff_t offset, int origin)
112{
113 return file->f_pos;
114}
115EXPORT_SYMBOL(noop_llseek);
116
100loff_t no_llseek(struct file *file, loff_t offset, int origin) 117loff_t no_llseek(struct file *file, loff_t offset, int origin)
101{ 118{
102 return -ESPIPE; 119 return -ESPIPE;
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 07930449a958..198dabf1b2bb 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -14,10 +14,10 @@
14extern const struct reiserfs_key MIN_KEY; 14extern const struct reiserfs_key MIN_KEY;
15 15
16static int reiserfs_readdir(struct file *, void *, filldir_t); 16static int reiserfs_readdir(struct file *, void *, filldir_t);
17static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, 17static int reiserfs_dir_fsync(struct file *filp, int datasync);
18 int datasync);
19 18
20const struct file_operations reiserfs_dir_operations = { 19const struct file_operations reiserfs_dir_operations = {
20 .llseek = generic_file_llseek,
21 .read = generic_read_dir, 21 .read = generic_read_dir,
22 .readdir = reiserfs_readdir, 22 .readdir = reiserfs_readdir,
23 .fsync = reiserfs_dir_fsync, 23 .fsync = reiserfs_dir_fsync,
@@ -27,10 +27,9 @@ const struct file_operations reiserfs_dir_operations = {
27#endif 27#endif
28}; 28};
29 29
30static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, 30static int reiserfs_dir_fsync(struct file *filp, int datasync)
31 int datasync)
32{ 31{
33 struct inode *inode = dentry->d_inode; 32 struct inode *inode = filp->f_mapping->host;
34 int err; 33 int err;
35 reiserfs_write_lock(inode->i_sb); 34 reiserfs_write_lock(inode->i_sb);
36 err = reiserfs_commit_for_inode(inode); 35 err = reiserfs_commit_for_inode(inode);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 9977df9f3a54..b82cdd8a45dd 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -134,10 +134,9 @@ static void reiserfs_vfs_truncate_file(struct inode *inode)
134 * be removed... 134 * be removed...
135 */ 135 */
136 136
137static int reiserfs_sync_file(struct file *filp, 137static int reiserfs_sync_file(struct file *filp, int datasync)
138 struct dentry *dentry, int datasync)
139{ 138{
140 struct inode *inode = dentry->d_inode; 139 struct inode *inode = filp->f_mapping->host;
141 int err; 140 int err;
142 int barrier_done; 141 int barrier_done;
143 142
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 59125fb36d42..9822fa15118b 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -158,6 +158,7 @@ static int finish_unfinished(struct super_block *s)
158#ifdef CONFIG_QUOTA 158#ifdef CONFIG_QUOTA
159 int i; 159 int i;
160 int ms_active_set; 160 int ms_active_set;
161 int quota_enabled[MAXQUOTAS];
161#endif 162#endif
162 163
163 /* compose key to look for "save" links */ 164 /* compose key to look for "save" links */
@@ -179,8 +180,15 @@ static int finish_unfinished(struct super_block *s)
179 } 180 }
180 /* Turn on quotas so that they are updated correctly */ 181 /* Turn on quotas so that they are updated correctly */
181 for (i = 0; i < MAXQUOTAS; i++) { 182 for (i = 0; i < MAXQUOTAS; i++) {
183 quota_enabled[i] = 1;
182 if (REISERFS_SB(s)->s_qf_names[i]) { 184 if (REISERFS_SB(s)->s_qf_names[i]) {
183 int ret = reiserfs_quota_on_mount(s, i); 185 int ret;
186
187 if (sb_has_quota_active(s, i)) {
188 quota_enabled[i] = 0;
189 continue;
190 }
191 ret = reiserfs_quota_on_mount(s, i);
184 if (ret < 0) 192 if (ret < 0)
185 reiserfs_warning(s, "reiserfs-2500", 193 reiserfs_warning(s, "reiserfs-2500",
186 "cannot turn on journaled " 194 "cannot turn on journaled "
@@ -304,8 +312,8 @@ static int finish_unfinished(struct super_block *s)
304#ifdef CONFIG_QUOTA 312#ifdef CONFIG_QUOTA
305 /* Turn quotas off */ 313 /* Turn quotas off */
306 for (i = 0; i < MAXQUOTAS; i++) { 314 for (i = 0; i < MAXQUOTAS; i++) {
307 if (sb_dqopt(s)->files[i]) 315 if (sb_dqopt(s)->files[i] && quota_enabled[i])
308 vfs_quota_off(s, i, 0); 316 dquot_quota_off(s, i);
309 } 317 }
310 if (ms_active_set) 318 if (ms_active_set)
311 /* Restore the flag back */ 319 /* Restore the flag back */
@@ -466,6 +474,8 @@ static void reiserfs_put_super(struct super_block *s)
466 struct reiserfs_transaction_handle th; 474 struct reiserfs_transaction_handle th;
467 th.t_trans_id = 0; 475 th.t_trans_id = 0;
468 476
477 dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
478
469 reiserfs_write_lock(s); 479 reiserfs_write_lock(s);
470 480
471 if (s->s_dirt) 481 if (s->s_dirt)
@@ -620,7 +630,7 @@ static int reiserfs_acquire_dquot(struct dquot *);
620static int reiserfs_release_dquot(struct dquot *); 630static int reiserfs_release_dquot(struct dquot *);
621static int reiserfs_mark_dquot_dirty(struct dquot *); 631static int reiserfs_mark_dquot_dirty(struct dquot *);
622static int reiserfs_write_info(struct super_block *, int); 632static int reiserfs_write_info(struct super_block *, int);
623static int reiserfs_quota_on(struct super_block *, int, int, char *, int); 633static int reiserfs_quota_on(struct super_block *, int, int, char *);
624 634
625static const struct dquot_operations reiserfs_quota_operations = { 635static const struct dquot_operations reiserfs_quota_operations = {
626 .write_dquot = reiserfs_write_dquot, 636 .write_dquot = reiserfs_write_dquot,
@@ -634,12 +644,12 @@ static const struct dquot_operations reiserfs_quota_operations = {
634 644
635static const struct quotactl_ops reiserfs_qctl_operations = { 645static const struct quotactl_ops reiserfs_qctl_operations = {
636 .quota_on = reiserfs_quota_on, 646 .quota_on = reiserfs_quota_on,
637 .quota_off = vfs_quota_off, 647 .quota_off = dquot_quota_off,
638 .quota_sync = vfs_quota_sync, 648 .quota_sync = dquot_quota_sync,
639 .get_info = vfs_get_dqinfo, 649 .get_info = dquot_get_dqinfo,
640 .set_info = vfs_set_dqinfo, 650 .set_info = dquot_set_dqinfo,
641 .get_dqblk = vfs_get_dqblk, 651 .get_dqblk = dquot_get_dqblk,
642 .set_dqblk = vfs_set_dqblk, 652 .set_dqblk = dquot_set_dqblk,
643}; 653};
644#endif 654#endif
645 655
@@ -1242,6 +1252,11 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1242 if (s->s_flags & MS_RDONLY) 1252 if (s->s_flags & MS_RDONLY)
1243 /* it is read-only already */ 1253 /* it is read-only already */
1244 goto out_ok; 1254 goto out_ok;
1255
1256 err = dquot_suspend(s, -1);
1257 if (err < 0)
1258 goto out_err;
1259
1245 /* try to remount file system with read-only permissions */ 1260 /* try to remount file system with read-only permissions */
1246 if (sb_umount_state(rs) == REISERFS_VALID_FS 1261 if (sb_umount_state(rs) == REISERFS_VALID_FS
1247 || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) { 1262 || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
@@ -1295,6 +1310,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1295 s->s_dirt = 0; 1310 s->s_dirt = 0;
1296 1311
1297 if (!(*mount_flags & MS_RDONLY)) { 1312 if (!(*mount_flags & MS_RDONLY)) {
1313 dquot_resume(s, -1);
1298 finish_unfinished(s); 1314 finish_unfinished(s);
1299 reiserfs_xattr_init(s, *mount_flags); 1315 reiserfs_xattr_init(s, *mount_flags);
1300 } 1316 }
@@ -2022,15 +2038,15 @@ static int reiserfs_write_info(struct super_block *sb, int type)
2022 */ 2038 */
2023static int reiserfs_quota_on_mount(struct super_block *sb, int type) 2039static int reiserfs_quota_on_mount(struct super_block *sb, int type)
2024{ 2040{
2025 return vfs_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type], 2041 return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
2026 REISERFS_SB(sb)->s_jquota_fmt, type); 2042 REISERFS_SB(sb)->s_jquota_fmt, type);
2027} 2043}
2028 2044
2029/* 2045/*
2030 * Standard function to be called on quota_on 2046 * Standard function to be called on quota_on
2031 */ 2047 */
2032static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, 2048static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2033 char *name, int remount) 2049 char *name)
2034{ 2050{
2035 int err; 2051 int err;
2036 struct path path; 2052 struct path path;
@@ -2039,9 +2055,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2039 2055
2040 if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) 2056 if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
2041 return -EINVAL; 2057 return -EINVAL;
2042 /* No more checks needed? Path and format_id are bogus anyway... */ 2058
2043 if (remount)
2044 return vfs_quota_on(sb, type, format_id, name, 1);
2045 err = kern_path(name, LOOKUP_FOLLOW, &path); 2059 err = kern_path(name, LOOKUP_FOLLOW, &path);
2046 if (err) 2060 if (err)
2047 return err; 2061 return err;
@@ -2085,7 +2099,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2085 if (err) 2099 if (err)
2086 goto out; 2100 goto out;
2087 } 2101 }
2088 err = vfs_quota_on_path(sb, type, format_id, &path); 2102 err = dquot_quota_on_path(sb, type, format_id, &path);
2089out: 2103out:
2090 path_put(&path); 2104 path_put(&path);
2091 return err; 2105 return err;
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 3e4803b4427e..00a70cab1f36 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -37,9 +37,10 @@ static int smb_link(struct dentry *, struct inode *, struct dentry *);
37 37
38const struct file_operations smb_dir_operations = 38const struct file_operations smb_dir_operations =
39{ 39{
40 .llseek = generic_file_llseek,
40 .read = generic_read_dir, 41 .read = generic_read_dir,
41 .readdir = smb_readdir, 42 .readdir = smb_readdir,
42 .ioctl = smb_ioctl, 43 .unlocked_ioctl = smb_ioctl,
43 .open = smb_dir_open, 44 .open = smb_dir_open,
44}; 45};
45 46
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index dbf6548bbf06..8e187a0f94bb 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -28,8 +28,9 @@
28#include "proto.h" 28#include "proto.h"
29 29
30static int 30static int
31smb_fsync(struct file *file, struct dentry * dentry, int datasync) 31smb_fsync(struct file *file, int datasync)
32{ 32{
33 struct dentry *dentry = file->f_path.dentry;
33 struct smb_sb_info *server = server_from_dentry(dentry); 34 struct smb_sb_info *server = server_from_dentry(dentry);
34 int result; 35 int result;
35 36
@@ -437,7 +438,7 @@ const struct file_operations smb_file_operations =
437 .aio_read = smb_file_aio_read, 438 .aio_read = smb_file_aio_read,
438 .write = do_sync_write, 439 .write = do_sync_write,
439 .aio_write = smb_file_aio_write, 440 .aio_write = smb_file_aio_write,
440 .ioctl = smb_ioctl, 441 .unlocked_ioctl = smb_ioctl,
441 .mmap = smb_file_mmap, 442 .mmap = smb_file_mmap,
442 .open = smb_file_open, 443 .open = smb_file_open,
443 .release = smb_file_release, 444 .release = smb_file_release,
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index dfa1d67f8fca..9551cb6f7fe4 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -714,7 +714,7 @@ smb_notify_change(struct dentry *dentry, struct iattr *attr)
714 error = server->ops->truncate(inode, attr->ia_size); 714 error = server->ops->truncate(inode, attr->ia_size);
715 if (error) 715 if (error)
716 goto out; 716 goto out;
717 error = vmtruncate(inode, attr->ia_size); 717 error = simple_setsize(inode, attr->ia_size);
718 if (error) 718 if (error)
719 goto out; 719 goto out;
720 refresh = 1; 720 refresh = 1;
diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c
index dbae1f8ea26f..07215312ad39 100644
--- a/fs/smbfs/ioctl.c
+++ b/fs/smbfs/ioctl.c
@@ -13,6 +13,7 @@
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/smp_lock.h>
16#include <linux/net.h> 17#include <linux/net.h>
17 18
18#include <linux/smb_fs.h> 19#include <linux/smb_fs.h>
@@ -22,14 +23,14 @@
22 23
23#include "proto.h" 24#include "proto.h"
24 25
25int 26long
26smb_ioctl(struct inode *inode, struct file *filp, 27smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
27 unsigned int cmd, unsigned long arg)
28{ 28{
29 struct smb_sb_info *server = server_from_inode(inode); 29 struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
30 struct smb_conn_opt opt; 30 struct smb_conn_opt opt;
31 int result = -EINVAL; 31 int result = -EINVAL;
32 32
33 lock_kernel();
33 switch (cmd) { 34 switch (cmd) {
34 uid16_t uid16; 35 uid16_t uid16;
35 uid_t uid32; 36 uid_t uid32;
@@ -62,6 +63,7 @@ smb_ioctl(struct inode *inode, struct file *filp,
62 default: 63 default:
63 break; 64 break;
64 } 65 }
66 unlock_kernel();
65 67
66 return result; 68 return result;
67} 69}
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index 03f456c1b7d4..05939a6f43e6 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -67,7 +67,7 @@ extern const struct address_space_operations smb_file_aops;
67extern const struct file_operations smb_file_operations; 67extern const struct file_operations smb_file_operations;
68extern const struct inode_operations smb_file_inode_operations; 68extern const struct inode_operations smb_file_inode_operations;
69/* ioctl.c */ 69/* ioctl.c */
70extern int smb_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); 70extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
71/* smbiod.c */ 71/* smbiod.c */
72extern void smbiod_wake_up(void); 72extern void smbiod_wake_up(void);
73extern int smbiod_register_server(struct smb_sb_info *server); 73extern int smbiod_register_server(struct smb_sb_info *server);
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
index 54350b59046b..00b2909bd469 100644
--- a/fs/smbfs/symlink.c
+++ b/fs/smbfs/symlink.c
@@ -15,7 +15,6 @@
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/net.h> 16#include <linux/net.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
19 18
20#include <asm/uaccess.h> 19#include <asm/uaccess.h>
21#include <asm/system.h> 20#include <asm/system.h>
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 25a00d19d686..cc6ce8a84c21 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -26,6 +26,17 @@ config SQUASHFS
26 26
27 If unsure, say N. 27 If unsure, say N.
28 28
29config SQUASHFS_XATTRS
30 bool "Squashfs XATTR support"
31 depends on SQUASHFS
32 default n
33 help
34 Saying Y here includes support for extended attributes (xattrs).
35 Xattrs are name:value pairs associated with inodes by
36 the kernel or by users (see the attr(5) manual page).
37
38 If unsure, say N.
39
29config SQUASHFS_EMBEDDED 40config SQUASHFS_EMBEDDED
30 41
31 bool "Additional option for memory-constrained systems" 42 bool "Additional option for memory-constrained systems"
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index df8a19ef870d..2cee3e9fa452 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,3 +5,5 @@
5obj-$(CONFIG_SQUASHFS) += squashfs.o 5obj-$(CONFIG_SQUASHFS) += squashfs.o
6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o 6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o 7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
8squashfs-$(CONFIG_SQUASHFS_XATTRS) += xattr.o xattr_id.o
9
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 49daaf669e41..62e63ad25075 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,11 +40,13 @@
40 40
41#include <linux/fs.h> 41#include <linux/fs.h>
42#include <linux/vfs.h> 42#include <linux/vfs.h>
43#include <linux/xattr.h>
43 44
44#include "squashfs_fs.h" 45#include "squashfs_fs.h"
45#include "squashfs_fs_sb.h" 46#include "squashfs_fs_sb.h"
46#include "squashfs_fs_i.h" 47#include "squashfs_fs_i.h"
47#include "squashfs.h" 48#include "squashfs.h"
49#include "xattr.h"
48 50
49/* 51/*
50 * Initialise VFS inode with the base inode information common to all 52 * Initialise VFS inode with the base inode information common to all
@@ -111,6 +113,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
111 int err, type, offset = SQUASHFS_INODE_OFFSET(ino); 113 int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
112 union squashfs_inode squashfs_ino; 114 union squashfs_inode squashfs_ino;
113 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; 115 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
116 int xattr_id = SQUASHFS_INVALID_XATTR;
114 117
115 TRACE("Entered squashfs_read_inode\n"); 118 TRACE("Entered squashfs_read_inode\n");
116 119
@@ -199,8 +202,10 @@ int squashfs_read_inode(struct inode *inode, long long ino)
199 frag_offset = 0; 202 frag_offset = 0;
200 } 203 }
201 204
205 xattr_id = le32_to_cpu(sqsh_ino->xattr);
202 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 206 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
203 inode->i_size = le64_to_cpu(sqsh_ino->file_size); 207 inode->i_size = le64_to_cpu(sqsh_ino->file_size);
208 inode->i_op = &squashfs_inode_ops;
204 inode->i_fop = &generic_ro_fops; 209 inode->i_fop = &generic_ro_fops;
205 inode->i_mode |= S_IFREG; 210 inode->i_mode |= S_IFREG;
206 inode->i_blocks = ((inode->i_size - 211 inode->i_blocks = ((inode->i_size -
@@ -251,6 +256,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
251 if (err < 0) 256 if (err < 0)
252 goto failed_read; 257 goto failed_read;
253 258
259 xattr_id = le32_to_cpu(sqsh_ino->xattr);
254 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 260 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
255 inode->i_size = le32_to_cpu(sqsh_ino->file_size); 261 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
256 inode->i_op = &squashfs_dir_inode_ops; 262 inode->i_op = &squashfs_dir_inode_ops;
@@ -280,21 +286,33 @@ int squashfs_read_inode(struct inode *inode, long long ino)
280 286
281 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 287 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
282 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); 288 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
283 inode->i_op = &page_symlink_inode_operations; 289 inode->i_op = &squashfs_symlink_inode_ops;
284 inode->i_data.a_ops = &squashfs_symlink_aops; 290 inode->i_data.a_ops = &squashfs_symlink_aops;
285 inode->i_mode |= S_IFLNK; 291 inode->i_mode |= S_IFLNK;
286 squashfs_i(inode)->start = block; 292 squashfs_i(inode)->start = block;
287 squashfs_i(inode)->offset = offset; 293 squashfs_i(inode)->offset = offset;
288 294
295 if (type == SQUASHFS_LSYMLINK_TYPE) {
296 __le32 xattr;
297
298 err = squashfs_read_metadata(sb, NULL, &block,
299 &offset, inode->i_size);
300 if (err < 0)
301 goto failed_read;
302 err = squashfs_read_metadata(sb, &xattr, &block,
303 &offset, sizeof(xattr));
304 if (err < 0)
305 goto failed_read;
306 xattr_id = le32_to_cpu(xattr);
307 }
308
289 TRACE("Symbolic link inode %x:%x, start_block %llx, offset " 309 TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
290 "%x\n", SQUASHFS_INODE_BLK(ino), offset, 310 "%x\n", SQUASHFS_INODE_BLK(ino), offset,
291 block, offset); 311 block, offset);
292 break; 312 break;
293 } 313 }
294 case SQUASHFS_BLKDEV_TYPE: 314 case SQUASHFS_BLKDEV_TYPE:
295 case SQUASHFS_CHRDEV_TYPE: 315 case SQUASHFS_CHRDEV_TYPE: {
296 case SQUASHFS_LBLKDEV_TYPE:
297 case SQUASHFS_LCHRDEV_TYPE: {
298 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; 316 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
299 unsigned int rdev; 317 unsigned int rdev;
300 318
@@ -315,10 +333,32 @@ int squashfs_read_inode(struct inode *inode, long long ino)
315 SQUASHFS_INODE_BLK(ino), offset, rdev); 333 SQUASHFS_INODE_BLK(ino), offset, rdev);
316 break; 334 break;
317 } 335 }
336 case SQUASHFS_LBLKDEV_TYPE:
337 case SQUASHFS_LCHRDEV_TYPE: {
338 struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev;
339 unsigned int rdev;
340
341 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
342 sizeof(*sqsh_ino));
343 if (err < 0)
344 goto failed_read;
345
346 if (type == SQUASHFS_LCHRDEV_TYPE)
347 inode->i_mode |= S_IFCHR;
348 else
349 inode->i_mode |= S_IFBLK;
350 xattr_id = le32_to_cpu(sqsh_ino->xattr);
351 inode->i_op = &squashfs_inode_ops;
352 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
353 rdev = le32_to_cpu(sqsh_ino->rdev);
354 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
355
356 TRACE("Device inode %x:%x, rdev %x\n",
357 SQUASHFS_INODE_BLK(ino), offset, rdev);
358 break;
359 }
318 case SQUASHFS_FIFO_TYPE: 360 case SQUASHFS_FIFO_TYPE:
319 case SQUASHFS_SOCKET_TYPE: 361 case SQUASHFS_SOCKET_TYPE: {
320 case SQUASHFS_LFIFO_TYPE:
321 case SQUASHFS_LSOCKET_TYPE: {
322 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; 362 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
323 363
324 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, 364 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
@@ -334,14 +374,52 @@ int squashfs_read_inode(struct inode *inode, long long ino)
334 init_special_inode(inode, inode->i_mode, 0); 374 init_special_inode(inode, inode->i_mode, 0);
335 break; 375 break;
336 } 376 }
377 case SQUASHFS_LFIFO_TYPE:
378 case SQUASHFS_LSOCKET_TYPE: {
379 struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc;
380
381 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
382 sizeof(*sqsh_ino));
383 if (err < 0)
384 goto failed_read;
385
386 if (type == SQUASHFS_LFIFO_TYPE)
387 inode->i_mode |= S_IFIFO;
388 else
389 inode->i_mode |= S_IFSOCK;
390 xattr_id = le32_to_cpu(sqsh_ino->xattr);
391 inode->i_op = &squashfs_inode_ops;
392 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
393 init_special_inode(inode, inode->i_mode, 0);
394 break;
395 }
337 default: 396 default:
338 ERROR("Unknown inode type %d in squashfs_iget!\n", type); 397 ERROR("Unknown inode type %d in squashfs_iget!\n", type);
339 return -EINVAL; 398 return -EINVAL;
340 } 399 }
341 400
401 if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) {
402 err = squashfs_xattr_lookup(sb, xattr_id,
403 &squashfs_i(inode)->xattr_count,
404 &squashfs_i(inode)->xattr_size,
405 &squashfs_i(inode)->xattr);
406 if (err < 0)
407 goto failed_read;
408 inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9)
409 + 1;
410 } else
411 squashfs_i(inode)->xattr_count = 0;
412
342 return 0; 413 return 0;
343 414
344failed_read: 415failed_read:
345 ERROR("Unable to read inode 0x%llx\n", ino); 416 ERROR("Unable to read inode 0x%llx\n", ino);
346 return err; 417 return err;
347} 418}
419
420
421const struct inode_operations squashfs_inode_ops = {
422 .getxattr = generic_getxattr,
423 .listxattr = squashfs_listxattr
424};
425
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 5266bd8ad932..7a9464d08cf6 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,11 +57,13 @@
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/string.h> 58#include <linux/string.h>
59#include <linux/dcache.h> 59#include <linux/dcache.h>
60#include <linux/xattr.h>
60 61
61#include "squashfs_fs.h" 62#include "squashfs_fs.h"
62#include "squashfs_fs_sb.h" 63#include "squashfs_fs_sb.h"
63#include "squashfs_fs_i.h" 64#include "squashfs_fs_i.h"
64#include "squashfs.h" 65#include "squashfs.h"
66#include "xattr.h"
65 67
66/* 68/*
67 * Lookup name in the directory index, returning the location of the metadata 69 * Lookup name in the directory index, returning the location of the metadata
@@ -237,5 +239,7 @@ failed:
237 239
238 240
239const struct inode_operations squashfs_dir_inode_ops = { 241const struct inode_operations squashfs_dir_inode_ops = {
240 .lookup = squashfs_lookup 242 .lookup = squashfs_lookup,
243 .getxattr = generic_getxattr,
244 .listxattr = squashfs_listxattr
241}; 245};
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index fe2587af5512..733a17c42945 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -73,8 +73,11 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
73 unsigned int); 73 unsigned int);
74extern int squashfs_read_inode(struct inode *, long long); 74extern int squashfs_read_inode(struct inode *, long long);
75 75
76/* xattr.c */
77extern ssize_t squashfs_listxattr(struct dentry *, char *, size_t);
78
76/* 79/*
77 * Inodes, files and decompressor operations 80 * Inodes, files, decompressor and xattr operations
78 */ 81 */
79 82
80/* dir.c */ 83/* dir.c */
@@ -86,11 +89,18 @@ extern const struct export_operations squashfs_export_ops;
86/* file.c */ 89/* file.c */
87extern const struct address_space_operations squashfs_aops; 90extern const struct address_space_operations squashfs_aops;
88 91
92/* inode.c */
93extern const struct inode_operations squashfs_inode_ops;
94
89/* namei.c */ 95/* namei.c */
90extern const struct inode_operations squashfs_dir_inode_ops; 96extern const struct inode_operations squashfs_dir_inode_ops;
91 97
92/* symlink.c */ 98/* symlink.c */
93extern const struct address_space_operations squashfs_symlink_aops; 99extern const struct address_space_operations squashfs_symlink_aops;
100extern const struct inode_operations squashfs_symlink_inode_ops;
101
102/* xattr.c */
103extern const struct xattr_handler *squashfs_xattr_handlers[];
94 104
95/* zlib_wrapper.c */ 105/* zlib_wrapper.c */
96extern const struct squashfs_decompressor squashfs_zlib_comp_ops; 106extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 79024245ea00..8eabb808b78d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -46,6 +46,7 @@
46#define SQUASHFS_NAME_LEN 256 46#define SQUASHFS_NAME_LEN 256
47 47
48#define SQUASHFS_INVALID_FRAG (0xffffffffU) 48#define SQUASHFS_INVALID_FRAG (0xffffffffU)
49#define SQUASHFS_INVALID_XATTR (0xffffffffU)
49#define SQUASHFS_INVALID_BLK (-1LL) 50#define SQUASHFS_INVALID_BLK (-1LL)
50 51
51/* Filesystem flags */ 52/* Filesystem flags */
@@ -96,6 +97,13 @@
96#define SQUASHFS_LFIFO_TYPE 13 97#define SQUASHFS_LFIFO_TYPE 13
97#define SQUASHFS_LSOCKET_TYPE 14 98#define SQUASHFS_LSOCKET_TYPE 14
98 99
100/* Xattr types */
101#define SQUASHFS_XATTR_USER 0
102#define SQUASHFS_XATTR_TRUSTED 1
103#define SQUASHFS_XATTR_SECURITY 2
104#define SQUASHFS_XATTR_VALUE_OOL 256
105#define SQUASHFS_XATTR_PREFIX_MASK 0xff
106
99/* Flag whether block is compressed or uncompressed, bit is set if block is 107/* Flag whether block is compressed or uncompressed, bit is set if block is
100 * uncompressed */ 108 * uncompressed */
101#define SQUASHFS_COMPRESSED_BIT (1 << 15) 109#define SQUASHFS_COMPRESSED_BIT (1 << 15)
@@ -174,6 +182,24 @@
174 182
175#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ 183#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\
176 sizeof(u64)) 184 sizeof(u64))
185/* xattr id lookup table defines */
186#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id))
187
188#define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \
189 SQUASHFS_METADATA_SIZE)
190
191#define SQUASHFS_XATTR_BLOCK_OFFSET(A) (SQUASHFS_XATTR_BYTES(A) % \
192 SQUASHFS_METADATA_SIZE)
193
194#define SQUASHFS_XATTR_BLOCKS(A) ((SQUASHFS_XATTR_BYTES(A) + \
195 SQUASHFS_METADATA_SIZE - 1) / \
196 SQUASHFS_METADATA_SIZE)
197
198#define SQUASHFS_XATTR_BLOCK_BYTES(A) (SQUASHFS_XATTR_BLOCKS(A) *\
199 sizeof(u64))
200#define SQUASHFS_XATTR_BLK(A) ((unsigned int) ((A) >> 16))
201
202#define SQUASHFS_XATTR_OFFSET(A) ((unsigned int) ((A) & 0xffff))
177 203
178/* cached data constants for filesystem */ 204/* cached data constants for filesystem */
179#define SQUASHFS_CACHED_BLKS 8 205#define SQUASHFS_CACHED_BLKS 8
@@ -228,7 +254,7 @@ struct squashfs_super_block {
228 __le64 root_inode; 254 __le64 root_inode;
229 __le64 bytes_used; 255 __le64 bytes_used;
230 __le64 id_table_start; 256 __le64 id_table_start;
231 __le64 xattr_table_start; 257 __le64 xattr_id_table_start;
232 __le64 inode_table_start; 258 __le64 inode_table_start;
233 __le64 directory_table_start; 259 __le64 directory_table_start;
234 __le64 fragment_table_start; 260 __le64 fragment_table_start;
@@ -261,6 +287,17 @@ struct squashfs_ipc_inode {
261 __le32 nlink; 287 __le32 nlink;
262}; 288};
263 289
290struct squashfs_lipc_inode {
291 __le16 inode_type;
292 __le16 mode;
293 __le16 uid;
294 __le16 guid;
295 __le32 mtime;
296 __le32 inode_number;
297 __le32 nlink;
298 __le32 xattr;
299};
300
264struct squashfs_dev_inode { 301struct squashfs_dev_inode {
265 __le16 inode_type; 302 __le16 inode_type;
266 __le16 mode; 303 __le16 mode;
@@ -272,6 +309,18 @@ struct squashfs_dev_inode {
272 __le32 rdev; 309 __le32 rdev;
273}; 310};
274 311
312struct squashfs_ldev_inode {
313 __le16 inode_type;
314 __le16 mode;
315 __le16 uid;
316 __le16 guid;
317 __le32 mtime;
318 __le32 inode_number;
319 __le32 nlink;
320 __le32 rdev;
321 __le32 xattr;
322};
323
275struct squashfs_symlink_inode { 324struct squashfs_symlink_inode {
276 __le16 inode_type; 325 __le16 inode_type;
277 __le16 mode; 326 __le16 mode;
@@ -349,12 +398,14 @@ struct squashfs_ldir_inode {
349union squashfs_inode { 398union squashfs_inode {
350 struct squashfs_base_inode base; 399 struct squashfs_base_inode base;
351 struct squashfs_dev_inode dev; 400 struct squashfs_dev_inode dev;
401 struct squashfs_ldev_inode ldev;
352 struct squashfs_symlink_inode symlink; 402 struct squashfs_symlink_inode symlink;
353 struct squashfs_reg_inode reg; 403 struct squashfs_reg_inode reg;
354 struct squashfs_lreg_inode lreg; 404 struct squashfs_lreg_inode lreg;
355 struct squashfs_dir_inode dir; 405 struct squashfs_dir_inode dir;
356 struct squashfs_ldir_inode ldir; 406 struct squashfs_ldir_inode ldir;
357 struct squashfs_ipc_inode ipc; 407 struct squashfs_ipc_inode ipc;
408 struct squashfs_lipc_inode lipc;
358}; 409};
359 410
360struct squashfs_dir_entry { 411struct squashfs_dir_entry {
@@ -377,4 +428,27 @@ struct squashfs_fragment_entry {
377 unsigned int unused; 428 unsigned int unused;
378}; 429};
379 430
431struct squashfs_xattr_entry {
432 __le16 type;
433 __le16 size;
434 char data[0];
435};
436
437struct squashfs_xattr_val {
438 __le32 vsize;
439 char value[0];
440};
441
442struct squashfs_xattr_id {
443 __le64 xattr;
444 __le32 count;
445 __le32 size;
446};
447
448struct squashfs_xattr_id_table {
449 __le64 xattr_table_start;
450 __le32 xattr_ids;
451 __le32 unused;
452};
453
380#endif 454#endif
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index fbfca30c0c68..d3e3a37f28a1 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -26,6 +26,9 @@
26struct squashfs_inode_info { 26struct squashfs_inode_info {
27 u64 start; 27 u64 start;
28 int offset; 28 int offset;
29 u64 xattr;
30 unsigned int xattr_size;
31 int xattr_count;
29 union { 32 union {
30 struct { 33 struct {
31 u64 fragment_block; 34 u64 fragment_block;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 2e77dc547e25..d9037a5215f0 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -61,6 +61,7 @@ struct squashfs_sb_info {
61 int next_meta_index; 61 int next_meta_index;
62 __le64 *id_table; 62 __le64 *id_table;
63 __le64 *fragment_index; 63 __le64 *fragment_index;
64 __le64 *xattr_id_table;
64 struct mutex read_data_mutex; 65 struct mutex read_data_mutex;
65 struct mutex meta_index_mutex; 66 struct mutex meta_index_mutex;
66 struct meta_index *meta_index; 67 struct meta_index *meta_index;
@@ -68,9 +69,11 @@ struct squashfs_sb_info {
68 __le64 *inode_lookup_table; 69 __le64 *inode_lookup_table;
69 u64 inode_table; 70 u64 inode_table;
70 u64 directory_table; 71 u64 directory_table;
72 u64 xattr_table;
71 unsigned int block_size; 73 unsigned int block_size;
72 unsigned short block_log; 74 unsigned short block_log;
73 long long bytes_used; 75 long long bytes_used;
74 unsigned int inodes; 76 unsigned int inodes;
77 int xattr_ids;
75}; 78};
76#endif 79#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 48b6f4a385a6..88b4f8606652 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -36,12 +36,14 @@
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/module.h> 37#include <linux/module.h>
38#include <linux/magic.h> 38#include <linux/magic.h>
39#include <linux/xattr.h>
39 40
40#include "squashfs_fs.h" 41#include "squashfs_fs.h"
41#include "squashfs_fs_sb.h" 42#include "squashfs_fs_sb.h"
42#include "squashfs_fs_i.h" 43#include "squashfs_fs_i.h"
43#include "squashfs.h" 44#include "squashfs.h"
44#include "decompressor.h" 45#include "decompressor.h"
46#include "xattr.h"
45 47
46static struct file_system_type squashfs_fs_type; 48static struct file_system_type squashfs_fs_type;
47static const struct super_operations squashfs_super_ops; 49static const struct super_operations squashfs_super_ops;
@@ -82,7 +84,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
82 long long root_inode; 84 long long root_inode;
83 unsigned short flags; 85 unsigned short flags;
84 unsigned int fragments; 86 unsigned int fragments;
85 u64 lookup_table_start; 87 u64 lookup_table_start, xattr_id_table_start;
86 int err; 88 int err;
87 89
88 TRACE("Entered squashfs_fill_superblock\n"); 90 TRACE("Entered squashfs_fill_superblock\n");
@@ -139,13 +141,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
139 if (msblk->decompressor == NULL) 141 if (msblk->decompressor == NULL)
140 goto failed_mount; 142 goto failed_mount;
141 143
142 /*
143 * Check if there's xattrs in the filesystem. These are not
144 * supported in this version, so warn that they will be ignored.
145 */
146 if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
147 ERROR("Xattrs in filesystem, these will be ignored\n");
148
149 /* Check the filesystem does not extend beyond the end of the 144 /* Check the filesystem does not extend beyond the end of the
150 block device */ 145 block device */
151 msblk->bytes_used = le64_to_cpu(sblk->bytes_used); 146 msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
@@ -253,7 +248,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
253allocate_lookup_table: 248allocate_lookup_table:
254 lookup_table_start = le64_to_cpu(sblk->lookup_table_start); 249 lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
255 if (lookup_table_start == SQUASHFS_INVALID_BLK) 250 if (lookup_table_start == SQUASHFS_INVALID_BLK)
256 goto allocate_root; 251 goto allocate_xattr_table;
257 252
258 /* Allocate and read inode lookup table */ 253 /* Allocate and read inode lookup table */
259 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, 254 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
@@ -266,6 +261,21 @@ allocate_lookup_table:
266 261
267 sb->s_export_op = &squashfs_export_ops; 262 sb->s_export_op = &squashfs_export_ops;
268 263
264allocate_xattr_table:
265 sb->s_xattr = squashfs_xattr_handlers;
266 xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start);
267 if (xattr_id_table_start == SQUASHFS_INVALID_BLK)
268 goto allocate_root;
269
270 /* Allocate and read xattr id lookup table */
271 msblk->xattr_id_table = squashfs_read_xattr_id_table(sb,
272 xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids);
273 if (IS_ERR(msblk->xattr_id_table)) {
274 err = PTR_ERR(msblk->xattr_id_table);
275 msblk->xattr_id_table = NULL;
276 if (err != -ENOTSUPP)
277 goto failed_mount;
278 }
269allocate_root: 279allocate_root:
270 root = new_inode(sb); 280 root = new_inode(sb);
271 if (!root) { 281 if (!root) {
@@ -301,6 +311,7 @@ failed_mount:
301 kfree(msblk->inode_lookup_table); 311 kfree(msblk->inode_lookup_table);
302 kfree(msblk->fragment_index); 312 kfree(msblk->fragment_index);
303 kfree(msblk->id_table); 313 kfree(msblk->id_table);
314 kfree(msblk->xattr_id_table);
304 kfree(sb->s_fs_info); 315 kfree(sb->s_fs_info);
305 sb->s_fs_info = NULL; 316 sb->s_fs_info = NULL;
306 kfree(sblk); 317 kfree(sblk);
@@ -355,6 +366,7 @@ static void squashfs_put_super(struct super_block *sb)
355 kfree(sbi->fragment_index); 366 kfree(sbi->fragment_index);
356 kfree(sbi->meta_index); 367 kfree(sbi->meta_index);
357 kfree(sbi->inode_lookup_table); 368 kfree(sbi->inode_lookup_table);
369 kfree(sbi->xattr_id_table);
358 kfree(sb->s_fs_info); 370 kfree(sb->s_fs_info);
359 sb->s_fs_info = NULL; 371 sb->s_fs_info = NULL;
360 } 372 }
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 32b911f4ee39..ec86434921e1 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -35,11 +35,13 @@
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/string.h> 36#include <linux/string.h>
37#include <linux/pagemap.h> 37#include <linux/pagemap.h>
38#include <linux/xattr.h>
38 39
39#include "squashfs_fs.h" 40#include "squashfs_fs.h"
40#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
41#include "squashfs_fs_i.h" 42#include "squashfs_fs_i.h"
42#include "squashfs.h" 43#include "squashfs.h"
44#include "xattr.h"
43 45
44static int squashfs_symlink_readpage(struct file *file, struct page *page) 46static int squashfs_symlink_readpage(struct file *file, struct page *page)
45{ 47{
@@ -114,3 +116,12 @@ error_out:
114const struct address_space_operations squashfs_symlink_aops = { 116const struct address_space_operations squashfs_symlink_aops = {
115 .readpage = squashfs_symlink_readpage 117 .readpage = squashfs_symlink_readpage
116}; 118};
119
120const struct inode_operations squashfs_symlink_inode_ops = {
121 .readlink = generic_readlink,
122 .follow_link = page_follow_link_light,
123 .put_link = page_put_link,
124 .getxattr = generic_getxattr,
125 .listxattr = squashfs_listxattr
126};
127
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
new file mode 100644
index 000000000000..c7655e8b31cd
--- /dev/null
+++ b/fs/squashfs/xattr.c
@@ -0,0 +1,323 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr.c
22 */
23
24#include <linux/init.h>
25#include <linux/module.h>
26#include <linux/string.h>
27#include <linux/fs.h>
28#include <linux/vfs.h>
29#include <linux/xattr.h>
30#include <linux/slab.h>
31
32#include "squashfs_fs.h"
33#include "squashfs_fs_sb.h"
34#include "squashfs_fs_i.h"
35#include "squashfs.h"
36
37static const struct xattr_handler *squashfs_xattr_handler(int);
38
39ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
40 size_t buffer_size)
41{
42 struct inode *inode = d->d_inode;
43 struct super_block *sb = inode->i_sb;
44 struct squashfs_sb_info *msblk = sb->s_fs_info;
45 u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
46 + msblk->xattr_table;
47 int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
48 int count = squashfs_i(inode)->xattr_count;
49 size_t rest = buffer_size;
50 int err;
51
52 /* check that the file system has xattrs */
53 if (msblk->xattr_id_table == NULL)
54 return -EOPNOTSUPP;
55
56 /* loop reading each xattr name */
57 while (count--) {
58 struct squashfs_xattr_entry entry;
59 struct squashfs_xattr_val val;
60 const struct xattr_handler *handler;
61 int name_size, prefix_size = 0;
62
63 err = squashfs_read_metadata(sb, &entry, &start, &offset,
64 sizeof(entry));
65 if (err < 0)
66 goto failed;
67
68 name_size = le16_to_cpu(entry.size);
69 handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
70 if (handler)
71 prefix_size = handler->list(d, buffer, rest, NULL,
72 name_size, handler->flags);
73 if (prefix_size) {
74 if (buffer) {
75 if (prefix_size + name_size + 1 > rest) {
76 err = -ERANGE;
77 goto failed;
78 }
79 buffer += prefix_size;
80 }
81 err = squashfs_read_metadata(sb, buffer, &start,
82 &offset, name_size);
83 if (err < 0)
84 goto failed;
85 if (buffer) {
86 buffer[name_size] = '\0';
87 buffer += name_size + 1;
88 }
89 rest -= prefix_size + name_size + 1;
90 } else {
91 /* no handler or insufficient privileges, so skip */
92 err = squashfs_read_metadata(sb, NULL, &start,
93 &offset, name_size);
94 if (err < 0)
95 goto failed;
96 }
97
98
99 /* skip remaining xattr entry */
100 err = squashfs_read_metadata(sb, &val, &start, &offset,
101 sizeof(val));
102 if (err < 0)
103 goto failed;
104
105 err = squashfs_read_metadata(sb, NULL, &start, &offset,
106 le32_to_cpu(val.vsize));
107 if (err < 0)
108 goto failed;
109 }
110 err = buffer_size - rest;
111
112failed:
113 return err;
114}
115
116
117static int squashfs_xattr_get(struct inode *inode, int name_index,
118 const char *name, void *buffer, size_t buffer_size)
119{
120 struct super_block *sb = inode->i_sb;
121 struct squashfs_sb_info *msblk = sb->s_fs_info;
122 u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
123 + msblk->xattr_table;
124 int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
125 int count = squashfs_i(inode)->xattr_count;
126 int name_len = strlen(name);
127 int err, vsize;
128 char *target = kmalloc(name_len, GFP_KERNEL);
129
130 if (target == NULL)
131 return -ENOMEM;
132
133 /* loop reading each xattr name */
134 for (; count; count--) {
135 struct squashfs_xattr_entry entry;
136 struct squashfs_xattr_val val;
137 int type, prefix, name_size;
138
139 err = squashfs_read_metadata(sb, &entry, &start, &offset,
140 sizeof(entry));
141 if (err < 0)
142 goto failed;
143
144 name_size = le16_to_cpu(entry.size);
145 type = le16_to_cpu(entry.type);
146 prefix = type & SQUASHFS_XATTR_PREFIX_MASK;
147
148 if (prefix == name_index && name_size == name_len)
149 err = squashfs_read_metadata(sb, target, &start,
150 &offset, name_size);
151 else
152 err = squashfs_read_metadata(sb, NULL, &start,
153 &offset, name_size);
154 if (err < 0)
155 goto failed;
156
157 if (prefix == name_index && name_size == name_len &&
158 strncmp(target, name, name_size) == 0) {
159 /* found xattr */
160 if (type & SQUASHFS_XATTR_VALUE_OOL) {
161 __le64 xattr;
162 /* val is a reference to the real location */
163 err = squashfs_read_metadata(sb, &val, &start,
164 &offset, sizeof(val));
165 if (err < 0)
166 goto failed;
167 err = squashfs_read_metadata(sb, &xattr, &start,
168 &offset, sizeof(xattr));
169 if (err < 0)
170 goto failed;
171 xattr = le64_to_cpu(xattr);
172 start = SQUASHFS_XATTR_BLK(xattr) +
173 msblk->xattr_table;
174 offset = SQUASHFS_XATTR_OFFSET(xattr);
175 }
176 /* read xattr value */
177 err = squashfs_read_metadata(sb, &val, &start, &offset,
178 sizeof(val));
179 if (err < 0)
180 goto failed;
181
182 vsize = le32_to_cpu(val.vsize);
183 if (buffer) {
184 if (vsize > buffer_size) {
185 err = -ERANGE;
186 goto failed;
187 }
188 err = squashfs_read_metadata(sb, buffer, &start,
189 &offset, vsize);
190 if (err < 0)
191 goto failed;
192 }
193 break;
194 }
195
196 /* no match, skip remaining xattr entry */
197 err = squashfs_read_metadata(sb, &val, &start, &offset,
198 sizeof(val));
199 if (err < 0)
200 goto failed;
201 err = squashfs_read_metadata(sb, NULL, &start, &offset,
202 le32_to_cpu(val.vsize));
203 if (err < 0)
204 goto failed;
205 }
206 err = count ? vsize : -ENODATA;
207
208failed:
209 kfree(target);
210 return err;
211}
212
213
214/*
215 * User namespace support
216 */
217static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size,
218 const char *name, size_t name_len, int type)
219{
220 if (list && XATTR_USER_PREFIX_LEN <= list_size)
221 memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
222 return XATTR_USER_PREFIX_LEN;
223}
224
225static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
226 size_t size, int type)
227{
228 if (name[0] == '\0')
229 return -EINVAL;
230
231 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name,
232 buffer, size);
233}
234
235static const struct xattr_handler squashfs_xattr_user_handler = {
236 .prefix = XATTR_USER_PREFIX,
237 .list = squashfs_user_list,
238 .get = squashfs_user_get
239};
240
241/*
242 * Trusted namespace support
243 */
244static size_t squashfs_trusted_list(struct dentry *d, char *list,
245 size_t list_size, const char *name, size_t name_len, int type)
246{
247 if (!capable(CAP_SYS_ADMIN))
248 return 0;
249
250 if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size)
251 memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
252 return XATTR_TRUSTED_PREFIX_LEN;
253}
254
255static int squashfs_trusted_get(struct dentry *d, const char *name,
256 void *buffer, size_t size, int type)
257{
258 if (name[0] == '\0')
259 return -EINVAL;
260
261 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name,
262 buffer, size);
263}
264
265static const struct xattr_handler squashfs_xattr_trusted_handler = {
266 .prefix = XATTR_TRUSTED_PREFIX,
267 .list = squashfs_trusted_list,
268 .get = squashfs_trusted_get
269};
270
271/*
272 * Security namespace support
273 */
274static size_t squashfs_security_list(struct dentry *d, char *list,
275 size_t list_size, const char *name, size_t name_len, int type)
276{
277 if (list && XATTR_SECURITY_PREFIX_LEN <= list_size)
278 memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
279 return XATTR_SECURITY_PREFIX_LEN;
280}
281
282static int squashfs_security_get(struct dentry *d, const char *name,
283 void *buffer, size_t size, int type)
284{
285 if (name[0] == '\0')
286 return -EINVAL;
287
288 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name,
289 buffer, size);
290}
291
292static const struct xattr_handler squashfs_xattr_security_handler = {
293 .prefix = XATTR_SECURITY_PREFIX,
294 .list = squashfs_security_list,
295 .get = squashfs_security_get
296};
297
298static inline const struct xattr_handler *squashfs_xattr_handler(int type)
299{
300 if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL))
301 /* ignore unrecognised type */
302 return NULL;
303
304 switch (type & SQUASHFS_XATTR_PREFIX_MASK) {
305 case SQUASHFS_XATTR_USER:
306 return &squashfs_xattr_user_handler;
307 case SQUASHFS_XATTR_TRUSTED:
308 return &squashfs_xattr_trusted_handler;
309 case SQUASHFS_XATTR_SECURITY:
310 return &squashfs_xattr_security_handler;
311 default:
312 /* ignore unrecognised type */
313 return NULL;
314 }
315}
316
317const struct xattr_handler *squashfs_xattr_handlers[] = {
318 &squashfs_xattr_user_handler,
319 &squashfs_xattr_trusted_handler,
320 &squashfs_xattr_security_handler,
321 NULL
322};
323
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
new file mode 100644
index 000000000000..9da071ae181c
--- /dev/null
+++ b/fs/squashfs/xattr.h
@@ -0,0 +1,46 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr.h
22 */
23
24#ifdef CONFIG_SQUASHFS_XATTRS
25extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
26 u64 *, int *);
27extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
28 int *, unsigned long long *);
29#else
30static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
31 u64 start, u64 *xattr_table_start, int *xattr_ids)
32{
33 ERROR("Xattrs in filesystem, these will be ignored\n");
34 return ERR_PTR(-ENOTSUPP);
35}
36
37static inline int squashfs_xattr_lookup(struct super_block *sb,
38 unsigned int index, int *count, int *size,
39 unsigned long long *xattr)
40{
41 return 0;
42}
43#define squashfs_listxattr NULL
44#define generic_getxattr NULL
45#define squashfs_xattr_handlers NULL
46#endif
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
new file mode 100644
index 000000000000..cfb41106098f
--- /dev/null
+++ b/fs/squashfs/xattr_id.c
@@ -0,0 +1,100 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr_id.c
22 */
23
24/*
25 * This file implements code to map the 32-bit xattr id stored in the inode
26 * into the on disk location of the xattr data.
27 */
28
29#include <linux/fs.h>
30#include <linux/vfs.h>
31#include <linux/slab.h>
32
33#include "squashfs_fs.h"
34#include "squashfs_fs_sb.h"
35#include "squashfs_fs_i.h"
36#include "squashfs.h"
37
38/*
39 * Map xattr id using the xattr id look up table
40 */
41int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
42 int *count, unsigned int *size, unsigned long long *xattr)
43{
44 struct squashfs_sb_info *msblk = sb->s_fs_info;
45 int block = SQUASHFS_XATTR_BLOCK(index);
46 int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index);
47 u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]);
48 struct squashfs_xattr_id id;
49 int err;
50
51 err = squashfs_read_metadata(sb, &id, &start_block, &offset,
52 sizeof(id));
53 if (err < 0)
54 return err;
55
56 *xattr = le64_to_cpu(id.xattr);
57 *size = le32_to_cpu(id.size);
58 *count = le32_to_cpu(id.count);
59 return 0;
60}
61
62
63/*
64 * Read uncompressed xattr id lookup table indexes from disk into memory
65 */
66__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
67 u64 *xattr_table_start, int *xattr_ids)
68{
69 unsigned int len;
70 __le64 *xid_table;
71 struct squashfs_xattr_id_table id_table;
72 int err;
73
74 err = squashfs_read_table(sb, &id_table, start, sizeof(id_table));
75 if (err < 0) {
76 ERROR("unable to read xattr id table\n");
77 return ERR_PTR(err);
78 }
79 *xattr_table_start = le64_to_cpu(id_table.xattr_table_start);
80 *xattr_ids = le32_to_cpu(id_table.xattr_ids);
81 len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
82
83 TRACE("In read_xattr_index_table, length %d\n", len);
84
85 /* Allocate xattr id lookup table indexes */
86 xid_table = kmalloc(len, GFP_KERNEL);
87 if (xid_table == NULL) {
88 ERROR("Failed to allocate xattr id index table\n");
89 return ERR_PTR(-ENOMEM);
90 }
91
92 err = squashfs_read_table(sb, xid_table, start + sizeof(id_table), len);
93 if (err < 0) {
94 ERROR("unable to read xattr id index table\n");
95 kfree(xid_table);
96 return ERR_PTR(err);
97 }
98
99 return xid_table;
100}
diff --git a/fs/super.c b/fs/super.c
index 69688b15f1fa..5c35bc7a499e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -24,7 +24,6 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/acct.h> 25#include <linux/acct.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/quotaops.h>
28#include <linux/mount.h> 27#include <linux/mount.h>
29#include <linux/security.h> 28#include <linux/security.h>
30#include <linux/writeback.h> /* for the emergency remount stuff */ 29#include <linux/writeback.h> /* for the emergency remount stuff */
@@ -94,8 +93,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
94 init_rwsem(&s->s_dquot.dqptr_sem); 93 init_rwsem(&s->s_dquot.dqptr_sem);
95 init_waitqueue_head(&s->s_wait_unfrozen); 94 init_waitqueue_head(&s->s_wait_unfrozen);
96 s->s_maxbytes = MAX_NON_LFS; 95 s->s_maxbytes = MAX_NON_LFS;
97 s->dq_op = sb_dquot_ops;
98 s->s_qcop = sb_quotactl_ops;
99 s->s_op = &default_op; 96 s->s_op = &default_op;
100 s->s_time_gran = 1000000000; 97 s->s_time_gran = 1000000000;
101 } 98 }
@@ -160,7 +157,6 @@ void deactivate_locked_super(struct super_block *s)
160{ 157{
161 struct file_system_type *fs = s->s_type; 158 struct file_system_type *fs = s->s_type;
162 if (atomic_dec_and_test(&s->s_active)) { 159 if (atomic_dec_and_test(&s->s_active)) {
163 vfs_dq_off(s, 0);
164 fs->kill_sb(s); 160 fs->kill_sb(s);
165 put_filesystem(fs); 161 put_filesystem(fs);
166 put_super(s); 162 put_super(s);
@@ -524,7 +520,7 @@ rescan:
524int do_remount_sb(struct super_block *sb, int flags, void *data, int force) 520int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
525{ 521{
526 int retval; 522 int retval;
527 int remount_rw, remount_ro; 523 int remount_ro;
528 524
529 if (sb->s_frozen != SB_UNFROZEN) 525 if (sb->s_frozen != SB_UNFROZEN)
530 return -EBUSY; 526 return -EBUSY;
@@ -540,7 +536,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
540 sync_filesystem(sb); 536 sync_filesystem(sb);
541 537
542 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); 538 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
543 remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
544 539
545 /* If we are remounting RDONLY and current sb is read/write, 540 /* If we are remounting RDONLY and current sb is read/write,
546 make sure there are no rw files opened */ 541 make sure there are no rw files opened */
@@ -549,9 +544,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
549 mark_files_ro(sb); 544 mark_files_ro(sb);
550 else if (!fs_may_remount_ro(sb)) 545 else if (!fs_may_remount_ro(sb))
551 return -EBUSY; 546 return -EBUSY;
552 retval = vfs_dq_off(sb, 1);
553 if (retval < 0 && retval != -ENOSYS)
554 return -EBUSY;
555 } 547 }
556 548
557 if (sb->s_op->remount_fs) { 549 if (sb->s_op->remount_fs) {
@@ -560,8 +552,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
560 return retval; 552 return retval;
561 } 553 }
562 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); 554 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
563 if (remount_rw) 555
564 vfs_dq_quota_on_remount(sb);
565 /* 556 /*
566 * Some filesystems modify their metadata via some other path than the 557 * Some filesystems modify their metadata via some other path than the
567 * bdev buffer cache (eg. use a private mapping, or directories in 558 * bdev buffer cache (eg. use a private mapping, or directories in
@@ -946,8 +937,8 @@ out:
946EXPORT_SYMBOL_GPL(vfs_kern_mount); 937EXPORT_SYMBOL_GPL(vfs_kern_mount);
947 938
948/** 939/**
949 * freeze_super -- lock the filesystem and force it into a consistent state 940 * freeze_super - lock the filesystem and force it into a consistent state
950 * @super: the super to lock 941 * @sb: the super to lock
951 * 942 *
952 * Syncs the super to make sure the filesystem is consistent and calls the fs's 943 * Syncs the super to make sure the filesystem is consistent and calls the fs's
953 * freeze_fs. Subsequent calls to this without first thawing the fs will return 944 * freeze_fs. Subsequent calls to this without first thawing the fs will return
diff --git a/fs/sync.c b/fs/sync.c
index 5a537ccd2e85..15aa6f03b2da 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -130,12 +130,10 @@ void emergency_sync(void)
130 130
131/* 131/*
132 * Generic function to fsync a file. 132 * Generic function to fsync a file.
133 *
134 * filp may be NULL if called via the msync of a vma.
135 */ 133 */
136int file_fsync(struct file *filp, struct dentry *dentry, int datasync) 134int file_fsync(struct file *filp, int datasync)
137{ 135{
138 struct inode * inode = dentry->d_inode; 136 struct inode *inode = filp->f_mapping->host;
139 struct super_block * sb; 137 struct super_block * sb;
140 int ret, err; 138 int ret, err;
141 139
@@ -183,7 +181,7 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
183 * livelocks in fsync_buffers_list(). 181 * livelocks in fsync_buffers_list().
184 */ 182 */
185 mutex_lock(&mapping->host->i_mutex); 183 mutex_lock(&mapping->host->i_mutex);
186 err = file->f_op->fsync(file, file->f_path.dentry, datasync); 184 err = file->f_op->fsync(file, datasync);
187 if (!ret) 185 if (!ret)
188 ret = err; 186 ret = err;
189 mutex_unlock(&mapping->host->i_mutex); 187 mutex_unlock(&mapping->host->i_mutex);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index bbd77e95cf7f..bde1a4c3679a 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -117,13 +117,11 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
117 if (error) 117 if (error)
118 goto out; 118 goto out;
119 119
120 iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ 120 /* this ignores size changes */
121 121 generic_setattr(inode, iattr);
122 error = inode_setattr(inode, iattr);
123 if (error)
124 goto out;
125 122
126 error = sysfs_sd_setattr(sd, iattr); 123 error = sysfs_sd_setattr(sd, iattr);
124
127out: 125out:
128 mutex_unlock(&sysfs_mutex); 126 mutex_unlock(&sysfs_mutex);
129 return error; 127 return error;
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 1dabed286b4c..79941e4964a4 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -24,7 +24,7 @@ const struct file_operations sysv_dir_operations = {
24 .llseek = generic_file_llseek, 24 .llseek = generic_file_llseek,
25 .read = generic_read_dir, 25 .read = generic_read_dir,
26 .readdir = sysv_readdir, 26 .readdir = sysv_readdir,
27 .fsync = simple_fsync, 27 .fsync = generic_file_fsync,
28}; 28};
29 29
30static inline void dir_put_page(struct page *page) 30static inline void dir_put_page(struct page *page)
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 96340c01f4a7..750cc22349bd 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = {
26 .write = do_sync_write, 26 .write = do_sync_write,
27 .aio_write = generic_file_aio_write, 27 .aio_write = generic_file_aio_write,
28 .mmap = generic_file_mmap, 28 .mmap = generic_file_mmap,
29 .fsync = simple_fsync, 29 .fsync = generic_file_fsync,
30 .splice_read = generic_file_splice_read, 30 .splice_read = generic_file_splice_read,
31}; 31};
32 32
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 4573734d723d..d4a5380b5669 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -43,6 +43,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
43 * then attach current time stamp. 43 * then attach current time stamp.
44 * But if the filesystem was marked clean, keep it clean. 44 * But if the filesystem was marked clean, keep it clean.
45 */ 45 */
46 sb->s_dirt = 0;
46 old_time = fs32_to_cpu(sbi, *sbi->s_sb_time); 47 old_time = fs32_to_cpu(sbi, *sbi->s_sb_time);
47 if (sbi->s_type == FSTYPE_SYSV4) { 48 if (sbi->s_type == FSTYPE_SYSV4) {
48 if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time)) 49 if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time))
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5692cf72b807..12f445cee9f7 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -967,12 +967,15 @@ static int do_writepage(struct page *page, int len)
967 * the page locked, and it locks @ui_mutex. However, write-back does take inode 967 * the page locked, and it locks @ui_mutex. However, write-back does take inode
968 * @i_mutex, which means other VFS operations may be run on this inode at the 968 * @i_mutex, which means other VFS operations may be run on this inode at the
969 * same time. And the problematic one is truncation to smaller size, from where 969 * same time. And the problematic one is truncation to smaller size, from where
970 * we have to call 'vmtruncate()', which first changes @inode->i_size, then 970 * we have to call 'simple_setsize()', which first changes @inode->i_size, then
971 * drops the truncated pages. And while dropping the pages, it takes the page 971 * drops the truncated pages. And while dropping the pages, it takes the page
972 * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with 972 * lock. This means that 'do_truncation()' cannot call 'simple_setsize()' with
973 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This 973 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
974 * means that @inode->i_size is changed while @ui_mutex is unlocked. 974 * means that @inode->i_size is changed while @ui_mutex is unlocked.
975 * 975 *
976 * XXX: with the new truncate the above is not true anymore, the simple_setsize
977 * calls can be replaced with the individual components.
978 *
976 * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond 979 * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
977 * inode size. How do we do this if @inode->i_size may became smaller while we 980 * inode size. How do we do this if @inode->i_size may became smaller while we
978 * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the 981 * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the
@@ -1125,7 +1128,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
1125 budgeted = 0; 1128 budgeted = 0;
1126 } 1129 }
1127 1130
1128 err = vmtruncate(inode, new_size); 1131 err = simple_setsize(inode, new_size);
1129 if (err) 1132 if (err)
1130 goto out_budg; 1133 goto out_budg;
1131 1134
@@ -1214,7 +1217,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
1214 1217
1215 if (attr->ia_valid & ATTR_SIZE) { 1218 if (attr->ia_valid & ATTR_SIZE) {
1216 dbg_gen("size %lld -> %lld", inode->i_size, new_size); 1219 dbg_gen("size %lld -> %lld", inode->i_size, new_size);
1217 err = vmtruncate(inode, new_size); 1220 err = simple_setsize(inode, new_size);
1218 if (err) 1221 if (err)
1219 goto out; 1222 goto out;
1220 } 1223 }
@@ -1223,7 +1226,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
1223 if (attr->ia_valid & ATTR_SIZE) { 1226 if (attr->ia_valid & ATTR_SIZE) {
1224 /* Truncation changes inode [mc]time */ 1227 /* Truncation changes inode [mc]time */
1225 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); 1228 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
1226 /* 'vmtruncate()' changed @i_size, update @ui_size */ 1229 /* 'simple_setsize()' changed @i_size, update @ui_size */
1227 ui->ui_size = inode->i_size; 1230 ui->ui_size = inode->i_size;
1228 } 1231 }
1229 1232
@@ -1304,9 +1307,9 @@ static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
1304 return NULL; 1307 return NULL;
1305} 1308}
1306 1309
1307int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync) 1310int ubifs_fsync(struct file *file, int datasync)
1308{ 1311{
1309 struct inode *inode = dentry->d_inode; 1312 struct inode *inode = file->f_mapping->host;
1310 struct ubifs_info *c = inode->i_sb->s_fs_info; 1313 struct ubifs_info *c = inode->i_sb->s_fs_info;
1311 int err; 1314 int err;
1312 1315
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index bd2542dad014..2eef553d50c8 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -379,7 +379,7 @@ struct ubifs_gced_idx_leb {
379 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses 379 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
380 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot 380 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
381 * make sure @inode->i_size is always changed under @ui_mutex, because it 381 * make sure @inode->i_size is always changed under @ui_mutex, because it
382 * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock 382 * cannot call 'simple_setsize()' with @ui_mutex locked, because it would deadlock
383 * with 'ubifs_writepage()' (see file.c). All the other inode fields are 383 * with 'ubifs_writepage()' (see file.c). All the other inode fields are
384 * changed under @ui_mutex, so they do not need "shadow" fields. Note, one 384 * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
385 * could consider to rework locking and base it on "shadow" fields. 385 * could consider to rework locking and base it on "shadow" fields.
@@ -1678,7 +1678,7 @@ const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
1678int ubifs_calc_dark(const struct ubifs_info *c, int spc); 1678int ubifs_calc_dark(const struct ubifs_info *c, int spc);
1679 1679
1680/* file.c */ 1680/* file.c */
1681int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync); 1681int ubifs_fsync(struct file *file, int datasync);
1682int ubifs_setattr(struct dentry *dentry, struct iattr *attr); 1682int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
1683 1683
1684/* dir.c */ 1684/* dir.c */
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 9a9378b4eb5a..b608efaa4cee 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -21,7 +21,6 @@
21 21
22#include "udfdecl.h" 22#include "udfdecl.h"
23 23
24#include <linux/quotaops.h>
25#include <linux/buffer_head.h> 24#include <linux/buffer_head.h>
26#include <linux/bitops.h> 25#include <linux/bitops.h>
27 26
@@ -159,8 +158,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
159 udf_debug("byte=%2x\n", 158 udf_debug("byte=%2x\n",
160 ((char *)bh->b_data)[(bit + i) >> 3]); 159 ((char *)bh->b_data)[(bit + i) >> 3]);
161 } else { 160 } else {
162 if (inode)
163 dquot_free_block(inode, 1);
164 udf_add_free_space(sb, sbi->s_partition, 1); 161 udf_add_free_space(sb, sbi->s_partition, 1);
165 } 162 }
166 } 163 }
@@ -210,15 +207,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
210 bit = block % (sb->s_blocksize << 3); 207 bit = block % (sb->s_blocksize << 3);
211 208
212 while (bit < (sb->s_blocksize << 3) && block_count > 0) { 209 while (bit < (sb->s_blocksize << 3) && block_count > 0) {
213 if (!udf_test_bit(bit, bh->b_data)) 210 if (!udf_clear_bit(bit, bh->b_data))
214 goto out; 211 goto out;
215 else if (dquot_prealloc_block(inode, 1))
216 goto out;
217 else if (!udf_clear_bit(bit, bh->b_data)) {
218 udf_debug("bit already cleared for block %d\n", bit);
219 dquot_free_block(inode, 1);
220 goto out;
221 }
222 block_count--; 212 block_count--;
223 alloc_count++; 213 alloc_count++;
224 bit++; 214 bit++;
@@ -338,20 +328,6 @@ search_back:
338 } 328 }
339 329
340got_block: 330got_block:
341
342 /*
343 * Check quota for allocation of this block.
344 */
345 if (inode) {
346 int ret = dquot_alloc_block(inode, 1);
347
348 if (ret) {
349 mutex_unlock(&sbi->s_alloc_mutex);
350 *err = ret;
351 return 0;
352 }
353 }
354
355 newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) - 331 newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) -
356 (sizeof(struct spaceBitmapDesc) << 3); 332 (sizeof(struct spaceBitmapDesc) << 3);
357 333
@@ -401,10 +377,6 @@ static void udf_table_free_blocks(struct super_block *sb,
401 } 377 }
402 378
403 iinfo = UDF_I(table); 379 iinfo = UDF_I(table);
404 /* We do this up front - There are some error conditions that
405 could occure, but.. oh well */
406 if (inode)
407 dquot_free_block(inode, count);
408 udf_add_free_space(sb, sbi->s_partition, count); 380 udf_add_free_space(sb, sbi->s_partition, count);
409 381
410 start = bloc->logicalBlockNum + offset; 382 start = bloc->logicalBlockNum + offset;
@@ -649,10 +621,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
649 epos.offset -= adsize; 621 epos.offset -= adsize;
650 622
651 alloc_count = (elen >> sb->s_blocksize_bits); 623 alloc_count = (elen >> sb->s_blocksize_bits);
652 if (inode && dquot_prealloc_block(inode, 624 if (alloc_count > block_count) {
653 alloc_count > block_count ? block_count : alloc_count))
654 alloc_count = 0;
655 else if (alloc_count > block_count) {
656 alloc_count = block_count; 625 alloc_count = block_count;
657 eloc.logicalBlockNum += alloc_count; 626 eloc.logicalBlockNum += alloc_count;
658 elen -= (alloc_count << sb->s_blocksize_bits); 627 elen -= (alloc_count << sb->s_blocksize_bits);
@@ -752,14 +721,6 @@ static int udf_table_new_block(struct super_block *sb,
752 newblock = goal_eloc.logicalBlockNum; 721 newblock = goal_eloc.logicalBlockNum;
753 goal_eloc.logicalBlockNum++; 722 goal_eloc.logicalBlockNum++;
754 goal_elen -= sb->s_blocksize; 723 goal_elen -= sb->s_blocksize;
755 if (inode) {
756 *err = dquot_alloc_block(inode, 1);
757 if (*err) {
758 brelse(goal_epos.bh);
759 mutex_unlock(&sbi->s_alloc_mutex);
760 return 0;
761 }
762 }
763 724
764 if (goal_elen) 725 if (goal_elen)
765 udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1); 726 udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 3a84455c2a77..51552bf50225 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -207,8 +207,9 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
207 207
208/* readdir and lookup functions */ 208/* readdir and lookup functions */
209const struct file_operations udf_dir_operations = { 209const struct file_operations udf_dir_operations = {
210 .llseek = generic_file_llseek,
210 .read = generic_read_dir, 211 .read = generic_read_dir,
211 .readdir = udf_readdir, 212 .readdir = udf_readdir,
212 .unlocked_ioctl = udf_ioctl, 213 .unlocked_ioctl = udf_ioctl,
213 .fsync = simple_fsync, 214 .fsync = generic_file_fsync,
214}; 215};
diff --git a/fs/udf/file.c b/fs/udf/file.c
index baae3a723946..94e06d6bddbd 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -34,7 +34,6 @@
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/smp_lock.h> 35#include <linux/smp_lock.h>
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/quotaops.h>
38#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
39#include <linux/aio.h> 38#include <linux/aio.h>
40#include <linux/smp_lock.h> 39#include <linux/smp_lock.h>
@@ -219,39 +218,16 @@ const struct file_operations udf_file_operations = {
219 .read = do_sync_read, 218 .read = do_sync_read,
220 .aio_read = generic_file_aio_read, 219 .aio_read = generic_file_aio_read,
221 .unlocked_ioctl = udf_ioctl, 220 .unlocked_ioctl = udf_ioctl,
222 .open = dquot_file_open, 221 .open = generic_file_open,
223 .mmap = generic_file_mmap, 222 .mmap = generic_file_mmap,
224 .write = do_sync_write, 223 .write = do_sync_write,
225 .aio_write = udf_file_aio_write, 224 .aio_write = udf_file_aio_write,
226 .release = udf_release_file, 225 .release = udf_release_file,
227 .fsync = simple_fsync, 226 .fsync = generic_file_fsync,
228 .splice_read = generic_file_splice_read, 227 .splice_read = generic_file_splice_read,
229 .llseek = generic_file_llseek, 228 .llseek = generic_file_llseek,
230}; 229};
231 230
232int udf_setattr(struct dentry *dentry, struct iattr *iattr)
233{
234 struct inode *inode = dentry->d_inode;
235 int error;
236
237 error = inode_change_ok(inode, iattr);
238 if (error)
239 return error;
240
241 if (is_quota_modification(inode, iattr))
242 dquot_initialize(inode);
243
244 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
245 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
246 error = dquot_transfer(inode, iattr);
247 if (error)
248 return error;
249 }
250
251 return inode_setattr(inode, iattr);
252}
253
254const struct inode_operations udf_file_inode_operations = { 231const struct inode_operations udf_file_inode_operations = {
255 .truncate = udf_truncate, 232 .truncate = udf_truncate,
256 .setattr = udf_setattr,
257}; 233};
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 2b5586c7f02a..18cd7111185d 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -20,7 +20,6 @@
20 20
21#include "udfdecl.h" 21#include "udfdecl.h"
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/quotaops.h>
24#include <linux/sched.h> 23#include <linux/sched.h>
25#include <linux/slab.h> 24#include <linux/slab.h>
26 25
@@ -32,13 +31,6 @@ void udf_free_inode(struct inode *inode)
32 struct super_block *sb = inode->i_sb; 31 struct super_block *sb = inode->i_sb;
33 struct udf_sb_info *sbi = UDF_SB(sb); 32 struct udf_sb_info *sbi = UDF_SB(sb);
34 33
35 /*
36 * Note: we must free any quota before locking the superblock,
37 * as writing the quota to disk may need the lock as well.
38 */
39 dquot_free_inode(inode);
40 dquot_drop(inode);
41
42 clear_inode(inode); 34 clear_inode(inode);
43 35
44 mutex_lock(&sbi->s_alloc_mutex); 36 mutex_lock(&sbi->s_alloc_mutex);
@@ -61,7 +53,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
61 struct super_block *sb = dir->i_sb; 53 struct super_block *sb = dir->i_sb;
62 struct udf_sb_info *sbi = UDF_SB(sb); 54 struct udf_sb_info *sbi = UDF_SB(sb);
63 struct inode *inode; 55 struct inode *inode;
64 int block, ret; 56 int block;
65 uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; 57 uint32_t start = UDF_I(dir)->i_location.logicalBlockNum;
66 struct udf_inode_info *iinfo; 58 struct udf_inode_info *iinfo;
67 struct udf_inode_info *dinfo = UDF_I(dir); 59 struct udf_inode_info *dinfo = UDF_I(dir);
@@ -146,17 +138,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
146 insert_inode_hash(inode); 138 insert_inode_hash(inode);
147 mark_inode_dirty(inode); 139 mark_inode_dirty(inode);
148 140
149 dquot_initialize(inode);
150 ret = dquot_alloc_inode(inode);
151 if (ret) {
152 dquot_drop(inode);
153 inode->i_flags |= S_NOQUOTA;
154 inode->i_nlink = 0;
155 iput(inode);
156 *err = ret;
157 return NULL;
158 }
159
160 *err = 0; 141 *err = 0;
161 return inode; 142 return inode;
162} 143}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 8a3fbd177cab..124852bcf6fe 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -36,7 +36,6 @@
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/quotaops.h>
40#include <linux/slab.h> 39#include <linux/slab.h>
41#include <linux/crc-itu-t.h> 40#include <linux/crc-itu-t.h>
42 41
@@ -71,9 +70,6 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
71 70
72void udf_delete_inode(struct inode *inode) 71void udf_delete_inode(struct inode *inode)
73{ 72{
74 if (!is_bad_inode(inode))
75 dquot_initialize(inode);
76
77 truncate_inode_pages(&inode->i_data, 0); 73 truncate_inode_pages(&inode->i_data, 0);
78 74
79 if (is_bad_inode(inode)) 75 if (is_bad_inode(inode))
@@ -113,7 +109,6 @@ void udf_clear_inode(struct inode *inode)
113 (unsigned long long)iinfo->i_lenExtents); 109 (unsigned long long)iinfo->i_lenExtents);
114 } 110 }
115 111
116 dquot_drop(inode);
117 kfree(iinfo->i_ext.i_data); 112 kfree(iinfo->i_ext.i_data);
118 iinfo->i_ext.i_data = NULL; 113 iinfo->i_ext.i_data = NULL;
119} 114}
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 585f733615dc..bf5fc674193c 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -27,7 +27,6 @@
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/quotaops.h>
31#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
32#include <linux/buffer_head.h> 31#include <linux/buffer_head.h>
33#include <linux/sched.h> 32#include <linux/sched.h>
@@ -563,8 +562,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
563 int err; 562 int err;
564 struct udf_inode_info *iinfo; 563 struct udf_inode_info *iinfo;
565 564
566 dquot_initialize(dir);
567
568 lock_kernel(); 565 lock_kernel();
569 inode = udf_new_inode(dir, mode, &err); 566 inode = udf_new_inode(dir, mode, &err);
570 if (!inode) { 567 if (!inode) {
@@ -617,8 +614,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
617 if (!old_valid_dev(rdev)) 614 if (!old_valid_dev(rdev))
618 return -EINVAL; 615 return -EINVAL;
619 616
620 dquot_initialize(dir);
621
622 lock_kernel(); 617 lock_kernel();
623 err = -EIO; 618 err = -EIO;
624 inode = udf_new_inode(dir, mode, &err); 619 inode = udf_new_inode(dir, mode, &err);
@@ -664,8 +659,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
664 struct udf_inode_info *dinfo = UDF_I(dir); 659 struct udf_inode_info *dinfo = UDF_I(dir);
665 struct udf_inode_info *iinfo; 660 struct udf_inode_info *iinfo;
666 661
667 dquot_initialize(dir);
668
669 lock_kernel(); 662 lock_kernel();
670 err = -EMLINK; 663 err = -EMLINK;
671 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 664 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
@@ -800,8 +793,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
800 struct fileIdentDesc *fi, cfi; 793 struct fileIdentDesc *fi, cfi;
801 struct kernel_lb_addr tloc; 794 struct kernel_lb_addr tloc;
802 795
803 dquot_initialize(dir);
804
805 retval = -ENOENT; 796 retval = -ENOENT;
806 lock_kernel(); 797 lock_kernel();
807 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 798 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -848,8 +839,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
848 struct fileIdentDesc cfi; 839 struct fileIdentDesc cfi;
849 struct kernel_lb_addr tloc; 840 struct kernel_lb_addr tloc;
850 841
851 dquot_initialize(dir);
852
853 retval = -ENOENT; 842 retval = -ENOENT;
854 lock_kernel(); 843 lock_kernel();
855 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 844 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -904,8 +893,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
904 struct buffer_head *bh; 893 struct buffer_head *bh;
905 struct udf_inode_info *iinfo; 894 struct udf_inode_info *iinfo;
906 895
907 dquot_initialize(dir);
908
909 lock_kernel(); 896 lock_kernel();
910 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); 897 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
911 if (!inode) 898 if (!inode)
@@ -1075,8 +1062,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1075 int err; 1062 int err;
1076 struct buffer_head *bh; 1063 struct buffer_head *bh;
1077 1064
1078 dquot_initialize(dir);
1079
1080 lock_kernel(); 1065 lock_kernel();
1081 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1066 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
1082 unlock_kernel(); 1067 unlock_kernel();
@@ -1139,9 +1124,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1139 struct kernel_lb_addr tloc; 1124 struct kernel_lb_addr tloc;
1140 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1125 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1141 1126
1142 dquot_initialize(old_dir);
1143 dquot_initialize(new_dir);
1144
1145 lock_kernel(); 1127 lock_kernel();
1146 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1128 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1147 if (ofi) { 1129 if (ofi) {
@@ -1387,7 +1369,6 @@ const struct export_operations udf_export_ops = {
1387const struct inode_operations udf_dir_inode_operations = { 1369const struct inode_operations udf_dir_inode_operations = {
1388 .lookup = udf_lookup, 1370 .lookup = udf_lookup,
1389 .create = udf_create, 1371 .create = udf_create,
1390 .setattr = udf_setattr,
1391 .link = udf_link, 1372 .link = udf_link,
1392 .unlink = udf_unlink, 1373 .unlink = udf_unlink,
1393 .symlink = udf_symlink, 1374 .symlink = udf_symlink,
@@ -1400,5 +1381,4 @@ const struct inode_operations udf_symlink_inode_operations = {
1400 .readlink = generic_readlink, 1381 .readlink = generic_readlink,
1401 .follow_link = page_follow_link_light, 1382 .follow_link = page_follow_link_light,
1402 .put_link = page_put_link, 1383 .put_link = page_put_link,
1403 .setattr = udf_setattr,
1404}; 1384};
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 1e4543cbcd27..612d1e2e285a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -557,6 +557,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
557{ 557{
558 struct udf_options uopt; 558 struct udf_options uopt;
559 struct udf_sb_info *sbi = UDF_SB(sb); 559 struct udf_sb_info *sbi = UDF_SB(sb);
560 int error = 0;
560 561
561 uopt.flags = sbi->s_flags; 562 uopt.flags = sbi->s_flags;
562 uopt.uid = sbi->s_uid; 563 uopt.uid = sbi->s_uid;
@@ -582,17 +583,17 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
582 *flags |= MS_RDONLY; 583 *flags |= MS_RDONLY;
583 } 584 }
584 585
585 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 586 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
586 unlock_kernel(); 587 goto out_unlock;
587 return 0; 588
588 }
589 if (*flags & MS_RDONLY) 589 if (*flags & MS_RDONLY)
590 udf_close_lvid(sb); 590 udf_close_lvid(sb);
591 else 591 else
592 udf_open_lvid(sb); 592 udf_open_lvid(sb);
593 593
594out_unlock:
594 unlock_kernel(); 595 unlock_kernel();
595 return 0; 596 return error;
596} 597}
597 598
598/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */ 599/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
@@ -1939,7 +1940,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1939 /* Fill in the rest of the superblock */ 1940 /* Fill in the rest of the superblock */
1940 sb->s_op = &udf_sb_ops; 1941 sb->s_op = &udf_sb_ops;
1941 sb->s_export_op = &udf_export_ops; 1942 sb->s_export_op = &udf_export_ops;
1942 sb->dq_op = NULL; 1943
1943 sb->s_dirt = 0; 1944 sb->s_dirt = 0;
1944 sb->s_magic = UDF_SUPER_MAGIC; 1945 sb->s_magic = UDF_SUPER_MAGIC;
1945 sb->s_time_gran = 1000; 1946 sb->s_time_gran = 1000;
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 9079ff7d6255..2bac0354891f 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -131,7 +131,6 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
131 131
132/* file.c */ 132/* file.c */
133extern long udf_ioctl(struct file *, unsigned int, unsigned long); 133extern long udf_ioctl(struct file *, unsigned int, unsigned long);
134extern int udf_setattr(struct dentry *dentry, struct iattr *iattr);
135/* inode.c */ 134/* inode.c */
136extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 135extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
137extern int udf_sync_inode(struct inode *); 136extern int udf_sync_inode(struct inode *);
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 5cfa4d85ccf2..048484fb10d2 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -12,7 +12,6 @@
12#include <linux/stat.h> 12#include <linux/stat.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/string.h> 14#include <linux/string.h>
15#include <linux/quotaops.h>
16#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
17#include <linux/capability.h> 16#include <linux/capability.h>
18#include <linux/bitops.h> 17#include <linux/bitops.h>
@@ -85,9 +84,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
85 "bit already cleared for fragment %u", i); 84 "bit already cleared for fragment %u", i);
86 } 85 }
87 86
88 dquot_free_block(inode, count);
89
90
91 fs32_add(sb, &ucg->cg_cs.cs_nffree, count); 87 fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
92 uspi->cs_total.cs_nffree += count; 88 uspi->cs_total.cs_nffree += count;
93 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); 89 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -195,7 +191,6 @@ do_more:
195 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 191 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
196 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 192 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
197 ufs_clusteracct (sb, ucpi, blkno, 1); 193 ufs_clusteracct (sb, ucpi, blkno, 1);
198 dquot_free_block(inode, uspi->s_fpb);
199 194
200 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); 195 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
201 uspi->cs_total.cs_nbfree++; 196 uspi->cs_total.cs_nbfree++;
@@ -511,7 +506,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
511 struct ufs_cg_private_info * ucpi; 506 struct ufs_cg_private_info * ucpi;
512 struct ufs_cylinder_group * ucg; 507 struct ufs_cylinder_group * ucg;
513 unsigned cgno, fragno, fragoff, count, fragsize, i; 508 unsigned cgno, fragno, fragoff, count, fragsize, i;
514 int ret;
515 509
516 UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", 510 UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n",
517 (unsigned long long)fragment, oldcount, newcount); 511 (unsigned long long)fragment, oldcount, newcount);
@@ -557,11 +551,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
557 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); 551 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
558 for (i = oldcount; i < newcount; i++) 552 for (i = oldcount; i < newcount; i++)
559 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); 553 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
560 ret = dquot_alloc_block(inode, count);
561 if (ret) {
562 *err = ret;
563 return 0;
564 }
565 554
566 fs32_sub(sb, &ucg->cg_cs.cs_nffree, count); 555 fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
567 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); 556 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -598,7 +587,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
598 struct ufs_cylinder_group * ucg; 587 struct ufs_cylinder_group * ucg;
599 unsigned oldcg, i, j, k, allocsize; 588 unsigned oldcg, i, j, k, allocsize;
600 u64 result; 589 u64 result;
601 int ret;
602 590
603 UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", 591 UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n",
604 inode->i_ino, cgno, (unsigned long long)goal, count); 592 inode->i_ino, cgno, (unsigned long long)goal, count);
@@ -667,7 +655,6 @@ cg_found:
667 for (i = count; i < uspi->s_fpb; i++) 655 for (i = count; i < uspi->s_fpb; i++)
668 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); 656 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
669 i = uspi->s_fpb - count; 657 i = uspi->s_fpb - count;
670 dquot_free_block(inode, i);
671 658
672 fs32_add(sb, &ucg->cg_cs.cs_nffree, i); 659 fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
673 uspi->cs_total.cs_nffree += i; 660 uspi->cs_total.cs_nffree += i;
@@ -679,11 +666,6 @@ cg_found:
679 result = ufs_bitmap_search (sb, ucpi, goal, allocsize); 666 result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
680 if (result == INVBLOCK) 667 if (result == INVBLOCK)
681 return 0; 668 return 0;
682 ret = dquot_alloc_block(inode, count);
683 if (ret) {
684 *err = ret;
685 return 0;
686 }
687 for (i = 0; i < count; i++) 669 for (i = 0; i < count; i++)
688 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i); 670 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
689 671
@@ -718,7 +700,6 @@ static u64 ufs_alloccg_block(struct inode *inode,
718 struct ufs_super_block_first * usb1; 700 struct ufs_super_block_first * usb1;
719 struct ufs_cylinder_group * ucg; 701 struct ufs_cylinder_group * ucg;
720 u64 result, blkno; 702 u64 result, blkno;
721 int ret;
722 703
723 UFSD("ENTER, goal %llu\n", (unsigned long long)goal); 704 UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
724 705
@@ -752,11 +733,6 @@ gotit:
752 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 733 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
753 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 734 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
754 ufs_clusteracct (sb, ucpi, blkno, -1); 735 ufs_clusteracct (sb, ucpi, blkno, -1);
755 ret = dquot_alloc_block(inode, uspi->s_fpb);
756 if (ret) {
757 *err = ret;
758 return INVBLOCK;
759 }
760 736
761 fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1); 737 fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
762 uspi->cs_total.cs_nbfree--; 738 uspi->cs_total.cs_nbfree--;
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 317a0d444f6b..ec784756dc65 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -666,6 +666,6 @@ not_empty:
666const struct file_operations ufs_dir_operations = { 666const struct file_operations ufs_dir_operations = {
667 .read = generic_read_dir, 667 .read = generic_read_dir,
668 .readdir = ufs_readdir, 668 .readdir = ufs_readdir,
669 .fsync = simple_fsync, 669 .fsync = generic_file_fsync,
670 .llseek = generic_file_llseek, 670 .llseek = generic_file_llseek,
671}; 671};
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index a8962cecde5b..33afa20d4509 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,7 +24,6 @@
24 */ 24 */
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/quotaops.h>
28 27
29#include "ufs_fs.h" 28#include "ufs_fs.h"
30#include "ufs.h" 29#include "ufs.h"
@@ -41,7 +40,7 @@ const struct file_operations ufs_file_operations = {
41 .write = do_sync_write, 40 .write = do_sync_write,
42 .aio_write = generic_file_aio_write, 41 .aio_write = generic_file_aio_write,
43 .mmap = generic_file_mmap, 42 .mmap = generic_file_mmap,
44 .open = dquot_file_open, 43 .open = generic_file_open,
45 .fsync = simple_fsync, 44 .fsync = generic_file_fsync,
46 .splice_read = generic_file_splice_read, 45 .splice_read = generic_file_splice_read,
47}; 46};
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 3a959d55084d..594480e537d2 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -27,7 +27,6 @@
27#include <linux/time.h> 27#include <linux/time.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/string.h> 29#include <linux/string.h>
30#include <linux/quotaops.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/sched.h> 31#include <linux/sched.h>
33#include <linux/bitops.h> 32#include <linux/bitops.h>
@@ -95,9 +94,6 @@ void ufs_free_inode (struct inode * inode)
95 94
96 is_directory = S_ISDIR(inode->i_mode); 95 is_directory = S_ISDIR(inode->i_mode);
97 96
98 dquot_free_inode(inode);
99 dquot_drop(inode);
100
101 clear_inode (inode); 97 clear_inode (inode);
102 98
103 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit)) 99 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
@@ -347,21 +343,12 @@ cg_found:
347 343
348 unlock_super (sb); 344 unlock_super (sb);
349 345
350 dquot_initialize(inode);
351 err = dquot_alloc_inode(inode);
352 if (err) {
353 dquot_drop(inode);
354 goto fail_without_unlock;
355 }
356
357 UFSD("allocating inode %lu\n", inode->i_ino); 346 UFSD("allocating inode %lu\n", inode->i_ino);
358 UFSD("EXIT\n"); 347 UFSD("EXIT\n");
359 return inode; 348 return inode;
360 349
361fail_remove_inode: 350fail_remove_inode:
362 unlock_super(sb); 351 unlock_super(sb);
363fail_without_unlock:
364 inode->i_flags |= S_NOQUOTA;
365 inode->i_nlink = 0; 352 inode->i_nlink = 0;
366 iput(inode); 353 iput(inode);
367 UFSD("EXIT (FAILED): err %d\n", err); 354 UFSD("EXIT (FAILED): err %d\n", err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index cffa756f1047..73fe773aa034 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -37,7 +37,6 @@
37#include <linux/smp_lock.h> 37#include <linux/smp_lock.h>
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40#include <linux/quotaops.h>
41 40
42#include "ufs_fs.h" 41#include "ufs_fs.h"
43#include "ufs.h" 42#include "ufs.h"
@@ -910,9 +909,6 @@ void ufs_delete_inode (struct inode * inode)
910{ 909{
911 loff_t old_i_size; 910 loff_t old_i_size;
912 911
913 if (!is_bad_inode(inode))
914 dquot_initialize(inode);
915
916 truncate_inode_pages(&inode->i_data, 0); 912 truncate_inode_pages(&inode->i_data, 0);
917 if (is_bad_inode(inode)) 913 if (is_bad_inode(inode))
918 goto no_delete; 914 goto no_delete;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index eabc02eb1294..b056f02b1fb3 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -30,7 +30,6 @@
30#include <linux/time.h> 30#include <linux/time.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/smp_lock.h> 32#include <linux/smp_lock.h>
33#include <linux/quotaops.h>
34 33
35#include "ufs_fs.h" 34#include "ufs_fs.h"
36#include "ufs.h" 35#include "ufs.h"
@@ -86,8 +85,6 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
86 85
87 UFSD("BEGIN\n"); 86 UFSD("BEGIN\n");
88 87
89 dquot_initialize(dir);
90
91 inode = ufs_new_inode(dir, mode); 88 inode = ufs_new_inode(dir, mode);
92 err = PTR_ERR(inode); 89 err = PTR_ERR(inode);
93 90
@@ -112,8 +109,6 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
112 if (!old_valid_dev(rdev)) 109 if (!old_valid_dev(rdev))
113 return -EINVAL; 110 return -EINVAL;
114 111
115 dquot_initialize(dir);
116
117 inode = ufs_new_inode(dir, mode); 112 inode = ufs_new_inode(dir, mode);
118 err = PTR_ERR(inode); 113 err = PTR_ERR(inode);
119 if (!IS_ERR(inode)) { 114 if (!IS_ERR(inode)) {
@@ -138,8 +133,6 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
138 if (l > sb->s_blocksize) 133 if (l > sb->s_blocksize)
139 goto out_notlocked; 134 goto out_notlocked;
140 135
141 dquot_initialize(dir);
142
143 lock_kernel(); 136 lock_kernel();
144 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); 137 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
145 err = PTR_ERR(inode); 138 err = PTR_ERR(inode);
@@ -185,8 +178,6 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
185 return -EMLINK; 178 return -EMLINK;
186 } 179 }
187 180
188 dquot_initialize(dir);
189
190 inode->i_ctime = CURRENT_TIME_SEC; 181 inode->i_ctime = CURRENT_TIME_SEC;
191 inode_inc_link_count(inode); 182 inode_inc_link_count(inode);
192 atomic_inc(&inode->i_count); 183 atomic_inc(&inode->i_count);
@@ -204,8 +195,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
204 if (dir->i_nlink >= UFS_LINK_MAX) 195 if (dir->i_nlink >= UFS_LINK_MAX)
205 goto out; 196 goto out;
206 197
207 dquot_initialize(dir);
208
209 lock_kernel(); 198 lock_kernel();
210 inode_inc_link_count(dir); 199 inode_inc_link_count(dir);
211 200
@@ -250,8 +239,6 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
250 struct page *page; 239 struct page *page;
251 int err = -ENOENT; 240 int err = -ENOENT;
252 241
253 dquot_initialize(dir);
254
255 de = ufs_find_entry(dir, &dentry->d_name, &page); 242 de = ufs_find_entry(dir, &dentry->d_name, &page);
256 if (!de) 243 if (!de)
257 goto out; 244 goto out;
@@ -296,9 +283,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
296 struct ufs_dir_entry *old_de; 283 struct ufs_dir_entry *old_de;
297 int err = -ENOENT; 284 int err = -ENOENT;
298 285
299 dquot_initialize(old_dir);
300 dquot_initialize(new_dir);
301
302 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); 286 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
303 if (!old_de) 287 if (!old_de)
304 goto out; 288 goto out;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 14743d935a93..3ec5a9eb6efb 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -77,7 +77,6 @@
77 77
78#include <linux/errno.h> 78#include <linux/errno.h>
79#include <linux/fs.h> 79#include <linux/fs.h>
80#include <linux/quotaops.h>
81#include <linux/slab.h> 80#include <linux/slab.h>
82#include <linux/time.h> 81#include <linux/time.h>
83#include <linux/stat.h> 82#include <linux/stat.h>
@@ -918,6 +917,7 @@ again:
918 sbi->s_bytesex = BYTESEX_LE; 917 sbi->s_bytesex = BYTESEX_LE;
919 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { 918 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
920 case UFS_MAGIC: 919 case UFS_MAGIC:
920 case UFS_MAGIC_BW:
921 case UFS2_MAGIC: 921 case UFS2_MAGIC:
922 case UFS_MAGIC_LFN: 922 case UFS_MAGIC_LFN:
923 case UFS_MAGIC_FEA: 923 case UFS_MAGIC_FEA:
@@ -927,6 +927,7 @@ again:
927 sbi->s_bytesex = BYTESEX_BE; 927 sbi->s_bytesex = BYTESEX_BE;
928 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { 928 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
929 case UFS_MAGIC: 929 case UFS_MAGIC:
930 case UFS_MAGIC_BW:
930 case UFS2_MAGIC: 931 case UFS2_MAGIC:
931 case UFS_MAGIC_LFN: 932 case UFS_MAGIC_LFN:
932 case UFS_MAGIC_FEA: 933 case UFS_MAGIC_FEA:
@@ -1045,7 +1046,7 @@ magic_found:
1045 */ 1046 */
1046 sb->s_op = &ufs_super_ops; 1047 sb->s_op = &ufs_super_ops;
1047 sb->s_export_op = &ufs_export_ops; 1048 sb->s_export_op = &ufs_export_ops;
1048 sb->dq_op = NULL; /***/ 1049
1049 sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic); 1050 sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic);
1050 1051
1051 uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno); 1052 uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno);
@@ -1435,126 +1436,19 @@ static void destroy_inodecache(void)
1435 kmem_cache_destroy(ufs_inode_cachep); 1436 kmem_cache_destroy(ufs_inode_cachep);
1436} 1437}
1437 1438
1438static void ufs_clear_inode(struct inode *inode)
1439{
1440 dquot_drop(inode);
1441}
1442
1443#ifdef CONFIG_QUOTA
1444static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t);
1445static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t);
1446#endif
1447
1448static const struct super_operations ufs_super_ops = { 1439static const struct super_operations ufs_super_ops = {
1449 .alloc_inode = ufs_alloc_inode, 1440 .alloc_inode = ufs_alloc_inode,
1450 .destroy_inode = ufs_destroy_inode, 1441 .destroy_inode = ufs_destroy_inode,
1451 .write_inode = ufs_write_inode, 1442 .write_inode = ufs_write_inode,
1452 .delete_inode = ufs_delete_inode, 1443 .delete_inode = ufs_delete_inode,
1453 .clear_inode = ufs_clear_inode,
1454 .put_super = ufs_put_super, 1444 .put_super = ufs_put_super,
1455 .write_super = ufs_write_super, 1445 .write_super = ufs_write_super,
1456 .sync_fs = ufs_sync_fs, 1446 .sync_fs = ufs_sync_fs,
1457 .statfs = ufs_statfs, 1447 .statfs = ufs_statfs,
1458 .remount_fs = ufs_remount, 1448 .remount_fs = ufs_remount,
1459 .show_options = ufs_show_options, 1449 .show_options = ufs_show_options,
1460#ifdef CONFIG_QUOTA
1461 .quota_read = ufs_quota_read,
1462 .quota_write = ufs_quota_write,
1463#endif
1464}; 1450};
1465 1451
1466#ifdef CONFIG_QUOTA
1467
1468/* Read data from quotafile - avoid pagecache and such because we cannot afford
1469 * acquiring the locks... As quota files are never truncated and quota code
1470 * itself serializes the operations (and noone else should touch the files)
1471 * we don't have to be afraid of races */
1472static ssize_t ufs_quota_read(struct super_block *sb, int type, char *data,
1473 size_t len, loff_t off)
1474{
1475 struct inode *inode = sb_dqopt(sb)->files[type];
1476 sector_t blk = off >> sb->s_blocksize_bits;
1477 int err = 0;
1478 int offset = off & (sb->s_blocksize - 1);
1479 int tocopy;
1480 size_t toread;
1481 struct buffer_head *bh;
1482 loff_t i_size = i_size_read(inode);
1483
1484 if (off > i_size)
1485 return 0;
1486 if (off+len > i_size)
1487 len = i_size-off;
1488 toread = len;
1489 while (toread > 0) {
1490 tocopy = sb->s_blocksize - offset < toread ?
1491 sb->s_blocksize - offset : toread;
1492
1493 bh = ufs_bread(inode, blk, 0, &err);
1494 if (err)
1495 return err;
1496 if (!bh) /* A hole? */
1497 memset(data, 0, tocopy);
1498 else {
1499 memcpy(data, bh->b_data+offset, tocopy);
1500 brelse(bh);
1501 }
1502 offset = 0;
1503 toread -= tocopy;
1504 data += tocopy;
1505 blk++;
1506 }
1507 return len;
1508}
1509
1510/* Write to quotafile */
1511static ssize_t ufs_quota_write(struct super_block *sb, int type,
1512 const char *data, size_t len, loff_t off)
1513{
1514 struct inode *inode = sb_dqopt(sb)->files[type];
1515 sector_t blk = off >> sb->s_blocksize_bits;
1516 int err = 0;
1517 int offset = off & (sb->s_blocksize - 1);
1518 int tocopy;
1519 size_t towrite = len;
1520 struct buffer_head *bh;
1521
1522 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
1523 while (towrite > 0) {
1524 tocopy = sb->s_blocksize - offset < towrite ?
1525 sb->s_blocksize - offset : towrite;
1526
1527 bh = ufs_bread(inode, blk, 1, &err);
1528 if (!bh)
1529 goto out;
1530 lock_buffer(bh);
1531 memcpy(bh->b_data+offset, data, tocopy);
1532 flush_dcache_page(bh->b_page);
1533 set_buffer_uptodate(bh);
1534 mark_buffer_dirty(bh);
1535 unlock_buffer(bh);
1536 brelse(bh);
1537 offset = 0;
1538 towrite -= tocopy;
1539 data += tocopy;
1540 blk++;
1541 }
1542out:
1543 if (len == towrite) {
1544 mutex_unlock(&inode->i_mutex);
1545 return err;
1546 }
1547 if (inode->i_size < off+len-towrite)
1548 i_size_write(inode, off+len-towrite);
1549 inode->i_version++;
1550 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1551 mark_inode_dirty(inode);
1552 mutex_unlock(&inode->i_mutex);
1553 return len - towrite;
1554}
1555
1556#endif
1557
1558static int ufs_get_sb(struct file_system_type *fs_type, 1452static int ufs_get_sb(struct file_system_type *fs_type,
1559 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1453 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
1560{ 1454{
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index f294c44577dc..589e01a465ba 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -44,7 +44,6 @@
44#include <linux/buffer_head.h> 44#include <linux/buffer_head.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/quotaops.h>
48 47
49#include "ufs_fs.h" 48#include "ufs_fs.h"
50#include "ufs.h" 49#include "ufs.h"
@@ -501,12 +500,10 @@ out:
501 return err; 500 return err;
502} 501}
503 502
504
505/* 503/*
506 * We don't define our `inode->i_op->truncate', and call it here, 504 * TODO:
507 * because of: 505 * - truncate case should use proper ordering instead of using
508 * - there is no way to know old size 506 * simple_setsize
509 * - there is no way inform user about error, if it happens in `truncate'
510 */ 507 */
511int ufs_setattr(struct dentry *dentry, struct iattr *attr) 508int ufs_setattr(struct dentry *dentry, struct iattr *attr)
512{ 509{
@@ -518,19 +515,10 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
518 if (error) 515 if (error)
519 return error; 516 return error;
520 517
521 if (is_quota_modification(inode, attr))
522 dquot_initialize(inode);
523
524 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
525 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
526 error = dquot_transfer(inode, attr);
527 if (error)
528 return error;
529 }
530 if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { 518 if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
531 loff_t old_i_size = inode->i_size; 519 loff_t old_i_size = inode->i_size;
532 520
533 error = vmtruncate(inode, attr->ia_size); 521 error = simple_setsize(inode, attr->ia_size);
534 if (error) 522 if (error)
535 return error; 523 return error;
536 error = ufs_truncate(inode, old_i_size); 524 error = ufs_truncate(inode, old_i_size);
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 6943ec677c0b..8aba544f9fad 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -48,6 +48,7 @@ typedef __u16 __bitwise __fs16;
48#define UFS_SECTOR_SIZE 512 48#define UFS_SECTOR_SIZE 512
49#define UFS_SECTOR_BITS 9 49#define UFS_SECTOR_BITS 9
50#define UFS_MAGIC 0x00011954 50#define UFS_MAGIC 0x00011954
51#define UFS_MAGIC_BW 0x0f242697
51#define UFS2_MAGIC 0x19540119 52#define UFS2_MAGIC 0x19540119
52#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */ 53#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */
53 54
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index b4769e40e8bc..c8fb13f83b3f 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \
77 xfs_itable.o \ 77 xfs_itable.o \
78 xfs_dfrag.o \ 78 xfs_dfrag.o \
79 xfs_log.o \ 79 xfs_log.o \
80 xfs_log_cil.o \
80 xfs_log_recover.o \ 81 xfs_log_recover.o \
81 xfs_mount.o \ 82 xfs_mount.o \
82 xfs_mru_cache.o \ 83 xfs_mru_cache.o \
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index f01de3c55c43..649ade8ef598 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -37,6 +37,7 @@
37 37
38#include "xfs_sb.h" 38#include "xfs_sb.h"
39#include "xfs_inum.h" 39#include "xfs_inum.h"
40#include "xfs_log.h"
40#include "xfs_ag.h" 41#include "xfs_ag.h"
41#include "xfs_dmapi.h" 42#include "xfs_dmapi.h"
42#include "xfs_mount.h" 43#include "xfs_mount.h"
@@ -850,6 +851,12 @@ xfs_buf_lock_value(
850 * Note that this in no way locks the underlying pages, so it is only 851 * Note that this in no way locks the underlying pages, so it is only
851 * useful for synchronizing concurrent use of buffer objects, not for 852 * useful for synchronizing concurrent use of buffer objects, not for
852 * synchronizing independent access to the underlying pages. 853 * synchronizing independent access to the underlying pages.
854 *
855 * If we come across a stale, pinned, locked buffer, we know that we
856 * are being asked to lock a buffer that has been reallocated. Because
857 * it is pinned, we know that the log has not been pushed to disk and
858 * hence it will still be locked. Rather than sleeping until someone
859 * else pushes the log, push it ourselves before trying to get the lock.
853 */ 860 */
854void 861void
855xfs_buf_lock( 862xfs_buf_lock(
@@ -857,6 +864,8 @@ xfs_buf_lock(
857{ 864{
858 trace_xfs_buf_lock(bp, _RET_IP_); 865 trace_xfs_buf_lock(bp, _RET_IP_);
859 866
867 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
868 xfs_log_force(bp->b_mount, 0);
860 if (atomic_read(&bp->b_io_remaining)) 869 if (atomic_read(&bp->b_io_remaining))
861 blk_run_address_space(bp->b_target->bt_mapping); 870 blk_run_address_space(bp->b_target->bt_mapping);
862 down(&bp->b_sema); 871 down(&bp->b_sema);
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index d8fb1b5d6cb5..257a56b127cf 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -100,10 +100,10 @@ xfs_iozero(
100STATIC int 100STATIC int
101xfs_file_fsync( 101xfs_file_fsync(
102 struct file *file, 102 struct file *file,
103 struct dentry *dentry,
104 int datasync) 103 int datasync)
105{ 104{
106 struct xfs_inode *ip = XFS_I(dentry->d_inode); 105 struct inode *inode = file->f_mapping->host;
106 struct xfs_inode *ip = XFS_I(inode);
107 struct xfs_trans *tp; 107 struct xfs_trans *tp;
108 int error = 0; 108 int error = 0;
109 int log_flushed = 0; 109 int log_flushed = 0;
@@ -140,8 +140,8 @@ xfs_file_fsync(
140 * might get cleared when the inode gets written out via the AIL 140 * might get cleared when the inode gets written out via the AIL
141 * or xfs_iflush_cluster. 141 * or xfs_iflush_cluster.
142 */ 142 */
143 if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) || 143 if (((inode->i_state & I_DIRTY_DATASYNC) ||
144 ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) && 144 ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
145 ip->i_update_core) { 145 ip->i_update_core) {
146 /* 146 /*
147 * Kick off a transaction to log the inode core to get the 147 * Kick off a transaction to log the inode core to get the
@@ -868,7 +868,7 @@ write_retry:
868 mutex_lock(&inode->i_mutex); 868 mutex_lock(&inode->i_mutex);
869 xfs_ilock(ip, iolock); 869 xfs_ilock(ip, iolock);
870 870
871 error2 = -xfs_file_fsync(file, file->f_path.dentry, 871 error2 = -xfs_file_fsync(file,
872 (file->f_flags & __O_SYNC) ? 0 : 1); 872 (file->f_flags & __O_SYNC) ? 0 : 1);
873 if (!error) 873 if (!error)
874 error = error2; 874 error = error2;
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index e31bf21fe5d3..9ac8aea91529 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -19,6 +19,7 @@
19#include "xfs_dmapi.h" 19#include "xfs_dmapi.h"
20#include "xfs_sb.h" 20#include "xfs_sb.h"
21#include "xfs_inum.h" 21#include "xfs_inum.h"
22#include "xfs_log.h"
22#include "xfs_ag.h" 23#include "xfs_ag.h"
23#include "xfs_mount.h" 24#include "xfs_mount.h"
24#include "xfs_quota.h" 25#include "xfs_quota.h"
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index f24dbe5efde3..f2d1718c9165 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -119,6 +119,8 @@ mempool_t *xfs_ioend_pool;
119#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ 119#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */
120#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ 120#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */
121#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ 121#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */
122#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
123#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
122 124
123/* 125/*
124 * Table driven mount option parser. 126 * Table driven mount option parser.
@@ -374,6 +376,13 @@ xfs_parseargs(
374 mp->m_flags |= XFS_MOUNT_DMAPI; 376 mp->m_flags |= XFS_MOUNT_DMAPI;
375 } else if (!strcmp(this_char, MNTOPT_DMI)) { 377 } else if (!strcmp(this_char, MNTOPT_DMI)) {
376 mp->m_flags |= XFS_MOUNT_DMAPI; 378 mp->m_flags |= XFS_MOUNT_DMAPI;
379 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
380 mp->m_flags |= XFS_MOUNT_DELAYLOG;
381 cmn_err(CE_WARN,
382 "Enabling EXPERIMENTAL delayed logging feature "
383 "- use at your own risk.\n");
384 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
385 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
377 } else if (!strcmp(this_char, "ihashsize")) { 386 } else if (!strcmp(this_char, "ihashsize")) {
378 cmn_err(CE_WARN, 387 cmn_err(CE_WARN,
379 "XFS: ihashsize no longer used, option is deprecated."); 388 "XFS: ihashsize no longer used, option is deprecated.");
@@ -535,6 +544,7 @@ xfs_showargs(
535 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, 544 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
536 { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI }, 545 { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI },
537 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 546 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
547 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
538 { 0, NULL } 548 { 0, NULL }
539 }; 549 };
540 static struct proc_xfs_info xfs_info_unset[] = { 550 static struct proc_xfs_info xfs_info_unset[] = {
@@ -1755,7 +1765,7 @@ xfs_init_zones(void)
1755 * but it is much faster. 1765 * but it is much faster.
1756 */ 1766 */
1757 xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + 1767 xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
1758 (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / 1768 (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
1759 NBWORD) * sizeof(int))), "xfs_buf_item"); 1769 NBWORD) * sizeof(int))), "xfs_buf_item");
1760 if (!xfs_buf_item_zone) 1770 if (!xfs_buf_item_zone)
1761 goto out_destroy_trans_zone; 1771 goto out_destroy_trans_zone;
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 8a319cfd2901..ff6bc797baf2 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -1059,83 +1059,112 @@ TRACE_EVENT(xfs_bunmap,
1059 1059
1060); 1060);
1061 1061
1062#define XFS_BUSY_SYNC \
1063 { 0, "async" }, \
1064 { 1, "sync" }
1065
1062TRACE_EVENT(xfs_alloc_busy, 1066TRACE_EVENT(xfs_alloc_busy,
1063 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, 1067 TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
1064 xfs_extlen_t len, int slot), 1068 xfs_agblock_t agbno, xfs_extlen_t len, int sync),
1065 TP_ARGS(mp, agno, agbno, len, slot), 1069 TP_ARGS(trans, agno, agbno, len, sync),
1066 TP_STRUCT__entry( 1070 TP_STRUCT__entry(
1067 __field(dev_t, dev) 1071 __field(dev_t, dev)
1072 __field(struct xfs_trans *, tp)
1073 __field(int, tid)
1068 __field(xfs_agnumber_t, agno) 1074 __field(xfs_agnumber_t, agno)
1069 __field(xfs_agblock_t, agbno) 1075 __field(xfs_agblock_t, agbno)
1070 __field(xfs_extlen_t, len) 1076 __field(xfs_extlen_t, len)
1071 __field(int, slot) 1077 __field(int, sync)
1072 ), 1078 ),
1073 TP_fast_assign( 1079 TP_fast_assign(
1074 __entry->dev = mp->m_super->s_dev; 1080 __entry->dev = trans->t_mountp->m_super->s_dev;
1081 __entry->tp = trans;
1082 __entry->tid = trans->t_ticket->t_tid;
1075 __entry->agno = agno; 1083 __entry->agno = agno;
1076 __entry->agbno = agbno; 1084 __entry->agbno = agbno;
1077 __entry->len = len; 1085 __entry->len = len;
1078 __entry->slot = slot; 1086 __entry->sync = sync;
1079 ), 1087 ),
1080 TP_printk("dev %d:%d agno %u agbno %u len %u slot %d", 1088 TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
1081 MAJOR(__entry->dev), MINOR(__entry->dev), 1089 MAJOR(__entry->dev), MINOR(__entry->dev),
1090 __entry->tp,
1091 __entry->tid,
1082 __entry->agno, 1092 __entry->agno,
1083 __entry->agbno, 1093 __entry->agbno,
1084 __entry->len, 1094 __entry->len,
1085 __entry->slot) 1095 __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
1086 1096
1087); 1097);
1088 1098
1089#define XFS_BUSY_STATES \
1090 { 0, "found" }, \
1091 { 1, "missing" }
1092
1093TRACE_EVENT(xfs_alloc_unbusy, 1099TRACE_EVENT(xfs_alloc_unbusy,
1094 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, 1100 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1095 int slot, int found), 1101 xfs_agblock_t agbno, xfs_extlen_t len),
1096 TP_ARGS(mp, agno, slot, found), 1102 TP_ARGS(mp, agno, agbno, len),
1097 TP_STRUCT__entry( 1103 TP_STRUCT__entry(
1098 __field(dev_t, dev) 1104 __field(dev_t, dev)
1099 __field(xfs_agnumber_t, agno) 1105 __field(xfs_agnumber_t, agno)
1100 __field(int, slot) 1106 __field(xfs_agblock_t, agbno)
1101 __field(int, found) 1107 __field(xfs_extlen_t, len)
1102 ), 1108 ),
1103 TP_fast_assign( 1109 TP_fast_assign(
1104 __entry->dev = mp->m_super->s_dev; 1110 __entry->dev = mp->m_super->s_dev;
1105 __entry->agno = agno; 1111 __entry->agno = agno;
1106 __entry->slot = slot; 1112 __entry->agbno = agbno;
1107 __entry->found = found; 1113 __entry->len = len;
1108 ), 1114 ),
1109 TP_printk("dev %d:%d agno %u slot %d %s", 1115 TP_printk("dev %d:%d agno %u agbno %u len %u",
1110 MAJOR(__entry->dev), MINOR(__entry->dev), 1116 MAJOR(__entry->dev), MINOR(__entry->dev),
1111 __entry->agno, 1117 __entry->agno,
1112 __entry->slot, 1118 __entry->agbno,
1113 __print_symbolic(__entry->found, XFS_BUSY_STATES)) 1119 __entry->len)
1114); 1120);
1115 1121
1122#define XFS_BUSY_STATES \
1123 { 0, "missing" }, \
1124 { 1, "found" }
1125
1116TRACE_EVENT(xfs_alloc_busysearch, 1126TRACE_EVENT(xfs_alloc_busysearch,
1117 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, 1127 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1118 xfs_extlen_t len, xfs_lsn_t lsn), 1128 xfs_agblock_t agbno, xfs_extlen_t len, int found),
1119 TP_ARGS(mp, agno, agbno, len, lsn), 1129 TP_ARGS(mp, agno, agbno, len, found),
1120 TP_STRUCT__entry( 1130 TP_STRUCT__entry(
1121 __field(dev_t, dev) 1131 __field(dev_t, dev)
1122 __field(xfs_agnumber_t, agno) 1132 __field(xfs_agnumber_t, agno)
1123 __field(xfs_agblock_t, agbno) 1133 __field(xfs_agblock_t, agbno)
1124 __field(xfs_extlen_t, len) 1134 __field(xfs_extlen_t, len)
1125 __field(xfs_lsn_t, lsn) 1135 __field(int, found)
1126 ), 1136 ),
1127 TP_fast_assign( 1137 TP_fast_assign(
1128 __entry->dev = mp->m_super->s_dev; 1138 __entry->dev = mp->m_super->s_dev;
1129 __entry->agno = agno; 1139 __entry->agno = agno;
1130 __entry->agbno = agbno; 1140 __entry->agbno = agbno;
1131 __entry->len = len; 1141 __entry->len = len;
1132 __entry->lsn = lsn; 1142 __entry->found = found;
1133 ), 1143 ),
1134 TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx", 1144 TP_printk("dev %d:%d agno %u agbno %u len %u %s",
1135 MAJOR(__entry->dev), MINOR(__entry->dev), 1145 MAJOR(__entry->dev), MINOR(__entry->dev),
1136 __entry->agno, 1146 __entry->agno,
1137 __entry->agbno, 1147 __entry->agbno,
1138 __entry->len, 1148 __entry->len,
1149 __print_symbolic(__entry->found, XFS_BUSY_STATES))
1150);
1151
1152TRACE_EVENT(xfs_trans_commit_lsn,
1153 TP_PROTO(struct xfs_trans *trans),
1154 TP_ARGS(trans),
1155 TP_STRUCT__entry(
1156 __field(dev_t, dev)
1157 __field(struct xfs_trans *, tp)
1158 __field(xfs_lsn_t, lsn)
1159 ),
1160 TP_fast_assign(
1161 __entry->dev = trans->t_mountp->m_super->s_dev;
1162 __entry->tp = trans;
1163 __entry->lsn = trans->t_commit_lsn;
1164 ),
1165 TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
1166 MAJOR(__entry->dev), MINOR(__entry->dev),
1167 __entry->tp,
1139 __entry->lsn) 1168 __entry->lsn)
1140); 1169);
1141 1170
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index b89ec5df0129..585e7633dfc7 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -344,9 +344,9 @@ xfs_qm_init_dquot_blk(
344 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) 344 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
345 xfs_qm_dqinit_core(curid, type, d); 345 xfs_qm_dqinit_core(curid, type, d);
346 xfs_trans_dquot_buf(tp, bp, 346 xfs_trans_dquot_buf(tp, bp,
347 (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : 347 (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
348 ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : 348 ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
349 XFS_BLI_GDQUOT_BUF))); 349 XFS_BLF_GDQUOT_BUF)));
350 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); 350 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
351} 351}
352 352
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index abb8222b88c9..401f364ad36c 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -175,14 +175,20 @@ typedef struct xfs_agfl {
175} xfs_agfl_t; 175} xfs_agfl_t;
176 176
177/* 177/*
178 * Busy block/extent entry. Used in perag to mark blocks that have been freed 178 * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
179 * but whose transactions aren't committed to disk yet. 179 * have been freed but whose transactions aren't committed to disk yet.
180 *
181 * Note that we use the transaction ID to record the transaction, not the
182 * transaction structure itself. See xfs_alloc_busy_insert() for details.
180 */ 183 */
181typedef struct xfs_perag_busy { 184struct xfs_busy_extent {
182 xfs_agblock_t busy_start; 185 struct rb_node rb_node; /* ag by-bno indexed search tree */
183 xfs_extlen_t busy_length; 186 struct list_head list; /* transaction busy extent list */
184 struct xfs_trans *busy_tp; /* transaction that did the free */ 187 xfs_agnumber_t agno;
185} xfs_perag_busy_t; 188 xfs_agblock_t bno;
189 xfs_extlen_t length;
190 xlog_tid_t tid; /* transaction that created this */
191};
186 192
187/* 193/*
188 * Per-ag incore structure, copies of information in agf and agi, 194 * Per-ag incore structure, copies of information in agf and agi,
@@ -216,7 +222,8 @@ typedef struct xfs_perag {
216 xfs_agino_t pagl_leftrec; 222 xfs_agino_t pagl_leftrec;
217 xfs_agino_t pagl_rightrec; 223 xfs_agino_t pagl_rightrec;
218#ifdef __KERNEL__ 224#ifdef __KERNEL__
219 spinlock_t pagb_lock; /* lock for pagb_list */ 225 spinlock_t pagb_lock; /* lock for pagb_tree */
226 struct rb_root pagb_tree; /* ordered tree of busy extents */
220 227
221 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
222 229
@@ -226,7 +233,6 @@ typedef struct xfs_perag {
226 int pag_ici_reclaimable; /* reclaimable inodes */ 233 int pag_ici_reclaimable; /* reclaimable inodes */
227#endif 234#endif
228 int pagb_count; /* pagb slots in use */ 235 int pagb_count; /* pagb slots in use */
229 xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
230} xfs_perag_t; 236} xfs_perag_t;
231 237
232/* 238/*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 94cddbfb2560..a7fbe8a99b12 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -46,11 +46,9 @@
46#define XFSA_FIXUP_BNO_OK 1 46#define XFSA_FIXUP_BNO_OK 1
47#define XFSA_FIXUP_CNT_OK 2 47#define XFSA_FIXUP_CNT_OK 2
48 48
49STATIC void 49static int
50xfs_alloc_search_busy(xfs_trans_t *tp, 50xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
51 xfs_agnumber_t agno, 51 xfs_agblock_t bno, xfs_extlen_t len);
52 xfs_agblock_t bno,
53 xfs_extlen_t len);
54 52
55/* 53/*
56 * Prototypes for per-ag allocation routines 54 * Prototypes for per-ag allocation routines
@@ -540,9 +538,16 @@ xfs_alloc_ag_vextent(
540 be32_to_cpu(agf->agf_length)); 538 be32_to_cpu(agf->agf_length));
541 xfs_alloc_log_agf(args->tp, args->agbp, 539 xfs_alloc_log_agf(args->tp, args->agbp,
542 XFS_AGF_FREEBLKS); 540 XFS_AGF_FREEBLKS);
543 /* search the busylist for these blocks */ 541 /*
544 xfs_alloc_search_busy(args->tp, args->agno, 542 * Search the busylist for these blocks and mark the
545 args->agbno, args->len); 543 * transaction as synchronous if blocks are found. This
544 * avoids the need to block due to a synchronous log
545 * force to ensure correct ordering as the synchronous
546 * transaction will guarantee that for us.
547 */
548 if (xfs_alloc_busy_search(args->mp, args->agno,
549 args->agbno, args->len))
550 xfs_trans_set_sync(args->tp);
546 } 551 }
547 if (!args->isfl) 552 if (!args->isfl)
548 xfs_trans_mod_sb(args->tp, 553 xfs_trans_mod_sb(args->tp,
@@ -1693,7 +1698,7 @@ xfs_free_ag_extent(
1693 * when the iclog commits to disk. If a busy block is allocated, 1698 * when the iclog commits to disk. If a busy block is allocated,
1694 * the iclog is pushed up to the LSN that freed the block. 1699 * the iclog is pushed up to the LSN that freed the block.
1695 */ 1700 */
1696 xfs_alloc_mark_busy(tp, agno, bno, len); 1701 xfs_alloc_busy_insert(tp, agno, bno, len);
1697 return 0; 1702 return 0;
1698 1703
1699 error0: 1704 error0:
@@ -1989,14 +1994,20 @@ xfs_alloc_get_freelist(
1989 *bnop = bno; 1994 *bnop = bno;
1990 1995
1991 /* 1996 /*
1992 * As blocks are freed, they are added to the per-ag busy list 1997 * As blocks are freed, they are added to the per-ag busy list and
1993 * and remain there until the freeing transaction is committed to 1998 * remain there until the freeing transaction is committed to disk.
1994 * disk. Now that we have allocated blocks, this list must be 1999 * Now that we have allocated blocks, this list must be searched to see
1995 * searched to see if a block is being reused. If one is, then 2000 * if a block is being reused. If one is, then the freeing transaction
1996 * the freeing transaction must be pushed to disk NOW by forcing 2001 * must be pushed to disk before this transaction.
1997 * to disk all iclogs up that transaction's LSN. 2002 *
2003 * We do this by setting the current transaction to a sync transaction
2004 * which guarantees that the freeing transaction is on disk before this
2005 * transaction. This is done instead of a synchronous log force here so
2006 * that we don't sit and wait with the AGF locked in the transaction
2007 * during the log force.
1998 */ 2008 */
1999 xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1); 2009 if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
2010 xfs_trans_set_sync(tp);
2000 return 0; 2011 return 0;
2001} 2012}
2002 2013
@@ -2201,7 +2212,7 @@ xfs_alloc_read_agf(
2201 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); 2212 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
2202 spin_lock_init(&pag->pagb_lock); 2213 spin_lock_init(&pag->pagb_lock);
2203 pag->pagb_count = 0; 2214 pag->pagb_count = 0;
2204 memset(pag->pagb_list, 0, sizeof(pag->pagb_list)); 2215 pag->pagb_tree = RB_ROOT;
2205 pag->pagf_init = 1; 2216 pag->pagf_init = 1;
2206 } 2217 }
2207#ifdef DEBUG 2218#ifdef DEBUG
@@ -2479,127 +2490,263 @@ error0:
2479 * list is reused, the transaction that freed it must be forced to disk 2490 * list is reused, the transaction that freed it must be forced to disk
2480 * before continuing to use the block. 2491 * before continuing to use the block.
2481 * 2492 *
2482 * xfs_alloc_mark_busy - add to the per-ag busy list 2493 * xfs_alloc_busy_insert - add to the per-ag busy list
2483 * xfs_alloc_clear_busy - remove an item from the per-ag busy list 2494 * xfs_alloc_busy_clear - remove an item from the per-ag busy list
2495 * xfs_alloc_busy_search - search for a busy extent
2496 */
2497
2498/*
2499 * Insert a new extent into the busy tree.
2500 *
2501 * The busy extent tree is indexed by the start block of the busy extent.
2502 * There can be multiple overlapping ranges in the busy extent tree but only
2503 * ever one entry at a given start block. The reason for this is that
2504 * multi-block extents can be freed, then smaller chunks of that extent
2505 * allocated and freed again before the first transaction commit is on disk.
2506 * If the exact same start block is freed a second time, we have to wait for
2507 * that busy extent to pass out of the tree before the new extent is inserted.
2508 * There are two main cases we have to handle here.
2509 *
2510 * The first case is a transaction that triggers a "free - allocate - free"
2511 * cycle. This can occur during btree manipulations as a btree block is freed
2512 * to the freelist, then allocated from the free list, then freed again. In
2513 * this case, the second extent free is what triggers the duplicate and as
2514 * such the transaction IDs should match. Because the extent was allocated in
2515 * this transaction, the transaction must be marked as synchronous. This is
2516 * true for all cases where the free/alloc/free occurs in the one transaction,
2517 * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
2518 * This serves to catch violations of the second case quite effectively.
2519 *
2520 * The second case is where the free/alloc/free occur in different
2521 * transactions. In this case, the thread freeing the extent the second time
2522 * can't mark the extent busy immediately because it is already tracked in a
2523 * transaction that may be committing. When the log commit for the existing
2524 * busy extent completes, the busy extent will be removed from the tree. If we
2525 * allow the second busy insert to continue using that busy extent structure,
2526 * it can be freed before this transaction is safely in the log. Hence our
2527 * only option in this case is to force the log to remove the existing busy
2528 * extent from the list before we insert the new one with the current
2529 * transaction ID.
2530 *
2531 * The problem we are trying to avoid in the free-alloc-free in separate
2532 * transactions is most easily described with a timeline:
2533 *
2534 * Thread 1 Thread 2 Thread 3 xfslogd
2535 * xact alloc
2536 * free X
2537 * mark busy
2538 * commit xact
2539 * free xact
2540 * xact alloc
2541 * alloc X
2542 * busy search
2543 * mark xact sync
2544 * commit xact
2545 * free xact
2546 * force log
2547 * checkpoint starts
2548 * ....
2549 * xact alloc
2550 * free X
2551 * mark busy
2552 * finds match
2553 * *** KABOOM! ***
2554 * ....
2555 * log IO completes
2556 * unbusy X
2557 * checkpoint completes
2558 *
2559 * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
2560 * the checkpoint completes, and the busy extent it matched will have been
2561 * removed from the tree when it is woken. Hence it can then continue safely.
2562 *
2563 * However, to ensure this matching process is robust, we need to use the
2564 * transaction ID for identifying transaction, as delayed logging results in
2565 * the busy extent and transaction lifecycles being different. i.e. the busy
2566 * extent is active for a lot longer than the transaction. Hence the
2567 * transaction structure can be freed and reallocated, then mark the same
2568 * extent busy again in the new transaction. In this case the new transaction
2569 * will have a different tid but can have the same address, and hence we need
2570 * to check against the tid.
2571 *
2572 * Future: for delayed logging, we could avoid the log force if the extent was
2573 * first freed in the current checkpoint sequence. This, however, requires the
2574 * ability to pin the current checkpoint in memory until this transaction
2575 * commits to ensure that both the original free and the current one combine
2576 * logically into the one checkpoint. If the checkpoint sequences are
2577 * different, however, we still need to wait on a log force.
2484 */ 2578 */
2485void 2579void
2486xfs_alloc_mark_busy(xfs_trans_t *tp, 2580xfs_alloc_busy_insert(
2487 xfs_agnumber_t agno, 2581 struct xfs_trans *tp,
2488 xfs_agblock_t bno, 2582 xfs_agnumber_t agno,
2489 xfs_extlen_t len) 2583 xfs_agblock_t bno,
2584 xfs_extlen_t len)
2490{ 2585{
2491 xfs_perag_busy_t *bsy; 2586 struct xfs_busy_extent *new;
2587 struct xfs_busy_extent *busyp;
2492 struct xfs_perag *pag; 2588 struct xfs_perag *pag;
2493 int n; 2589 struct rb_node **rbp;
2590 struct rb_node *parent;
2591 int match;
2494 2592
2495 pag = xfs_perag_get(tp->t_mountp, agno);
2496 spin_lock(&pag->pagb_lock);
2497 2593
2498 /* search pagb_list for an open slot */ 2594 new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
2499 for (bsy = pag->pagb_list, n = 0; 2595 if (!new) {
2500 n < XFS_PAGB_NUM_SLOTS; 2596 /*
2501 bsy++, n++) { 2597 * No Memory! Since it is now not possible to track the free
2502 if (bsy->busy_tp == NULL) { 2598 * block, make this a synchronous transaction to insure that
2503 break; 2599 * the block is not reused before this transaction commits.
2504 } 2600 */
2601 trace_xfs_alloc_busy(tp, agno, bno, len, 1);
2602 xfs_trans_set_sync(tp);
2603 return;
2505 } 2604 }
2506 2605
2507 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n); 2606 new->agno = agno;
2607 new->bno = bno;
2608 new->length = len;
2609 new->tid = xfs_log_get_trans_ident(tp);
2508 2610
2509 if (n < XFS_PAGB_NUM_SLOTS) { 2611 INIT_LIST_HEAD(&new->list);
2510 bsy = &pag->pagb_list[n]; 2612
2511 pag->pagb_count++; 2613 /* trace before insert to be able to see failed inserts */
2512 bsy->busy_start = bno; 2614 trace_xfs_alloc_busy(tp, agno, bno, len, 0);
2513 bsy->busy_length = len; 2615
2514 bsy->busy_tp = tp; 2616 pag = xfs_perag_get(tp->t_mountp, new->agno);
2515 xfs_trans_add_busy(tp, agno, n); 2617restart:
2516 } else { 2618 spin_lock(&pag->pagb_lock);
2619 rbp = &pag->pagb_tree.rb_node;
2620 parent = NULL;
2621 busyp = NULL;
2622 match = 0;
2623 while (*rbp && match >= 0) {
2624 parent = *rbp;
2625 busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
2626
2627 if (new->bno < busyp->bno) {
2628 /* may overlap, but exact start block is lower */
2629 rbp = &(*rbp)->rb_left;
2630 if (new->bno + new->length > busyp->bno)
2631 match = busyp->tid == new->tid ? 1 : -1;
2632 } else if (new->bno > busyp->bno) {
2633 /* may overlap, but exact start block is higher */
2634 rbp = &(*rbp)->rb_right;
2635 if (bno < busyp->bno + busyp->length)
2636 match = busyp->tid == new->tid ? 1 : -1;
2637 } else {
2638 match = busyp->tid == new->tid ? 1 : -1;
2639 break;
2640 }
2641 }
2642 if (match < 0) {
2643 /* overlap marked busy in different transaction */
2644 spin_unlock(&pag->pagb_lock);
2645 xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
2646 goto restart;
2647 }
2648 if (match > 0) {
2517 /* 2649 /*
2518 * The busy list is full! Since it is now not possible to 2650 * overlap marked busy in same transaction. Update if exact
2519 * track the free block, make this a synchronous transaction 2651 * start block match, otherwise combine the busy extents into
2520 * to insure that the block is not reused before this 2652 * a single range.
2521 * transaction commits.
2522 */ 2653 */
2523 xfs_trans_set_sync(tp); 2654 if (busyp->bno == new->bno) {
2524 } 2655 busyp->length = max(busyp->length, new->length);
2656 spin_unlock(&pag->pagb_lock);
2657 ASSERT(tp->t_flags & XFS_TRANS_SYNC);
2658 xfs_perag_put(pag);
2659 kmem_free(new);
2660 return;
2661 }
2662 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2663 new->length = max(busyp->bno + busyp->length,
2664 new->bno + new->length) -
2665 min(busyp->bno, new->bno);
2666 new->bno = min(busyp->bno, new->bno);
2667 } else
2668 busyp = NULL;
2525 2669
2670 rb_link_node(&new->rb_node, parent, rbp);
2671 rb_insert_color(&new->rb_node, &pag->pagb_tree);
2672
2673 list_add(&new->list, &tp->t_busy);
2526 spin_unlock(&pag->pagb_lock); 2674 spin_unlock(&pag->pagb_lock);
2527 xfs_perag_put(pag); 2675 xfs_perag_put(pag);
2676 kmem_free(busyp);
2528} 2677}
2529 2678
2530void 2679/*
2531xfs_alloc_clear_busy(xfs_trans_t *tp, 2680 * Search for a busy extent within the range of the extent we are about to
2532 xfs_agnumber_t agno, 2681 * allocate. You need to be holding the busy extent tree lock when calling
2533 int idx) 2682 * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy
2683 * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
2684 * match. This is done so that a non-zero return indicates an overlap that
2685 * will require a synchronous transaction, but it can still be
2686 * used to distinguish between a partial or exact match.
2687 */
2688static int
2689xfs_alloc_busy_search(
2690 struct xfs_mount *mp,
2691 xfs_agnumber_t agno,
2692 xfs_agblock_t bno,
2693 xfs_extlen_t len)
2534{ 2694{
2535 struct xfs_perag *pag; 2695 struct xfs_perag *pag;
2536 xfs_perag_busy_t *list; 2696 struct rb_node *rbp;
2697 struct xfs_busy_extent *busyp;
2698 int match = 0;
2537 2699
2538 ASSERT(idx < XFS_PAGB_NUM_SLOTS); 2700 pag = xfs_perag_get(mp, agno);
2539 pag = xfs_perag_get(tp->t_mountp, agno);
2540 spin_lock(&pag->pagb_lock); 2701 spin_lock(&pag->pagb_lock);
2541 list = pag->pagb_list;
2542 2702
2543 trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp); 2703 rbp = pag->pagb_tree.rb_node;
2544 2704
2545 if (list[idx].busy_tp == tp) { 2705 /* find closest start bno overlap */
2546 list[idx].busy_tp = NULL; 2706 while (rbp) {
2547 pag->pagb_count--; 2707 busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
2708 if (bno < busyp->bno) {
2709 /* may overlap, but exact start block is lower */
2710 if (bno + len > busyp->bno)
2711 match = -1;
2712 rbp = rbp->rb_left;
2713 } else if (bno > busyp->bno) {
2714 /* may overlap, but exact start block is higher */
2715 if (bno < busyp->bno + busyp->length)
2716 match = -1;
2717 rbp = rbp->rb_right;
2718 } else {
2719 /* bno matches busyp, length determines exact match */
2720 match = (busyp->length == len) ? 1 : -1;
2721 break;
2722 }
2548 } 2723 }
2549
2550 spin_unlock(&pag->pagb_lock); 2724 spin_unlock(&pag->pagb_lock);
2725 trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
2551 xfs_perag_put(pag); 2726 xfs_perag_put(pag);
2727 return match;
2552} 2728}
2553 2729
2554 2730void
2555/* 2731xfs_alloc_busy_clear(
2556 * If we find the extent in the busy list, force the log out to get the 2732 struct xfs_mount *mp,
2557 * extent out of the busy list so the caller can use it straight away. 2733 struct xfs_busy_extent *busyp)
2558 */
2559STATIC void
2560xfs_alloc_search_busy(xfs_trans_t *tp,
2561 xfs_agnumber_t agno,
2562 xfs_agblock_t bno,
2563 xfs_extlen_t len)
2564{ 2734{
2565 struct xfs_perag *pag; 2735 struct xfs_perag *pag;
2566 xfs_perag_busy_t *bsy;
2567 xfs_agblock_t uend, bend;
2568 xfs_lsn_t lsn = 0;
2569 int cnt;
2570 2736
2571 pag = xfs_perag_get(tp->t_mountp, agno); 2737 trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
2572 spin_lock(&pag->pagb_lock); 2738 busyp->length);
2573 cnt = pag->pagb_count;
2574 2739
2575 /* 2740 ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
2576 * search pagb_list for this slot, skipping open slots. We have to 2741 busyp->length) == 1);
2577 * search the entire array as there may be multiple overlaps and
2578 * we have to get the most recent LSN for the log force to push out
2579 * all the transactions that span the range.
2580 */
2581 uend = bno + len - 1;
2582 for (cnt = 0; cnt < pag->pagb_count; cnt++) {
2583 bsy = &pag->pagb_list[cnt];
2584 if (!bsy->busy_tp)
2585 continue;
2586 2742
2587 bend = bsy->busy_start + bsy->busy_length - 1; 2743 list_del_init(&busyp->list);
2588 if (bno > bend || uend < bsy->busy_start)
2589 continue;
2590 2744
2591 /* (start1,length1) within (start2, length2) */ 2745 pag = xfs_perag_get(mp, busyp->agno);
2592 if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) 2746 spin_lock(&pag->pagb_lock);
2593 lsn = bsy->busy_tp->t_commit_lsn; 2747 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2594 }
2595 spin_unlock(&pag->pagb_lock); 2748 spin_unlock(&pag->pagb_lock);
2596 xfs_perag_put(pag); 2749 xfs_perag_put(pag);
2597 trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
2598 2750
2599 /* 2751 kmem_free(busyp);
2600 * If a block was found, force the log through the LSN of the
2601 * transaction that freed the block
2602 */
2603 if (lsn)
2604 xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC);
2605} 2752}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 599bffa39784..6d05199b667c 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -22,6 +22,7 @@ struct xfs_buf;
22struct xfs_mount; 22struct xfs_mount;
23struct xfs_perag; 23struct xfs_perag;
24struct xfs_trans; 24struct xfs_trans;
25struct xfs_busy_extent;
25 26
26/* 27/*
27 * Freespace allocation types. Argument to xfs_alloc_[v]extent. 28 * Freespace allocation types. Argument to xfs_alloc_[v]extent.
@@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
119#ifdef __KERNEL__ 120#ifdef __KERNEL__
120 121
121void 122void
122xfs_alloc_mark_busy(xfs_trans_t *tp, 123xfs_alloc_busy_insert(xfs_trans_t *tp,
123 xfs_agnumber_t agno, 124 xfs_agnumber_t agno,
124 xfs_agblock_t bno, 125 xfs_agblock_t bno,
125 xfs_extlen_t len); 126 xfs_extlen_t len);
126 127
127void 128void
128xfs_alloc_clear_busy(xfs_trans_t *tp, 129xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
129 xfs_agnumber_t ag,
130 int idx);
131 130
132#endif /* __KERNEL__ */ 131#endif /* __KERNEL__ */
133 132
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index b726e10d2c1c..83f494218759 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -134,7 +134,7 @@ xfs_allocbt_free_block(
134 * disk. If a busy block is allocated, the iclog is pushed up to the 134 * disk. If a busy block is allocated, the iclog is pushed up to the
135 * LSN that freed the block. 135 * LSN that freed the block.
136 */ 136 */
137 xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); 137 xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
138 xfs_trans_agbtree_delta(cur->bc_tp, -1); 138 xfs_trans_agbtree_delta(cur->bc_tp, -1);
139 return 0; 139 return 0;
140} 140}
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 240340a4727b..02a80984aa05 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -64,7 +64,7 @@ xfs_buf_item_log_debug(
64 nbytes = last - first + 1; 64 nbytes = last - first + 1;
65 bfset(bip->bli_logged, first, nbytes); 65 bfset(bip->bli_logged, first, nbytes);
66 for (x = 0; x < nbytes; x++) { 66 for (x = 0; x < nbytes; x++) {
67 chunk_num = byte >> XFS_BLI_SHIFT; 67 chunk_num = byte >> XFS_BLF_SHIFT;
68 word_num = chunk_num >> BIT_TO_WORD_SHIFT; 68 word_num = chunk_num >> BIT_TO_WORD_SHIFT;
69 bit_num = chunk_num & (NBWORD - 1); 69 bit_num = chunk_num & (NBWORD - 1);
70 wordp = &(bip->bli_format.blf_data_map[word_num]); 70 wordp = &(bip->bli_format.blf_data_map[word_num]);
@@ -166,7 +166,7 @@ xfs_buf_item_size(
166 * cancel flag in it. 166 * cancel flag in it.
167 */ 167 */
168 trace_xfs_buf_item_size_stale(bip); 168 trace_xfs_buf_item_size_stale(bip);
169 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 169 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
170 return 1; 170 return 1;
171 } 171 }
172 172
@@ -197,9 +197,9 @@ xfs_buf_item_size(
197 } else if (next_bit != last_bit + 1) { 197 } else if (next_bit != last_bit + 1) {
198 last_bit = next_bit; 198 last_bit = next_bit;
199 nvecs++; 199 nvecs++;
200 } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != 200 } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
201 (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + 201 (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
202 XFS_BLI_CHUNK)) { 202 XFS_BLF_CHUNK)) {
203 last_bit = next_bit; 203 last_bit = next_bit;
204 nvecs++; 204 nvecs++;
205 } else { 205 } else {
@@ -254,6 +254,20 @@ xfs_buf_item_format(
254 vecp++; 254 vecp++;
255 nvecs = 1; 255 nvecs = 1;
256 256
257 /*
258 * If it is an inode buffer, transfer the in-memory state to the
259 * format flags and clear the in-memory state. We do not transfer
260 * this state if the inode buffer allocation has not yet been committed
261 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
262 * correct replay of the inode allocation.
263 */
264 if (bip->bli_flags & XFS_BLI_INODE_BUF) {
265 if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
266 xfs_log_item_in_current_chkpt(&bip->bli_item)))
267 bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
268 bip->bli_flags &= ~XFS_BLI_INODE_BUF;
269 }
270
257 if (bip->bli_flags & XFS_BLI_STALE) { 271 if (bip->bli_flags & XFS_BLI_STALE) {
258 /* 272 /*
259 * The buffer is stale, so all we need to log 273 * The buffer is stale, so all we need to log
@@ -261,7 +275,7 @@ xfs_buf_item_format(
261 * cancel flag in it. 275 * cancel flag in it.
262 */ 276 */
263 trace_xfs_buf_item_format_stale(bip); 277 trace_xfs_buf_item_format_stale(bip);
264 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 278 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
265 bip->bli_format.blf_size = nvecs; 279 bip->bli_format.blf_size = nvecs;
266 return; 280 return;
267 } 281 }
@@ -294,28 +308,28 @@ xfs_buf_item_format(
294 * keep counting and scanning. 308 * keep counting and scanning.
295 */ 309 */
296 if (next_bit == -1) { 310 if (next_bit == -1) {
297 buffer_offset = first_bit * XFS_BLI_CHUNK; 311 buffer_offset = first_bit * XFS_BLF_CHUNK;
298 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 312 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
299 vecp->i_len = nbits * XFS_BLI_CHUNK; 313 vecp->i_len = nbits * XFS_BLF_CHUNK;
300 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 314 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
301 nvecs++; 315 nvecs++;
302 break; 316 break;
303 } else if (next_bit != last_bit + 1) { 317 } else if (next_bit != last_bit + 1) {
304 buffer_offset = first_bit * XFS_BLI_CHUNK; 318 buffer_offset = first_bit * XFS_BLF_CHUNK;
305 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 319 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
306 vecp->i_len = nbits * XFS_BLI_CHUNK; 320 vecp->i_len = nbits * XFS_BLF_CHUNK;
307 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 321 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
308 nvecs++; 322 nvecs++;
309 vecp++; 323 vecp++;
310 first_bit = next_bit; 324 first_bit = next_bit;
311 last_bit = next_bit; 325 last_bit = next_bit;
312 nbits = 1; 326 nbits = 1;
313 } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != 327 } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) !=
314 (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + 328 (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) +
315 XFS_BLI_CHUNK)) { 329 XFS_BLF_CHUNK)) {
316 buffer_offset = first_bit * XFS_BLI_CHUNK; 330 buffer_offset = first_bit * XFS_BLF_CHUNK;
317 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 331 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
318 vecp->i_len = nbits * XFS_BLI_CHUNK; 332 vecp->i_len = nbits * XFS_BLF_CHUNK;
319 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 333 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
320/* You would think we need to bump the nvecs here too, but we do not 334/* You would think we need to bump the nvecs here too, but we do not
321 * this number is used by recovery, and it gets confused by the boundary 335 * this number is used by recovery, and it gets confused by the boundary
@@ -341,10 +355,15 @@ xfs_buf_item_format(
341} 355}
342 356
343/* 357/*
344 * This is called to pin the buffer associated with the buf log 358 * This is called to pin the buffer associated with the buf log item in memory
345 * item in memory so it cannot be written out. Simply call bpin() 359 * so it cannot be written out. Simply call bpin() on the buffer to do this.
346 * on the buffer to do this. 360 *
361 * We also always take a reference to the buffer log item here so that the bli
362 * is held while the item is pinned in memory. This means that we can
363 * unconditionally drop the reference count a transaction holds when the
364 * transaction is completed.
347 */ 365 */
366
348STATIC void 367STATIC void
349xfs_buf_item_pin( 368xfs_buf_item_pin(
350 xfs_buf_log_item_t *bip) 369 xfs_buf_log_item_t *bip)
@@ -356,6 +375,7 @@ xfs_buf_item_pin(
356 ASSERT(atomic_read(&bip->bli_refcount) > 0); 375 ASSERT(atomic_read(&bip->bli_refcount) > 0);
357 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 376 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
358 (bip->bli_flags & XFS_BLI_STALE)); 377 (bip->bli_flags & XFS_BLI_STALE));
378 atomic_inc(&bip->bli_refcount);
359 trace_xfs_buf_item_pin(bip); 379 trace_xfs_buf_item_pin(bip);
360 xfs_bpin(bp); 380 xfs_bpin(bp);
361} 381}
@@ -393,7 +413,7 @@ xfs_buf_item_unpin(
393 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 413 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
394 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 414 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
395 ASSERT(XFS_BUF_ISSTALE(bp)); 415 ASSERT(XFS_BUF_ISSTALE(bp));
396 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 416 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
397 trace_xfs_buf_item_unpin_stale(bip); 417 trace_xfs_buf_item_unpin_stale(bip);
398 418
399 /* 419 /*
@@ -489,20 +509,23 @@ xfs_buf_item_trylock(
489} 509}
490 510
491/* 511/*
492 * Release the buffer associated with the buf log item. 512 * Release the buffer associated with the buf log item. If there is no dirty
493 * If there is no dirty logged data associated with the 513 * logged data associated with the buffer recorded in the buf log item, then
494 * buffer recorded in the buf log item, then free the 514 * free the buf log item and remove the reference to it in the buffer.
495 * buf log item and remove the reference to it in the 515 *
496 * buffer. 516 * This call ignores the recursion count. It is only called when the buffer
517 * should REALLY be unlocked, regardless of the recursion count.
497 * 518 *
498 * This call ignores the recursion count. It is only called 519 * We unconditionally drop the transaction's reference to the log item. If the
499 * when the buffer should REALLY be unlocked, regardless 520 * item was logged, then another reference was taken when it was pinned, so we
500 * of the recursion count. 521 * can safely drop the transaction reference now. This also allows us to avoid
522 * potential races with the unpin code freeing the bli by not referencing the
523 * bli after we've dropped the reference count.
501 * 524 *
502 * If the XFS_BLI_HOLD flag is set in the buf log item, then 525 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
503 * free the log item if necessary but do not unlock the buffer. 526 * if necessary but do not unlock the buffer. This is for support of
504 * This is for support of xfs_trans_bhold(). Make sure the 527 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
505 * XFS_BLI_HOLD field is cleared if we don't free the item. 528 * free the item.
506 */ 529 */
507STATIC void 530STATIC void
508xfs_buf_item_unlock( 531xfs_buf_item_unlock(
@@ -514,73 +537,54 @@ xfs_buf_item_unlock(
514 537
515 bp = bip->bli_buf; 538 bp = bip->bli_buf;
516 539
517 /* 540 /* Clear the buffer's association with this transaction. */
518 * Clear the buffer's association with this transaction.
519 */
520 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 541 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
521 542
522 /* 543 /*
523 * If this is a transaction abort, don't return early. 544 * If this is a transaction abort, don't return early. Instead, allow
524 * Instead, allow the brelse to happen. 545 * the brelse to happen. Normally it would be done for stale
525 * Normally it would be done for stale (cancelled) buffers 546 * (cancelled) buffers at unpin time, but we'll never go through the
526 * at unpin time, but we'll never go through the pin/unpin 547 * pin/unpin cycle if we abort inside commit.
527 * cycle if we abort inside commit.
528 */ 548 */
529 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; 549 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
530 550
531 /* 551 /*
532 * If the buf item is marked stale, then don't do anything. 552 * Before possibly freeing the buf item, determine if we should
533 * We'll unlock the buffer and free the buf item when the 553 * release the buffer at the end of this routine.
534 * buffer is unpinned for the last time.
535 */ 554 */
536 if (bip->bli_flags & XFS_BLI_STALE) { 555 hold = bip->bli_flags & XFS_BLI_HOLD;
537 bip->bli_flags &= ~XFS_BLI_LOGGED; 556
538 trace_xfs_buf_item_unlock_stale(bip); 557 /* Clear the per transaction state. */
539 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 558 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
540 if (!aborted)
541 return;
542 }
543 559
544 /* 560 /*
545 * Drop the transaction's reference to the log item if 561 * If the buf item is marked stale, then don't do anything. We'll
546 * it was not logged as part of the transaction. Otherwise 562 * unlock the buffer and free the buf item when the buffer is unpinned
547 * we'll drop the reference in xfs_buf_item_unpin() when 563 * for the last time.
548 * the transaction is really through with the buffer.
549 */ 564 */
550 if (!(bip->bli_flags & XFS_BLI_LOGGED)) { 565 if (bip->bli_flags & XFS_BLI_STALE) {
551 atomic_dec(&bip->bli_refcount); 566 trace_xfs_buf_item_unlock_stale(bip);
552 } else { 567 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
553 /* 568 if (!aborted) {
554 * Clear the logged flag since this is per 569 atomic_dec(&bip->bli_refcount);
555 * transaction state. 570 return;
556 */ 571 }
557 bip->bli_flags &= ~XFS_BLI_LOGGED;
558 } 572 }
559 573
560 /*
561 * Before possibly freeing the buf item, determine if we should
562 * release the buffer at the end of this routine.
563 */
564 hold = bip->bli_flags & XFS_BLI_HOLD;
565 trace_xfs_buf_item_unlock(bip); 574 trace_xfs_buf_item_unlock(bip);
566 575
567 /* 576 /*
568 * If the buf item isn't tracking any data, free it. 577 * If the buf item isn't tracking any data, free it, otherwise drop the
569 * Otherwise, if XFS_BLI_HOLD is set clear it. 578 * reference we hold to it.
570 */ 579 */
571 if (xfs_bitmap_empty(bip->bli_format.blf_data_map, 580 if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
572 bip->bli_format.blf_map_size)) { 581 bip->bli_format.blf_map_size))
573 xfs_buf_item_relse(bp); 582 xfs_buf_item_relse(bp);
574 } else if (hold) { 583 else
575 bip->bli_flags &= ~XFS_BLI_HOLD; 584 atomic_dec(&bip->bli_refcount);
576 }
577 585
578 /* 586 if (!hold)
579 * Release the buffer if XFS_BLI_HOLD was not set.
580 */
581 if (!hold) {
582 xfs_buf_relse(bp); 587 xfs_buf_relse(bp);
583 }
584} 588}
585 589
586/* 590/*
@@ -717,12 +721,12 @@ xfs_buf_item_init(
717 } 721 }
718 722
719 /* 723 /*
720 * chunks is the number of XFS_BLI_CHUNK size pieces 724 * chunks is the number of XFS_BLF_CHUNK size pieces
721 * the buffer can be divided into. Make sure not to 725 * the buffer can be divided into. Make sure not to
722 * truncate any pieces. map_size is the size of the 726 * truncate any pieces. map_size is the size of the
723 * bitmap needed to describe the chunks of the buffer. 727 * bitmap needed to describe the chunks of the buffer.
724 */ 728 */
725 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); 729 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT);
726 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); 730 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
727 731
728 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, 732 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
@@ -790,8 +794,8 @@ xfs_buf_item_log(
790 /* 794 /*
791 * Convert byte offsets to bit numbers. 795 * Convert byte offsets to bit numbers.
792 */ 796 */
793 first_bit = first >> XFS_BLI_SHIFT; 797 first_bit = first >> XFS_BLF_SHIFT;
794 last_bit = last >> XFS_BLI_SHIFT; 798 last_bit = last >> XFS_BLF_SHIFT;
795 799
796 /* 800 /*
797 * Calculate the total number of bits to be set. 801 * Calculate the total number of bits to be set.
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index df4454511f73..f20bb472d582 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -41,22 +41,22 @@ typedef struct xfs_buf_log_format {
41 * This flag indicates that the buffer contains on disk inodes 41 * This flag indicates that the buffer contains on disk inodes
42 * and requires special recovery handling. 42 * and requires special recovery handling.
43 */ 43 */
44#define XFS_BLI_INODE_BUF 0x1 44#define XFS_BLF_INODE_BUF 0x1
45/* 45/*
46 * This flag indicates that the buffer should not be replayed 46 * This flag indicates that the buffer should not be replayed
47 * during recovery because its blocks are being freed. 47 * during recovery because its blocks are being freed.
48 */ 48 */
49#define XFS_BLI_CANCEL 0x2 49#define XFS_BLF_CANCEL 0x2
50/* 50/*
51 * This flag indicates that the buffer contains on disk 51 * This flag indicates that the buffer contains on disk
52 * user or group dquots and may require special recovery handling. 52 * user or group dquots and may require special recovery handling.
53 */ 53 */
54#define XFS_BLI_UDQUOT_BUF 0x4 54#define XFS_BLF_UDQUOT_BUF 0x4
55#define XFS_BLI_PDQUOT_BUF 0x8 55#define XFS_BLF_PDQUOT_BUF 0x8
56#define XFS_BLI_GDQUOT_BUF 0x10 56#define XFS_BLF_GDQUOT_BUF 0x10
57 57
58#define XFS_BLI_CHUNK 128 58#define XFS_BLF_CHUNK 128
59#define XFS_BLI_SHIFT 7 59#define XFS_BLF_SHIFT 7
60#define BIT_TO_WORD_SHIFT 5 60#define BIT_TO_WORD_SHIFT 5
61#define NBWORD (NBBY * sizeof(unsigned int)) 61#define NBWORD (NBBY * sizeof(unsigned int))
62 62
@@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format {
69#define XFS_BLI_LOGGED 0x08 69#define XFS_BLI_LOGGED 0x08
70#define XFS_BLI_INODE_ALLOC_BUF 0x10 70#define XFS_BLI_INODE_ALLOC_BUF 0x10
71#define XFS_BLI_STALE_INODE 0x20 71#define XFS_BLI_STALE_INODE 0x20
72#define XFS_BLI_INODE_BUF 0x40
72 73
73#define XFS_BLI_FLAGS \ 74#define XFS_BLI_FLAGS \
74 { XFS_BLI_HOLD, "HOLD" }, \ 75 { XFS_BLI_HOLD, "HOLD" }, \
@@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format {
76 { XFS_BLI_STALE, "STALE" }, \ 77 { XFS_BLI_STALE, "STALE" }, \
77 { XFS_BLI_LOGGED, "LOGGED" }, \ 78 { XFS_BLI_LOGGED, "LOGGED" }, \
78 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ 79 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
79 { XFS_BLI_STALE_INODE, "STALE_INODE" } 80 { XFS_BLI_STALE_INODE, "STALE_INODE" }, \
81 { XFS_BLI_INODE_BUF, "INODE_BUF" }
80 82
81 83
82#ifdef __KERNEL__ 84#ifdef __KERNEL__
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ef96175c0744..047b8a8e5c29 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -170,7 +170,7 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
170 va_list ap; 170 va_list ap;
171 171
172#ifdef DEBUG 172#ifdef DEBUG
173 xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT; 173 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
174#endif 174#endif
175 175
176 if (xfs_panic_mask && (xfs_panic_mask & panic_tag) 176 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3038dd52c72a..5215abc8023a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -54,9 +54,6 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
54STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 54STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
55STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 55STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
56STATIC void xlog_dealloc_log(xlog_t *log); 56STATIC void xlog_dealloc_log(xlog_t *log);
57STATIC int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
58 struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
59 xlog_in_core_t **commit_iclog, uint flags);
60 57
61/* local state machine functions */ 58/* local state machine functions */
62STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); 59STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
@@ -86,14 +83,6 @@ STATIC int xlog_regrant_write_log_space(xlog_t *log,
86STATIC void xlog_ungrant_log_space(xlog_t *log, 83STATIC void xlog_ungrant_log_space(xlog_t *log,
87 xlog_ticket_t *ticket); 84 xlog_ticket_t *ticket);
88 85
89
90/* local ticket functions */
91STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log,
92 int unit_bytes,
93 int count,
94 char clientid,
95 uint flags);
96
97#if defined(DEBUG) 86#if defined(DEBUG)
98STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); 87STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
99STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 88STATIC void xlog_verify_grant_head(xlog_t *log, int equals);
@@ -360,6 +349,15 @@ xfs_log_reserve(
360 ASSERT(flags & XFS_LOG_PERM_RESERV); 349 ASSERT(flags & XFS_LOG_PERM_RESERV);
361 internal_ticket = *ticket; 350 internal_ticket = *ticket;
362 351
352 /*
353 * this is a new transaction on the ticket, so we need to
354 * change the transaction ID so that the next transaction has a
355 * different TID in the log. Just add one to the existing tid
356 * so that we can see chains of rolling transactions in the log
357 * easily.
358 */
359 internal_ticket->t_tid++;
360
363 trace_xfs_log_reserve(log, internal_ticket); 361 trace_xfs_log_reserve(log, internal_ticket);
364 362
365 xlog_grant_push_ail(mp, internal_ticket->t_unit_res); 363 xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
@@ -367,7 +365,8 @@ xfs_log_reserve(
367 } else { 365 } else {
368 /* may sleep if need to allocate more tickets */ 366 /* may sleep if need to allocate more tickets */
369 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt, 367 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
370 client, flags); 368 client, flags,
369 KM_SLEEP|KM_MAYFAIL);
371 if (!internal_ticket) 370 if (!internal_ticket)
372 return XFS_ERROR(ENOMEM); 371 return XFS_ERROR(ENOMEM);
373 internal_ticket->t_trans_type = t_type; 372 internal_ticket->t_trans_type = t_type;
@@ -452,6 +451,13 @@ xfs_log_mount(
452 /* Normal transactions can now occur */ 451 /* Normal transactions can now occur */
453 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 452 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
454 453
454 /*
455 * Now the log has been fully initialised and we know were our
456 * space grant counters are, we can initialise the permanent ticket
457 * needed for delayed logging to work.
458 */
459 xlog_cil_init_post_recovery(mp->m_log);
460
455 return 0; 461 return 0;
456 462
457out_destroy_ail: 463out_destroy_ail:
@@ -658,6 +664,10 @@ xfs_log_item_init(
658 item->li_ailp = mp->m_ail; 664 item->li_ailp = mp->m_ail;
659 item->li_type = type; 665 item->li_type = type;
660 item->li_ops = ops; 666 item->li_ops = ops;
667 item->li_lv = NULL;
668
669 INIT_LIST_HEAD(&item->li_ail);
670 INIT_LIST_HEAD(&item->li_cil);
661} 671}
662 672
663/* 673/*
@@ -1168,6 +1178,9 @@ xlog_alloc_log(xfs_mount_t *mp,
1168 *iclogp = log->l_iclog; /* complete ring */ 1178 *iclogp = log->l_iclog; /* complete ring */
1169 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ 1179 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
1170 1180
1181 error = xlog_cil_init(log);
1182 if (error)
1183 goto out_free_iclog;
1171 return log; 1184 return log;
1172 1185
1173out_free_iclog: 1186out_free_iclog:
@@ -1494,6 +1507,8 @@ xlog_dealloc_log(xlog_t *log)
1494 xlog_in_core_t *iclog, *next_iclog; 1507 xlog_in_core_t *iclog, *next_iclog;
1495 int i; 1508 int i;
1496 1509
1510 xlog_cil_destroy(log);
1511
1497 iclog = log->l_iclog; 1512 iclog = log->l_iclog;
1498 for (i=0; i<log->l_iclog_bufs; i++) { 1513 for (i=0; i<log->l_iclog_bufs; i++) {
1499 sv_destroy(&iclog->ic_force_wait); 1514 sv_destroy(&iclog->ic_force_wait);
@@ -1536,8 +1551,10 @@ xlog_state_finish_copy(xlog_t *log,
1536 * print out info relating to regions written which consume 1551 * print out info relating to regions written which consume
1537 * the reservation 1552 * the reservation
1538 */ 1553 */
1539STATIC void 1554void
1540xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) 1555xlog_print_tic_res(
1556 struct xfs_mount *mp,
1557 struct xlog_ticket *ticket)
1541{ 1558{
1542 uint i; 1559 uint i;
1543 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); 1560 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
@@ -1637,6 +1654,10 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1637 "bad-rtype" : res_type_str[r_type-1]), 1654 "bad-rtype" : res_type_str[r_type-1]),
1638 ticket->t_res_arr[i].r_len); 1655 ticket->t_res_arr[i].r_len);
1639 } 1656 }
1657
1658 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
1659 "xfs_log_write: reservation ran out. Need to up reservation");
1660 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1640} 1661}
1641 1662
1642/* 1663/*
@@ -1865,7 +1886,7 @@ xlog_write_copy_finish(
1865 * we don't update ic_offset until the end when we know exactly how many 1886 * we don't update ic_offset until the end when we know exactly how many
1866 * bytes have been written out. 1887 * bytes have been written out.
1867 */ 1888 */
1868STATIC int 1889int
1869xlog_write( 1890xlog_write(
1870 struct log *log, 1891 struct log *log,
1871 struct xfs_log_vec *log_vector, 1892 struct xfs_log_vec *log_vector,
@@ -1889,22 +1910,26 @@ xlog_write(
1889 *start_lsn = 0; 1910 *start_lsn = 0;
1890 1911
1891 len = xlog_write_calc_vec_length(ticket, log_vector); 1912 len = xlog_write_calc_vec_length(ticket, log_vector);
1892 if (ticket->t_curr_res < len) { 1913 if (log->l_cilp) {
1893 xlog_print_tic_res(log->l_mp, ticket); 1914 /*
1894#ifdef DEBUG 1915 * Region headers and bytes are already accounted for.
1895 xlog_panic( 1916 * We only need to take into account start records and
1896 "xfs_log_write: reservation ran out. Need to up reservation"); 1917 * split regions in this function.
1897#else 1918 */
1898 /* Customer configurable panic */ 1919 if (ticket->t_flags & XLOG_TIC_INITED)
1899 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, log->l_mp, 1920 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1900 "xfs_log_write: reservation ran out. Need to up reservation");
1901 1921
1902 /* If we did not panic, shutdown the filesystem */ 1922 /*
1903 xfs_force_shutdown(log->l_mp, SHUTDOWN_CORRUPT_INCORE); 1923 * Commit record headers need to be accounted for. These
1904#endif 1924 * come in as separate writes so are easy to detect.
1905 } 1925 */
1926 if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
1927 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1928 } else
1929 ticket->t_curr_res -= len;
1906 1930
1907 ticket->t_curr_res -= len; 1931 if (ticket->t_curr_res < 0)
1932 xlog_print_tic_res(log->l_mp, ticket);
1908 1933
1909 index = 0; 1934 index = 0;
1910 lv = log_vector; 1935 lv = log_vector;
@@ -3000,6 +3025,8 @@ _xfs_log_force(
3000 3025
3001 XFS_STATS_INC(xs_log_force); 3026 XFS_STATS_INC(xs_log_force);
3002 3027
3028 xlog_cil_push(log, 1);
3029
3003 spin_lock(&log->l_icloglock); 3030 spin_lock(&log->l_icloglock);
3004 3031
3005 iclog = log->l_iclog; 3032 iclog = log->l_iclog;
@@ -3149,6 +3176,12 @@ _xfs_log_force_lsn(
3149 3176
3150 XFS_STATS_INC(xs_log_force); 3177 XFS_STATS_INC(xs_log_force);
3151 3178
3179 if (log->l_cilp) {
3180 lsn = xlog_cil_push_lsn(log, lsn);
3181 if (lsn == NULLCOMMITLSN)
3182 return 0;
3183 }
3184
3152try_again: 3185try_again:
3153 spin_lock(&log->l_icloglock); 3186 spin_lock(&log->l_icloglock);
3154 iclog = log->l_iclog; 3187 iclog = log->l_iclog;
@@ -3313,22 +3346,30 @@ xfs_log_ticket_get(
3313 return ticket; 3346 return ticket;
3314} 3347}
3315 3348
3349xlog_tid_t
3350xfs_log_get_trans_ident(
3351 struct xfs_trans *tp)
3352{
3353 return tp->t_ticket->t_tid;
3354}
3355
3316/* 3356/*
3317 * Allocate and initialise a new log ticket. 3357 * Allocate and initialise a new log ticket.
3318 */ 3358 */
3319STATIC xlog_ticket_t * 3359xlog_ticket_t *
3320xlog_ticket_alloc( 3360xlog_ticket_alloc(
3321 struct log *log, 3361 struct log *log,
3322 int unit_bytes, 3362 int unit_bytes,
3323 int cnt, 3363 int cnt,
3324 char client, 3364 char client,
3325 uint xflags) 3365 uint xflags,
3366 int alloc_flags)
3326{ 3367{
3327 struct xlog_ticket *tic; 3368 struct xlog_ticket *tic;
3328 uint num_headers; 3369 uint num_headers;
3329 int iclog_space; 3370 int iclog_space;
3330 3371
3331 tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); 3372 tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
3332 if (!tic) 3373 if (!tic)
3333 return NULL; 3374 return NULL;
3334 3375
@@ -3647,6 +3688,11 @@ xlog_state_ioerror(
3647 * c. nothing new gets queued up after (a) and (b) are done. 3688 * c. nothing new gets queued up after (a) and (b) are done.
3648 * d. if !logerror, flush the iclogs to disk, then seal them off 3689 * d. if !logerror, flush the iclogs to disk, then seal them off
3649 * for business. 3690 * for business.
3691 *
3692 * Note: for delayed logging the !logerror case needs to flush the regions
3693 * held in memory out to the iclogs before flushing them to disk. This needs
3694 * to be done before the log is marked as shutdown, otherwise the flush to the
3695 * iclogs will fail.
3650 */ 3696 */
3651int 3697int
3652xfs_log_force_umount( 3698xfs_log_force_umount(
@@ -3680,6 +3726,16 @@ xfs_log_force_umount(
3680 return 1; 3726 return 1;
3681 } 3727 }
3682 retval = 0; 3728 retval = 0;
3729
3730 /*
3731 * Flush the in memory commit item list before marking the log as
3732 * being shut down. We need to do it in this order to ensure all the
3733 * completed transactions are flushed to disk with the xfs_log_force()
3734 * call below.
3735 */
3736 if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
3737 xlog_cil_push(log, 1);
3738
3683 /* 3739 /*
3684 * We must hold both the GRANT lock and the LOG lock, 3740 * We must hold both the GRANT lock and the LOG lock,
3685 * before we mark the filesystem SHUTDOWN and wake 3741 * before we mark the filesystem SHUTDOWN and wake
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 229d1f36ba9a..04c78e642cc8 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -19,7 +19,6 @@
19#define __XFS_LOG_H__ 19#define __XFS_LOG_H__
20 20
21/* get lsn fields */ 21/* get lsn fields */
22
23#define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) 22#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
24#define BLOCK_LSN(lsn) ((uint)(lsn)) 23#define BLOCK_LSN(lsn) ((uint)(lsn))
25 24
@@ -114,6 +113,9 @@ struct xfs_log_vec {
114 struct xfs_log_vec *lv_next; /* next lv in build list */ 113 struct xfs_log_vec *lv_next; /* next lv in build list */
115 int lv_niovecs; /* number of iovecs in lv */ 114 int lv_niovecs; /* number of iovecs in lv */
116 struct xfs_log_iovec *lv_iovecp; /* iovec array */ 115 struct xfs_log_iovec *lv_iovecp; /* iovec array */
116 struct xfs_log_item *lv_item; /* owner */
117 char *lv_buf; /* formatted buffer */
118 int lv_buf_len; /* size of formatted buffer */
117}; 119};
118 120
119/* 121/*
@@ -134,6 +136,7 @@ struct xlog_in_core;
134struct xlog_ticket; 136struct xlog_ticket;
135struct xfs_log_item; 137struct xfs_log_item;
136struct xfs_item_ops; 138struct xfs_item_ops;
139struct xfs_trans;
137 140
138void xfs_log_item_init(struct xfs_mount *mp, 141void xfs_log_item_init(struct xfs_mount *mp,
139 struct xfs_log_item *item, 142 struct xfs_log_item *item,
@@ -187,9 +190,16 @@ int xfs_log_need_covered(struct xfs_mount *mp);
187 190
188void xlog_iodone(struct xfs_buf *); 191void xlog_iodone(struct xfs_buf *);
189 192
190struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket); 193struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
191void xfs_log_ticket_put(struct xlog_ticket *ticket); 194void xfs_log_ticket_put(struct xlog_ticket *ticket);
192 195
196xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
197
198int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
199 struct xfs_log_vec *log_vector,
200 xfs_lsn_t *commit_lsn, int flags);
201bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
202
193#endif 203#endif
194 204
195 205
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
new file mode 100644
index 000000000000..bb17cc044bf3
--- /dev/null
+++ b/fs/xfs/xfs_log_cil.c
@@ -0,0 +1,725 @@
1/*
2 * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write the Free Software Foundation,
15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
26#include "xfs_log_priv.h"
27#include "xfs_sb.h"
28#include "xfs_ag.h"
29#include "xfs_dir2.h"
30#include "xfs_dmapi.h"
31#include "xfs_mount.h"
32#include "xfs_error.h"
33#include "xfs_alloc.h"
34
35/*
36 * Perform initial CIL structure initialisation. If the CIL is not
37 * enabled in this filesystem, ensure the log->l_cilp is null so
38 * we can check this conditional to determine if we are doing delayed
39 * logging or not.
40 */
41int
42xlog_cil_init(
43 struct log *log)
44{
45 struct xfs_cil *cil;
46 struct xfs_cil_ctx *ctx;
47
48 log->l_cilp = NULL;
49 if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG))
50 return 0;
51
52 cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
53 if (!cil)
54 return ENOMEM;
55
56 ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
57 if (!ctx) {
58 kmem_free(cil);
59 return ENOMEM;
60 }
61
62 INIT_LIST_HEAD(&cil->xc_cil);
63 INIT_LIST_HEAD(&cil->xc_committing);
64 spin_lock_init(&cil->xc_cil_lock);
65 init_rwsem(&cil->xc_ctx_lock);
66 sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait");
67
68 INIT_LIST_HEAD(&ctx->committing);
69 INIT_LIST_HEAD(&ctx->busy_extents);
70 ctx->sequence = 1;
71 ctx->cil = cil;
72 cil->xc_ctx = ctx;
73
74 cil->xc_log = log;
75 log->l_cilp = cil;
76 return 0;
77}
78
79void
80xlog_cil_destroy(
81 struct log *log)
82{
83 if (!log->l_cilp)
84 return;
85
86 if (log->l_cilp->xc_ctx) {
87 if (log->l_cilp->xc_ctx->ticket)
88 xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
89 kmem_free(log->l_cilp->xc_ctx);
90 }
91
92 ASSERT(list_empty(&log->l_cilp->xc_cil));
93 kmem_free(log->l_cilp);
94}
95
96/*
97 * Allocate a new ticket. Failing to get a new ticket makes it really hard to
98 * recover, so we don't allow failure here. Also, we allocate in a context that
99 * we don't want to be issuing transactions from, so we need to tell the
100 * allocation code this as well.
101 *
102 * We don't reserve any space for the ticket - we are going to steal whatever
103 * space we require from transactions as they commit. To ensure we reserve all
104 * the space required, we need to set the current reservation of the ticket to
105 * zero so that we know to steal the initial transaction overhead from the
106 * first transaction commit.
107 */
108static struct xlog_ticket *
109xlog_cil_ticket_alloc(
110 struct log *log)
111{
112 struct xlog_ticket *tic;
113
114 tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
115 KM_SLEEP|KM_NOFS);
116 tic->t_trans_type = XFS_TRANS_CHECKPOINT;
117
118 /*
119 * set the current reservation to zero so we know to steal the basic
120 * transaction overhead reservation from the first transaction commit.
121 */
122 tic->t_curr_res = 0;
123 return tic;
124}
125
126/*
127 * After the first stage of log recovery is done, we know where the head and
128 * tail of the log are. We need this log initialisation done before we can
129 * initialise the first CIL checkpoint context.
130 *
131 * Here we allocate a log ticket to track space usage during a CIL push. This
132 * ticket is passed to xlog_write() directly so that we don't slowly leak log
133 * space by failing to account for space used by log headers and additional
134 * region headers for split regions.
135 */
136void
137xlog_cil_init_post_recovery(
138 struct log *log)
139{
140 if (!log->l_cilp)
141 return;
142
143 log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
144 log->l_cilp->xc_ctx->sequence = 1;
145 log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
146 log->l_curr_block);
147}
148
149/*
150 * Insert the log item into the CIL and calculate the difference in space
151 * consumed by the item. Add the space to the checkpoint ticket and calculate
152 * if the change requires additional log metadata. If it does, take that space
153 * as well. Remove the amount of space we added to the checkpoint ticket from
154 * the current transaction ticket so that the accounting works out correctly.
155 *
156 * If this is the first time the item is being placed into the CIL in this
157 * context, pin it so it can't be written to disk until the CIL is flushed to
158 * the iclog and the iclog written to disk.
159 */
160static void
161xlog_cil_insert(
162 struct log *log,
163 struct xlog_ticket *ticket, /* ticket of the committing transaction */
164 struct xfs_log_item *item, /* lv->lv_item at the one call site in this file */
165 struct xfs_log_vec *lv) /* freshly formatted vector for the item */
166{
167 struct xfs_cil *cil = log->l_cilp;
168 struct xfs_log_vec *old = lv->lv_item->li_lv;
169 struct xfs_cil_ctx *ctx = cil->xc_ctx;
170 int len;
171 int diff_iovecs;
172 int iclog_space;
173
174 if (old) {
175 /* existing lv on log item (relog): space used is a delta, which may be negative */
176 ASSERT(!list_empty(&item->li_cil));
177 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
178
179 len = lv->lv_buf_len - old->lv_buf_len;
180 diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
181 kmem_free(old->lv_buf);
182 kmem_free(old);
183 } else {
184 /* new lv, must pin the log item */
185 ASSERT(!lv->lv_item->li_lv);
186 ASSERT(list_empty(&item->li_cil));
187
188 len = lv->lv_buf_len;
189 diff_iovecs = lv->lv_niovecs;
190 IOP_PIN(lv->lv_item);
191
192 }
193 len += diff_iovecs * sizeof(xlog_op_header_t); /* one op header per region */
194
195 /* attach new log vector to log item */
196 lv->lv_item->li_lv = lv;
197
198 spin_lock(&cil->xc_cil_lock);
199 list_move_tail(&item->li_cil, &cil->xc_cil); /* (re)queue at the tail of the CIL */
200 ctx->nvecs += diff_iovecs;
201
202 /*
203 * If this is the first time the item is being committed to the CIL,
204 * store the sequence number on the log item so we can tell
205 * in future commits whether this is the first checkpoint the item is
206 * being committed into.
207 */
208 if (!item->li_seq)
209 item->li_seq = ctx->sequence;
210
211 /*
212 * Now transfer enough transaction reservation to the context ticket
213 * for the checkpoint. The context ticket is special - the unit
214 * reservation has to grow as well as the current reservation as we
215 * steal from tickets so we can correctly determine the space used
216 * during the transaction commit.
217 */
218 if (ctx->ticket->t_curr_res == 0) {
219 /* first commit in checkpoint, steal the header reservation */
220 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
221 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
222 ticket->t_curr_res -= ctx->ticket->t_unit_res;
223 }
224
225 /* do we need space for more log record headers? (only when growth crosses an iclog-sized boundary) */
226 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
227 if (len > 0 && (ctx->space_used / iclog_space !=
228 (ctx->space_used + len) / iclog_space)) {
229 int hdrs;
230
231 hdrs = (len + iclog_space - 1) / iclog_space;
232 /* need to take into account split region headers, too */
233 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
234 ctx->ticket->t_unit_res += hdrs;
235 ctx->ticket->t_curr_res += hdrs;
236 ticket->t_curr_res -= hdrs;
237 ASSERT(ticket->t_curr_res >= len);
238 }
239 ticket->t_curr_res -= len; /* the transaction pays for what it added to the CIL */
240 ctx->space_used += len;
241
242 spin_unlock(&cil->xc_cil_lock);
243}
244
245/*
246 * Format log item into a flat buffers
247 *
248 * For delayed logging, we need to hold a formatted buffer containing all the
249 * changes on the log item. This enables us to relog the item in memory and
250 * write it out asynchronously without needing to relock the object that was
251 * modified at the time it gets written into the iclog.
252 *
253 * This function builds a vector for the changes in each log item in the
254 * transaction. It then works out the length of the buffer needed for each log
255 * item, allocates them and formats the vector for the item into the buffer.
256 * The buffer is then attached to the log item are then inserted into the
257 * Committed Item List for tracking until the next checkpoint is written out.
258 *
259 * We don't set up region headers during this process; we simply copy the
260 * regions into the flat buffer. We can do this because we still have to do a
261 * formatting step to write the regions into the iclog buffer. Writing the
262 * ophdrs during the iclog write means that we can support splitting large
263 * regions across iclog boundares without needing a change in the format of the
264 * item/region encapsulation.
265 *
266 * Hence what we need to do now is change the rewrite the vector array to point
267 * to the copied region inside the buffer we just allocated. This allows us to
268 * format the regions into the iclog as though they are being formatted
269 * directly out of the objects themselves.
270 */
271static void
272xlog_cil_format_items(
273 struct log *log,
274 struct xfs_log_vec *log_vector,
275 struct xlog_ticket *ticket,
276 xfs_lsn_t *start_lsn)
277{
278 struct xfs_log_vec *lv;
279
280 if (start_lsn)
281 *start_lsn = log->l_cilp->xc_ctx->sequence;
282
283 ASSERT(log_vector);
284 for (lv = log_vector; lv; lv = lv->lv_next) {
285 void *ptr;
286 int index;
287 int len = 0;
288
289 /* build the vector array and calculate it's length */
290 IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
291 for (index = 0; index < lv->lv_niovecs; index++)
292 len += lv->lv_iovecp[index].i_len;
293
294 lv->lv_buf_len = len;
295 lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
296 ptr = lv->lv_buf;
297
298 for (index = 0; index < lv->lv_niovecs; index++) {
299 struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
300
301 memcpy(ptr, vec->i_addr, vec->i_len);
302 vec->i_addr = ptr;
303 ptr += vec->i_len;
304 }
305 ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
306
307 xlog_cil_insert(log, ticket, lv->lv_item, lv);
308 }
309}
310
311static void
312xlog_cil_free_logvec(
313 struct xfs_log_vec *log_vector)
314{
315 struct xfs_log_vec *lv;
316
317 for (lv = log_vector; lv; ) {
318 struct xfs_log_vec *next = lv->lv_next;
319 kmem_free(lv->lv_buf);
320 kmem_free(lv);
321 lv = next;
322 }
323}
324
325/*
326 * Commit a transaction with the given vector to the Committed Item List.
327 *
328 * To do this, we need to format the item, pin it in memory if required and
329 * account for the space used by the transaction. Once we have done that we
330 * need to release the unused reservation for the transaction, attach the
331 * transaction to the checkpoint context so we carry the busy extents through
332 * to checkpoint completion, and then unlock all the items in the transaction.
333 *
334 * For more specific information about the order of operations in
335 * xfs_log_commit_cil() please refer to the comments in
336 * xfs_trans_commit_iclog().
337 *
338 * Called with the context lock already held in read mode to lock out
339 * background commit, returns without it held once background commits are
340 * allowed again.
341 */
342int
343xfs_log_commit_cil(
344 struct xfs_mount *mp,
345 struct xfs_trans *tp,
346 struct xfs_log_vec *log_vector,
347 xfs_lsn_t *commit_lsn,
348 int flags)
349{
350 struct log *log = mp->m_log;
351 int log_flags = 0;
352 int push = 0;
353
354 if (flags & XFS_TRANS_RELEASE_LOG_RES)
355 log_flags = XFS_LOG_REL_PERM_RESERV;
356
357 if (XLOG_FORCED_SHUTDOWN(log)) {
358 xlog_cil_free_logvec(log_vector);
359 return XFS_ERROR(EIO);
360 }
361
362 /* lock out background commit */
363 down_read(&log->l_cilp->xc_ctx_lock);
364 xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
365
366 /* check we didn't blow the reservation */
367 if (tp->t_ticket->t_curr_res < 0)
368 xlog_print_tic_res(log->l_mp, tp->t_ticket);
369
370 /* attach the transaction to the CIL if it has any busy extents */
371 if (!list_empty(&tp->t_busy)) {
372 spin_lock(&log->l_cilp->xc_cil_lock);
373 list_splice_init(&tp->t_busy,
374 &log->l_cilp->xc_ctx->busy_extents);
375 spin_unlock(&log->l_cilp->xc_cil_lock);
376 }
377
378 tp->t_commit_lsn = *commit_lsn;
379 xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
380 xfs_trans_unreserve_and_mod_sb(tp);
381
382 /* check for background commit before unlock */
383 if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
384 push = 1;
385 up_read(&log->l_cilp->xc_ctx_lock);
386
387 /*
388 * We need to push CIL every so often so we don't cache more than we
389 * can fit in the log. The limit really is that a checkpoint can't be
390 * more than half the log (the current checkpoint is not allowed to
391 * overwrite the previous checkpoint), but commit latency and memory
392 * usage limit this to a smaller size in most cases.
393 */
394 if (push)
395 xlog_cil_push(log, 0);
396 return 0;
397}
398
399/*
400 * Mark all items committed and clear busy extents. We free the log vector
401 * chains in a separate pass so that we unpin the log items as quickly as
402 * possible.
403 */
404static void
405xlog_cil_committed(
406 void *args,
407 int abort)
408{
409 struct xfs_cil_ctx *ctx = args;
410 struct xfs_log_vec *lv;
411 int abortflag = abort ? XFS_LI_ABORTED : 0;
412 struct xfs_busy_extent *busyp, *n;
413
414 /* unpin all the log items */
415 for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) {
416 xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
417 abortflag);
418 }
419
420 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
421 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
422
423 spin_lock(&ctx->cil->xc_cil_lock);
424 list_del(&ctx->committing);
425 spin_unlock(&ctx->cil->xc_cil_lock);
426
427 xlog_cil_free_logvec(ctx->lv_chain);
428 kmem_free(ctx);
429}
430
431/*
432 * Push the Committed Item List to the log. If the push_now flag is not set,
433 * then it is a background flush and so we can chose to ignore it.
434 */
435int
436xlog_cil_push(
437 struct log *log,
438 int push_now)
439{
440 struct xfs_cil *cil = log->l_cilp;
441 struct xfs_log_vec *lv;
442 struct xfs_cil_ctx *ctx;
443 struct xfs_cil_ctx *new_ctx;
444 struct xlog_in_core *commit_iclog;
445 struct xlog_ticket *tic;
446 int num_lv;
447 int num_iovecs;
448 int len;
449 int error = 0;
450 struct xfs_trans_header thdr;
451 struct xfs_log_iovec lhdr;
452 struct xfs_log_vec lvhdr = { NULL };
453 xfs_lsn_t commit_lsn;
454
455 if (!cil)
456 return 0;
457
458 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
459 new_ctx->ticket = xlog_cil_ticket_alloc(log);
460
461 /* lock out transaction commit, but don't block on background push */
462 if (!down_write_trylock(&cil->xc_ctx_lock)) {
463 if (!push_now)
464 goto out_free_ticket;
465 down_write(&cil->xc_ctx_lock);
466 }
467 ctx = cil->xc_ctx;
468
469 /* check if we've anything to push */
470 if (list_empty(&cil->xc_cil))
471 goto out_skip;
472
473 /* check for spurious background flush */
474 if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
475 goto out_skip;
476
477 /*
478 * pull all the log vectors off the items in the CIL, and
479 * remove the items from the CIL. We don't need the CIL lock
480 * here because it's only needed on the transaction commit
481 * side which is currently locked out by the flush lock.
482 */
483 lv = NULL;
484 num_lv = 0;
485 num_iovecs = 0;
486 len = 0;
487 while (!list_empty(&cil->xc_cil)) {
488 struct xfs_log_item *item;
489 int i;
490
491 item = list_first_entry(&cil->xc_cil,
492 struct xfs_log_item, li_cil);
493 list_del_init(&item->li_cil);
494 if (!ctx->lv_chain)
495 ctx->lv_chain = item->li_lv;
496 else
497 lv->lv_next = item->li_lv;
498 lv = item->li_lv;
499 item->li_lv = NULL;
500
501 num_lv++;
502 num_iovecs += lv->lv_niovecs;
503 for (i = 0; i < lv->lv_niovecs; i++)
504 len += lv->lv_iovecp[i].i_len;
505 }
506
507 /*
508 * initialise the new context and attach it to the CIL. Then attach
509 * the current context to the CIL committing lsit so it can be found
510 * during log forces to extract the commit lsn of the sequence that
511 * needs to be forced.
512 */
513 INIT_LIST_HEAD(&new_ctx->committing);
514 INIT_LIST_HEAD(&new_ctx->busy_extents);
515 new_ctx->sequence = ctx->sequence + 1;
516 new_ctx->cil = cil;
517 cil->xc_ctx = new_ctx;
518
519 /*
520 * The switch is now done, so we can drop the context lock and move out
521 * of a shared context. We can't just go straight to the commit record,
522 * though - we need to synchronise with previous and future commits so
523 * that the commit records are correctly ordered in the log to ensure
524 * that we process items during log IO completion in the correct order.
525 *
526 * For example, if we get an EFI in one checkpoint and the EFD in the
527 * next (e.g. due to log forces), we do not want the checkpoint with
528 * the EFD to be committed before the checkpoint with the EFI. Hence
529 * we must strictly order the commit records of the checkpoints so
530 * that: a) the checkpoint callbacks are attached to the iclogs in the
531 * correct order; and b) the checkpoints are replayed in correct order
532 * in log recovery.
533 *
534 * Hence we need to add this context to the committing context list so
535 * that higher sequences will wait for us to write out a commit record
536 * before they do.
537 */
538 spin_lock(&cil->xc_cil_lock);
539 list_add(&ctx->committing, &cil->xc_committing);
540 spin_unlock(&cil->xc_cil_lock);
541 up_write(&cil->xc_ctx_lock);
542
543 /*
544 * Build a checkpoint transaction header and write it to the log to
545 * begin the transaction. We need to account for the space used by the
546 * transaction header here as it is not accounted for in xlog_write().
547 *
548 * The LSN we need to pass to the log items on transaction commit is
549 * the LSN reported by the first log vector write. If we use the commit
550 * record lsn then we can move the tail beyond the grant write head.
551 */
552 tic = ctx->ticket;
553 thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
554 thdr.th_type = XFS_TRANS_CHECKPOINT;
555 thdr.th_tid = tic->t_tid;
556 thdr.th_num_items = num_iovecs;
557 lhdr.i_addr = (xfs_caddr_t)&thdr;
558 lhdr.i_len = sizeof(xfs_trans_header_t);
559 lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
560 tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
561
562 lvhdr.lv_niovecs = 1;
563 lvhdr.lv_iovecp = &lhdr;
564 lvhdr.lv_next = ctx->lv_chain;
565
566 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
567 if (error)
568 goto out_abort;
569
570 /*
571 * now that we've written the checkpoint into the log, strictly
572 * order the commit records so replay will get them in the right order.
573 */
574restart:
575 spin_lock(&cil->xc_cil_lock);
576 list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
577 /*
578 * Higher sequences will wait for this one so skip them.
579 * Don't wait for own own sequence, either.
580 */
581 if (new_ctx->sequence >= ctx->sequence)
582 continue;
583 if (!new_ctx->commit_lsn) {
584 /*
585 * It is still being pushed! Wait for the push to
586 * complete, then start again from the beginning.
587 */
588 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
589 goto restart;
590 }
591 }
592 spin_unlock(&cil->xc_cil_lock);
593
594 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
595 if (error || commit_lsn == -1)
596 goto out_abort;
597
598 /* attach all the transactions w/ busy extents to iclog */
599 ctx->log_cb.cb_func = xlog_cil_committed;
600 ctx->log_cb.cb_arg = ctx;
601 error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
602 if (error)
603 goto out_abort;
604
605 /*
606 * now the checkpoint commit is complete and we've attached the
607 * callbacks to the iclog we can assign the commit LSN to the context
608 * and wake up anyone who is waiting for the commit to complete.
609 */
610 spin_lock(&cil->xc_cil_lock);
611 ctx->commit_lsn = commit_lsn;
612 sv_broadcast(&cil->xc_commit_wait);
613 spin_unlock(&cil->xc_cil_lock);
614
615 /* release the hounds! */
616 return xfs_log_release_iclog(log->l_mp, commit_iclog);
617
618out_skip:
619 up_write(&cil->xc_ctx_lock);
620out_free_ticket:
621 xfs_log_ticket_put(new_ctx->ticket);
622 kmem_free(new_ctx);
623 return 0;
624
625out_abort:
626 xlog_cil_committed(ctx, XFS_LI_ABORTED);
627 return XFS_ERROR(EIO);
628}
629
630/*
631 * Conditionally push the CIL based on the sequence passed in.
632 *
633 * We only need to push if we haven't already pushed the sequence
634 * number given. Hence the only time we will trigger a push here is
635 * if the push sequence is the same as the current context.
636 *
637 * We return the commit lsn recorded for push_seq, or NULLCOMMITLSN if no
638 * context with that sequence is on the committing list, so callers can determine if an iclog flush is necessary following this call.
639 *
640 * XXX: Initially, just push the CIL unconditionally and return whatever
641 * commit lsn is there. It'll be empty, so this is broken for now. NOTE(review): the code below pushes conditionally, so this note looks stale - confirm.
642 */
643xfs_lsn_t
644xlog_cil_push_lsn(
645 struct log *log,
646 xfs_lsn_t push_seq)
647{
648 struct xfs_cil *cil = log->l_cilp; /* dereferenced below without a NULL check */
649 struct xfs_cil_ctx *ctx;
650 xfs_lsn_t commit_lsn = NULLCOMMITLSN;
651
652restart:
653 down_write(&cil->xc_ctx_lock);
654 ASSERT(push_seq <= cil->xc_ctx->sequence);
655
656 /* check to see if we need to force out the current context */
657 if (push_seq == cil->xc_ctx->sequence) {
658 up_write(&cil->xc_ctx_lock);
659 xlog_cil_push(log, 1);
660 goto restart;
661 }
662
663 /*
664 * See if we can find a previous sequence still committing.
665 * We can drop the flush lock as soon as we have the cil lock
666 * because we are now only comparing contexts protected by
667 * the cil lock.
668 *
669 * We need to wait for all previous sequence commits to complete
670 * before allowing the force of push_seq to go ahead. Hence block
671 * on commits for those as well.
672 */
673 spin_lock(&cil->xc_cil_lock);
674 up_write(&cil->xc_ctx_lock);
675 list_for_each_entry(ctx, &cil->xc_committing, committing) {
676 if (ctx->sequence > push_seq)
677 continue;
678 if (!ctx->commit_lsn) {
679 /*
680 * It is still being pushed! Wait for the push to
681 * complete, then start again from the beginning.
682 */
683 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
684 goto restart;
685 }
686 if (ctx->sequence != push_seq)
687 continue;
688 /* found it! */
689 commit_lsn = ctx->commit_lsn;
690 }
691 spin_unlock(&cil->xc_cil_lock);
692 return commit_lsn;
693}
694
695/*
696 * Check if the current log item was first committed in this sequence.
697 * We can't rely on just the log item being in the CIL, we have to check
698 * the recorded commit sequence number.
699 *
700 * Note: for this to be used in a non-racy manner, it has to be called with
701 * CIL flushing locked out. As a result, it should only be used during the
702 * transaction commit process when deciding what to format into the item.
703 */
704bool
705xfs_log_item_in_current_chkpt(
706 struct xfs_log_item *lip)
707{
708 struct xfs_cil_ctx *ctx;
709
710 if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG))
711 return false;
712 if (list_empty(&lip->li_cil))
713 return false;
714
715 ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
716
717 /*
718 * li_seq is written on the first commit of a log item to record the
719 * first checkpoint it is written to. Hence if it is different to the
720 * current sequence, we're in a new checkpoint.
721 */
722 if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
723 return false;
724 return true;
725}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 9cf695154451..8c072618965c 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -152,8 +152,6 @@ static inline uint xlog_get_client_id(__be32 i)
152#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ 152#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
153#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being 153#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
154 shutdown */ 154 shutdown */
155typedef __uint32_t xlog_tid_t;
156
157 155
158#ifdef __KERNEL__ 156#ifdef __KERNEL__
159/* 157/*
@@ -379,6 +377,99 @@ typedef struct xlog_in_core {
379} xlog_in_core_t; 377} xlog_in_core_t;
380 378
381/* 379/*
380 * The CIL context is used to aggregate per-transaction details as well as being
381 * passed to the iclog for checkpoint post-commit processing. After being
382 * passed to the iclog, another context needs to be allocated for tracking the
383 * next set of transactions to be aggregated into a checkpoint.
384 */
385struct xfs_cil;
386
387struct xfs_cil_ctx {
388 struct xfs_cil *cil; /* CIL this context belongs to */
389 xfs_lsn_t sequence; /* chkpt sequence #; the first context is 1 */
390 xfs_lsn_t start_lsn; /* first LSN of chkpt commit */
391 xfs_lsn_t commit_lsn; /* chkpt commit record lsn; waiters treat zero as "still being pushed" */
392 struct xlog_ticket *ticket; /* chkpt ticket, funded from committing transactions */
393 int nvecs; /* number of regions */
394 int space_used; /* aggregate size of regions, including their op headers */
395 struct list_head busy_extents; /* busy extents in chkpt */
396 struct xfs_log_vec *lv_chain; /* logvecs being pushed */
397 xfs_log_callback_t log_cb; /* completion callback hook. */
398 struct list_head committing; /* ctx committing list */
399};
401/*
402 * Committed Item List structure
403 *
404 * This structure is used to track log items that have been committed but not
405 * yet written into the log. It is used only when the delayed logging mount
406 * option is enabled.
407 *
408 * This structure tracks the list of committing checkpoint contexts so
409 * we can avoid the problem of having to hold out new transactions during a
410 * flush until we have the commit record LSN of the checkpoint. We can
411 * traverse the list of committing contexts in xlog_cil_push_lsn() to find a
412 * sequence match and extract the commit LSN directly from there. If the
413 * checkpoint is still in the process of committing, we can block waiting for
414 * the commit LSN to be determined as well. This should make synchronous
415 * operations almost as efficient as the old logging methods.
416 */
417struct xfs_cil {
418 struct log *xc_log; /* log this CIL belongs to */
419 struct list_head xc_cil; /* committed log items not yet pushed */
420 spinlock_t xc_cil_lock; /* held around xc_cil insertion, ctx accounting and xc_committing updates */
421 struct xfs_cil_ctx *xc_ctx; /* current checkpoint context */
422 struct rw_semaphore xc_ctx_lock; /* read: transaction commit; write: push switching contexts */
423 struct list_head xc_committing; /* contexts whose checkpoint is being committed */
424 sv_t xc_commit_wait; /* waiters for a context's commit_lsn */
425};
426
427/*
428 * The amount of log space we should allow the CIL to aggregate is difficult to size.
429 * Whatever we choose, we have to make sure we can get a reservation for the log space
430 * effectively, that it is large enough to capture sufficient relogging to
431 * reduce log buffer IO significantly, but it is not too large for the log or
432 * induces too much latency when writing out through the iclogs. We track both
433 * space consumed and the number of vectors in the checkpoint context, so we
434 * need to decide which to use for limiting.
435 *
436 * Every log buffer we write out during a push needs a header reserved, which
437 * is at least one sector and more for v2 logs. Hence we need a reservation of
438 * at least 512 bytes per 32k of log space just for the LR headers. That means
439 * 16KB of reservation per megabyte of delayed logging space we will consume,
440 * plus various headers. The number of headers will vary based on the num of
441 * io vectors, so limiting on a specific number of vectors is going to result
442 * in transactions of varying size. IOWs, it is more consistent to track and
443 * limit space consumed in the log rather than by the number of objects being
444 * logged in order to prevent checkpoint ticket overruns.
445 *
446 * Further, use of static reservations through the log grant mechanism is
447 * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
448 * grant) and a significant deadlock potential because regranting write space
449 * can block on log pushes. Hence if we have to regrant log space during a log
450 * push, we can deadlock.
451 *
452 * However, we can avoid this by use of a dynamic "reservation stealing"
453 * technique during transaction commit whereby unused reservation space in the
454 * transaction ticket is transferred to the CIL ctx commit ticket to cover the
455 * space needed by the checkpoint transaction. This means that we never need to
456 * specifically reserve space for the CIL checkpoint transaction, nor do we
457 * need to regrant space once the checkpoint completes. This also means the
458 * checkpoint transaction ticket is specific to the checkpoint context, rather
459 * than the CIL itself.
460 *
461 * With dynamic reservations, we can basically make up arbitrary limits for the
462 * checkpoint size so long as they don't violate any other size rules. Hence
463 * the initial maximum size for the checkpoint transaction will be set to a
464 * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit
465 * right now based on the latency of writing out a large amount of data through
466 * the circular iclog buffers.
467 */
468
/*
 * Checkpoint size limit for the CIL: a quarter of the log or 8MB, whichever
 * is smaller. The argument is parenthesised so that any pointer expression
 * (e.g. "p + 1" or "&logs[0]") can be passed safely.
 */
#define XLOG_CIL_SPACE_LIMIT(log)	\
	(min(((log)->l_logsize >> 2), (8 * 1024 * 1024)))
471
472/*
382 * The reservation head lsn is not made up of a cycle number and block number. 473 * The reservation head lsn is not made up of a cycle number and block number.
383 * Instead, it uses a cycle number and byte number. Logs don't expect to 474 * Instead, it uses a cycle number and byte number. Logs don't expect to
384 * overflow 31 bits worth of byte offset, so using a byte number will mean 475 * overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -388,6 +479,7 @@ typedef struct log {
388 /* The following fields don't need locking */ 479 /* The following fields don't need locking */
389 struct xfs_mount *l_mp; /* mount point */ 480 struct xfs_mount *l_mp; /* mount point */
390 struct xfs_ail *l_ailp; /* AIL log is working with */ 481 struct xfs_ail *l_ailp; /* AIL log is working with */
482 struct xfs_cil *l_cilp; /* CIL log is working with */
391 struct xfs_buf *l_xbuf; /* extra buffer for log 483 struct xfs_buf *l_xbuf; /* extra buffer for log
392 * wrapping */ 484 * wrapping */
393 struct xfs_buftarg *l_targ; /* buftarg of log */ 485 struct xfs_buftarg *l_targ; /* buftarg of log */
@@ -438,14 +530,17 @@ typedef struct log {
438 530
439#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 531#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
440 532
441
442/* common routines */ 533/* common routines */
443extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); 534extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
444extern int xlog_recover(xlog_t *log); 535extern int xlog_recover(xlog_t *log);
445extern int xlog_recover_finish(xlog_t *log); 536extern int xlog_recover_finish(xlog_t *log);
446extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 537extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
447 538
448extern kmem_zone_t *xfs_log_ticket_zone; 539extern kmem_zone_t *xfs_log_ticket_zone;
540struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
541 int count, char client, uint xflags,
542 int alloc_flags);
543
449 544
450static inline void 545static inline void
451xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) 546xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
@@ -455,6 +550,21 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
455 *off += bytes; 550 *off += bytes;
456} 551}
457 552
553void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
554int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
555 struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
556 xlog_in_core_t **commit_iclog, uint flags);
557
558/*
559 * Committed Item List interfaces
560 */
561int xlog_cil_init(struct log *log);
562void xlog_cil_init_post_recovery(struct log *log);
563void xlog_cil_destroy(struct log *log);
564
565int xlog_cil_push(struct log *log, int push_now);
566xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
567
458/* 568/*
459 * Unmount record type is used as a pseudo transaction type for the ticket. 569 * Unmount record type is used as a pseudo transaction type for the ticket.
460 * It's value must be outside the range of XFS_TRANS_* values. 570 * It's value must be outside the range of XFS_TRANS_* values.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 0de08e366315..14a69aec2c0b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1576,7 +1576,7 @@ xlog_recover_reorder_trans(
1576 1576
1577 switch (ITEM_TYPE(item)) { 1577 switch (ITEM_TYPE(item)) {
1578 case XFS_LI_BUF: 1578 case XFS_LI_BUF:
1579 if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) { 1579 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1580 trace_xfs_log_recover_item_reorder_head(log, 1580 trace_xfs_log_recover_item_reorder_head(log,
1581 trans, item, pass); 1581 trans, item, pass);
1582 list_move(&item->ri_list, &trans->r_itemq); 1582 list_move(&item->ri_list, &trans->r_itemq);
@@ -1638,7 +1638,7 @@ xlog_recover_do_buffer_pass1(
1638 /* 1638 /*
1639 * If this isn't a cancel buffer item, then just return. 1639 * If this isn't a cancel buffer item, then just return.
1640 */ 1640 */
1641 if (!(flags & XFS_BLI_CANCEL)) { 1641 if (!(flags & XFS_BLF_CANCEL)) {
1642 trace_xfs_log_recover_buf_not_cancel(log, buf_f); 1642 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1643 return; 1643 return;
1644 } 1644 }
@@ -1696,7 +1696,7 @@ xlog_recover_do_buffer_pass1(
1696 * Check to see whether the buffer being recovered has a corresponding 1696 * Check to see whether the buffer being recovered has a corresponding
1697 * entry in the buffer cancel record table. If it does then return 1 1697 * entry in the buffer cancel record table. If it does then return 1
1698 * so that it will be cancelled, otherwise return 0. If the buffer is 1698 * so that it will be cancelled, otherwise return 0. If the buffer is
1699 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement 1699 * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
1700 * the refcount on the entry in the table and remove it from the table 1700 * the refcount on the entry in the table and remove it from the table
1701 * if this is the last reference. 1701 * if this is the last reference.
1702 * 1702 *
@@ -1721,7 +1721,7 @@ xlog_check_buffer_cancelled(
1721 * There is nothing in the table built in pass one, 1721 * There is nothing in the table built in pass one,
1722 * so this buffer must not be cancelled. 1722 * so this buffer must not be cancelled.
1723 */ 1723 */
1724 ASSERT(!(flags & XFS_BLI_CANCEL)); 1724 ASSERT(!(flags & XFS_BLF_CANCEL));
1725 return 0; 1725 return 0;
1726 } 1726 }
1727 1727
@@ -1733,7 +1733,7 @@ xlog_check_buffer_cancelled(
1733 * There is no corresponding entry in the table built 1733 * There is no corresponding entry in the table built
1734 * in pass one, so this buffer has not been cancelled. 1734 * in pass one, so this buffer has not been cancelled.
1735 */ 1735 */
1736 ASSERT(!(flags & XFS_BLI_CANCEL)); 1736 ASSERT(!(flags & XFS_BLF_CANCEL));
1737 return 0; 1737 return 0;
1738 } 1738 }
1739 1739
@@ -1752,7 +1752,7 @@ xlog_check_buffer_cancelled(
1752 * one in the table and remove it if this is the 1752 * one in the table and remove it if this is the
1753 * last reference. 1753 * last reference.
1754 */ 1754 */
1755 if (flags & XFS_BLI_CANCEL) { 1755 if (flags & XFS_BLF_CANCEL) {
1756 bcp->bc_refcount--; 1756 bcp->bc_refcount--;
1757 if (bcp->bc_refcount == 0) { 1757 if (bcp->bc_refcount == 0) {
1758 if (prevp == NULL) { 1758 if (prevp == NULL) {
@@ -1772,7 +1772,7 @@ xlog_check_buffer_cancelled(
1772 * We didn't find a corresponding entry in the table, so 1772 * We didn't find a corresponding entry in the table, so
1773 * return 0 so that the buffer is NOT cancelled. 1773 * return 0 so that the buffer is NOT cancelled.
1774 */ 1774 */
1775 ASSERT(!(flags & XFS_BLI_CANCEL)); 1775 ASSERT(!(flags & XFS_BLF_CANCEL));
1776 return 0; 1776 return 0;
1777} 1777}
1778 1778
@@ -1874,8 +1874,8 @@ xlog_recover_do_inode_buffer(
1874 nbits = xfs_contig_bits(data_map, map_size, 1874 nbits = xfs_contig_bits(data_map, map_size,
1875 bit); 1875 bit);
1876 ASSERT(nbits > 0); 1876 ASSERT(nbits > 0);
1877 reg_buf_offset = bit << XFS_BLI_SHIFT; 1877 reg_buf_offset = bit << XFS_BLF_SHIFT;
1878 reg_buf_bytes = nbits << XFS_BLI_SHIFT; 1878 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
1879 item_index++; 1879 item_index++;
1880 } 1880 }
1881 1881
@@ -1889,7 +1889,7 @@ xlog_recover_do_inode_buffer(
1889 } 1889 }
1890 1890
1891 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1891 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1892 ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); 1892 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
1893 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); 1893 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
1894 1894
1895 /* 1895 /*
@@ -1955,9 +1955,9 @@ xlog_recover_do_reg_buffer(
1955 nbits = xfs_contig_bits(data_map, map_size, bit); 1955 nbits = xfs_contig_bits(data_map, map_size, bit);
1956 ASSERT(nbits > 0); 1956 ASSERT(nbits > 0);
1957 ASSERT(item->ri_buf[i].i_addr != NULL); 1957 ASSERT(item->ri_buf[i].i_addr != NULL);
1958 ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); 1958 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
1959 ASSERT(XFS_BUF_COUNT(bp) >= 1959 ASSERT(XFS_BUF_COUNT(bp) >=
1960 ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); 1960 ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT));
1961 1961
1962 /* 1962 /*
1963 * Do a sanity check if this is a dquot buffer. Just checking 1963 * Do a sanity check if this is a dquot buffer. Just checking
@@ -1966,7 +1966,7 @@ xlog_recover_do_reg_buffer(
1966 */ 1966 */
1967 error = 0; 1967 error = 0;
1968 if (buf_f->blf_flags & 1968 if (buf_f->blf_flags &
1969 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 1969 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1970 if (item->ri_buf[i].i_addr == NULL) { 1970 if (item->ri_buf[i].i_addr == NULL) {
1971 cmn_err(CE_ALERT, 1971 cmn_err(CE_ALERT,
1972 "XFS: NULL dquot in %s.", __func__); 1972 "XFS: NULL dquot in %s.", __func__);
@@ -1987,9 +1987,9 @@ xlog_recover_do_reg_buffer(
1987 } 1987 }
1988 1988
1989 memcpy(xfs_buf_offset(bp, 1989 memcpy(xfs_buf_offset(bp,
1990 (uint)bit << XFS_BLI_SHIFT), /* dest */ 1990 (uint)bit << XFS_BLF_SHIFT), /* dest */
1991 item->ri_buf[i].i_addr, /* source */ 1991 item->ri_buf[i].i_addr, /* source */
1992 nbits<<XFS_BLI_SHIFT); /* length */ 1992 nbits<<XFS_BLF_SHIFT); /* length */
1993 next: 1993 next:
1994 i++; 1994 i++;
1995 bit += nbits; 1995 bit += nbits;
@@ -2148,11 +2148,11 @@ xlog_recover_do_dquot_buffer(
2148 } 2148 }
2149 2149
2150 type = 0; 2150 type = 0;
2151 if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) 2151 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2152 type |= XFS_DQ_USER; 2152 type |= XFS_DQ_USER;
2153 if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) 2153 if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2154 type |= XFS_DQ_PROJ; 2154 type |= XFS_DQ_PROJ;
2155 if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) 2155 if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2156 type |= XFS_DQ_GROUP; 2156 type |= XFS_DQ_GROUP;
2157 /* 2157 /*
2158 * This type of quotas was turned off, so ignore this buffer 2158 * This type of quotas was turned off, so ignore this buffer
@@ -2173,7 +2173,7 @@ xlog_recover_do_dquot_buffer(
2173 * here which overlaps that may be stale. 2173 * here which overlaps that may be stale.
2174 * 2174 *
2175 * When meta-data buffers are freed at run time we log a buffer item 2175 * When meta-data buffers are freed at run time we log a buffer item
2176 * with the XFS_BLI_CANCEL bit set to indicate that previous copies 2176 * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2177 * of the buffer in the log should not be replayed at recovery time. 2177 * of the buffer in the log should not be replayed at recovery time.
2178 * This is so that if the blocks covered by the buffer are reused for 2178 * This is so that if the blocks covered by the buffer are reused for
2179 * file data before we crash we don't end up replaying old, freed 2179 * file data before we crash we don't end up replaying old, freed
@@ -2207,7 +2207,7 @@ xlog_recover_do_buffer_trans(
2207 if (pass == XLOG_RECOVER_PASS1) { 2207 if (pass == XLOG_RECOVER_PASS1) {
2208 /* 2208 /*
2209 * In this pass we're only looking for buf items 2209 * In this pass we're only looking for buf items
2210 * with the XFS_BLI_CANCEL bit set. 2210 * with the XFS_BLF_CANCEL bit set.
2211 */ 2211 */
2212 xlog_recover_do_buffer_pass1(log, buf_f); 2212 xlog_recover_do_buffer_pass1(log, buf_f);
2213 return 0; 2213 return 0;
@@ -2244,7 +2244,7 @@ xlog_recover_do_buffer_trans(
2244 2244
2245 mp = log->l_mp; 2245 mp = log->l_mp;
2246 buf_flags = XBF_LOCK; 2246 buf_flags = XBF_LOCK;
2247 if (!(flags & XFS_BLI_INODE_BUF)) 2247 if (!(flags & XFS_BLF_INODE_BUF))
2248 buf_flags |= XBF_MAPPED; 2248 buf_flags |= XBF_MAPPED;
2249 2249
2250 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2250 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
@@ -2257,10 +2257,10 @@ xlog_recover_do_buffer_trans(
2257 } 2257 }
2258 2258
2259 error = 0; 2259 error = 0;
2260 if (flags & XFS_BLI_INODE_BUF) { 2260 if (flags & XFS_BLF_INODE_BUF) {
2261 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2261 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2262 } else if (flags & 2262 } else if (flags &
2263 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 2263 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2264 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2264 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2265 } else { 2265 } else {
2266 xlog_recover_do_reg_buffer(mp, item, bp, buf_f); 2266 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
index 75d749207258..1c55ccbb379d 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/xfs_log_recover.h
@@ -28,7 +28,7 @@
28#define XLOG_RHASH(tid) \ 28#define XLOG_RHASH(tid) \
29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) 29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
30 30
31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1) 31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
32 32
33 33
34/* 34/*
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 9ff48a16a7ee..1d2c7eed4eda 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -268,6 +268,7 @@ typedef struct xfs_mount {
268#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops 268#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
269 must be synchronous except 269 must be synchronous except
270 for space allocations */ 270 for space allocations */
271#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */
271#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ 272#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */
272#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) 273#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
273#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 274#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index be578ecb4af2..ce558efa2ea0 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -44,6 +44,7 @@
44#include "xfs_trans_priv.h" 44#include "xfs_trans_priv.h"
45#include "xfs_trans_space.h" 45#include "xfs_trans_space.h"
46#include "xfs_inode_item.h" 46#include "xfs_inode_item.h"
47#include "xfs_trace.h"
47 48
48kmem_zone_t *xfs_trans_zone; 49kmem_zone_t *xfs_trans_zone;
49 50
@@ -243,9 +244,8 @@ _xfs_trans_alloc(
243 tp->t_type = type; 244 tp->t_type = type;
244 tp->t_mountp = mp; 245 tp->t_mountp = mp;
245 tp->t_items_free = XFS_LIC_NUM_SLOTS; 246 tp->t_items_free = XFS_LIC_NUM_SLOTS;
246 tp->t_busy_free = XFS_LBC_NUM_SLOTS;
247 xfs_lic_init(&(tp->t_items)); 247 xfs_lic_init(&(tp->t_items));
248 XFS_LBC_INIT(&(tp->t_busy)); 248 INIT_LIST_HEAD(&tp->t_busy);
249 return tp; 249 return tp;
250} 250}
251 251
@@ -255,8 +255,13 @@ _xfs_trans_alloc(
255 */ 255 */
256STATIC void 256STATIC void
257xfs_trans_free( 257xfs_trans_free(
258 xfs_trans_t *tp) 258 struct xfs_trans *tp)
259{ 259{
260 struct xfs_busy_extent *busyp, *n;
261
262 list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
263 xfs_alloc_busy_clear(tp->t_mountp, busyp);
264
260 atomic_dec(&tp->t_mountp->m_active_trans); 265 atomic_dec(&tp->t_mountp->m_active_trans);
261 xfs_trans_free_dqinfo(tp); 266 xfs_trans_free_dqinfo(tp);
262 kmem_zone_free(xfs_trans_zone, tp); 267 kmem_zone_free(xfs_trans_zone, tp);
@@ -285,9 +290,8 @@ xfs_trans_dup(
285 ntp->t_type = tp->t_type; 290 ntp->t_type = tp->t_type;
286 ntp->t_mountp = tp->t_mountp; 291 ntp->t_mountp = tp->t_mountp;
287 ntp->t_items_free = XFS_LIC_NUM_SLOTS; 292 ntp->t_items_free = XFS_LIC_NUM_SLOTS;
288 ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
289 xfs_lic_init(&(ntp->t_items)); 293 xfs_lic_init(&(ntp->t_items));
290 XFS_LBC_INIT(&(ntp->t_busy)); 294 INIT_LIST_HEAD(&ntp->t_busy);
291 295
292 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 296 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
293 ASSERT(tp->t_ticket != NULL); 297 ASSERT(tp->t_ticket != NULL);
@@ -423,7 +427,6 @@ undo_blocks:
423 return error; 427 return error;
424} 428}
425 429
426
427/* 430/*
428 * Record the indicated change to the given field for application 431 * Record the indicated change to the given field for application
429 * to the file system's superblock when the transaction commits. 432 * to the file system's superblock when the transaction commits.
@@ -652,7 +655,7 @@ xfs_trans_apply_sb_deltas(
652 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we 655 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
653 * still need to update the incore superblock with the changes. 656 * still need to update the incore superblock with the changes.
654 */ 657 */
655STATIC void 658void
656xfs_trans_unreserve_and_mod_sb( 659xfs_trans_unreserve_and_mod_sb(
657 xfs_trans_t *tp) 660 xfs_trans_t *tp)
658{ 661{
@@ -880,7 +883,7 @@ xfs_trans_fill_vecs(
880 * they could be immediately flushed and we'd have to race with the flusher 883 * they could be immediately flushed and we'd have to race with the flusher
881 * trying to pull the item from the AIL as we add it. 884 * trying to pull the item from the AIL as we add it.
882 */ 885 */
883static void 886void
884xfs_trans_item_committed( 887xfs_trans_item_committed(
885 struct xfs_log_item *lip, 888 struct xfs_log_item *lip,
886 xfs_lsn_t commit_lsn, 889 xfs_lsn_t commit_lsn,
@@ -930,26 +933,6 @@ xfs_trans_item_committed(
930 IOP_UNPIN(lip); 933 IOP_UNPIN(lip);
931} 934}
932 935
933/* Clear all the per-AG busy list items listed in this transaction */
934static void
935xfs_trans_clear_busy_extents(
936 struct xfs_trans *tp)
937{
938 xfs_log_busy_chunk_t *lbcp;
939 xfs_log_busy_slot_t *lbsp;
940 int i;
941
942 for (lbcp = &tp->t_busy; lbcp != NULL; lbcp = lbcp->lbc_next) {
943 i = 0;
944 for (lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
945 if (XFS_LBC_ISFREE(lbcp, i))
946 continue;
947 xfs_alloc_clear_busy(tp, lbsp->lbc_ag, lbsp->lbc_idx);
948 }
949 }
950 xfs_trans_free_busy(tp);
951}
952
953/* 936/*
954 * This is typically called by the LM when a transaction has been fully 937 * This is typically called by the LM when a transaction has been fully
955 * committed to disk. It needs to unpin the items which have 938 * committed to disk. It needs to unpin the items which have
@@ -984,7 +967,6 @@ xfs_trans_committed(
984 kmem_free(licp); 967 kmem_free(licp);
985 } 968 }
986 969
987 xfs_trans_clear_busy_extents(tp);
988 xfs_trans_free(tp); 970 xfs_trans_free(tp);
989} 971}
990 972
@@ -1012,8 +994,7 @@ xfs_trans_uncommit(
1012 xfs_trans_unreserve_and_mod_sb(tp); 994 xfs_trans_unreserve_and_mod_sb(tp);
1013 xfs_trans_unreserve_and_mod_dquots(tp); 995 xfs_trans_unreserve_and_mod_dquots(tp);
1014 996
1015 xfs_trans_free_items(tp, flags); 997 xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
1016 xfs_trans_free_busy(tp);
1017 xfs_trans_free(tp); 998 xfs_trans_free(tp);
1018} 999}
1019 1000
@@ -1075,6 +1056,8 @@ xfs_trans_commit_iclog(
1075 *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); 1056 *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
1076 1057
1077 tp->t_commit_lsn = *commit_lsn; 1058 tp->t_commit_lsn = *commit_lsn;
1059 trace_xfs_trans_commit_lsn(tp);
1060
1078 if (nvec > XFS_TRANS_LOGVEC_COUNT) 1061 if (nvec > XFS_TRANS_LOGVEC_COUNT)
1079 kmem_free(log_vector); 1062 kmem_free(log_vector);
1080 1063
@@ -1161,6 +1144,93 @@ xfs_trans_commit_iclog(
1161 return xfs_log_release_iclog(mp, commit_iclog); 1144 return xfs_log_release_iclog(mp, commit_iclog);
1162} 1145}
1163 1146
1147/*
1148 * Walk the log items and allocate log vector structures for
1149 * each item large enough to fit all the vectors they require.
1150 * Note that this format differs from the old log vector format in
1151 * that there is no transaction header in these log vectors.
1152 */
1153STATIC struct xfs_log_vec *
1154xfs_trans_alloc_log_vecs(
1155 xfs_trans_t *tp)
1156{
1157 xfs_log_item_desc_t *lidp;
1158 struct xfs_log_vec *lv = NULL;
1159 struct xfs_log_vec *ret_lv = NULL;
1160
1161 lidp = xfs_trans_first_item(tp);
1162
1163 /* Bail out if we didn't find a log item. */
1164 if (!lidp) {
1165 ASSERT(0);
1166 return NULL;
1167 }
1168
1169 while (lidp != NULL) {
1170 struct xfs_log_vec *new_lv;
1171
1172 /* Skip items which aren't dirty in this transaction. */
1173 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
1174 lidp = xfs_trans_next_item(tp, lidp);
1175 continue;
1176 }
1177
1178 /* Skip items that do not have any vectors for writing */
1179 lidp->lid_size = IOP_SIZE(lidp->lid_item);
1180 if (!lidp->lid_size) {
1181 lidp = xfs_trans_next_item(tp, lidp);
1182 continue;
1183 }
1184
1185 new_lv = kmem_zalloc(sizeof(*new_lv) +
1186 lidp->lid_size * sizeof(struct xfs_log_iovec),
1187 KM_SLEEP);
1188
1189 /* The allocated iovec region lies beyond the log vector. */
1190 new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
1191 new_lv->lv_niovecs = lidp->lid_size;
1192 new_lv->lv_item = lidp->lid_item;
1193 if (!ret_lv)
1194 ret_lv = new_lv;
1195 else
1196 lv->lv_next = new_lv;
1197 lv = new_lv;
1198 lidp = xfs_trans_next_item(tp, lidp);
1199 }
1200
1201 return ret_lv;
1202}
1203
1204static int
1205xfs_trans_commit_cil(
1206 struct xfs_mount *mp,
1207 struct xfs_trans *tp,
1208 xfs_lsn_t *commit_lsn,
1209 int flags)
1210{
1211 struct xfs_log_vec *log_vector;
1212 int error;
1213
1214 /*
1215 * Get each log item to allocate a vector structure for
1216 * the log item to to pass to the log write code. The
1217 * CIL commit code will format the vector and save it away.
1218 */
1219 log_vector = xfs_trans_alloc_log_vecs(tp);
1220 if (!log_vector)
1221 return ENOMEM;
1222
1223 error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
1224 if (error)
1225 return error;
1226
1227 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1228
1229 /* xfs_trans_free_items() unlocks them first */
1230 xfs_trans_free_items(tp, *commit_lsn, 0);
1231 xfs_trans_free(tp);
1232 return 0;
1233}
1164 1234
1165/* 1235/*
1166 * xfs_trans_commit 1236 * xfs_trans_commit
@@ -1221,7 +1291,11 @@ _xfs_trans_commit(
1221 xfs_trans_apply_sb_deltas(tp); 1291 xfs_trans_apply_sb_deltas(tp);
1222 xfs_trans_apply_dquot_deltas(tp); 1292 xfs_trans_apply_dquot_deltas(tp);
1223 1293
1224 error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags); 1294 if (mp->m_flags & XFS_MOUNT_DELAYLOG)
1295 error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags);
1296 else
1297 error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
1298
1225 if (error == ENOMEM) { 1299 if (error == ENOMEM) {
1226 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 1300 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
1227 error = XFS_ERROR(EIO); 1301 error = XFS_ERROR(EIO);
@@ -1259,8 +1333,7 @@ out_unreserve:
1259 error = XFS_ERROR(EIO); 1333 error = XFS_ERROR(EIO);
1260 } 1334 }
1261 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1335 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1262 xfs_trans_free_items(tp, error ? XFS_TRANS_ABORT : 0); 1336 xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
1263 xfs_trans_free_busy(tp);
1264 xfs_trans_free(tp); 1337 xfs_trans_free(tp);
1265 1338
1266 XFS_STATS_INC(xs_trans_empty); 1339 XFS_STATS_INC(xs_trans_empty);
@@ -1338,8 +1411,7 @@ xfs_trans_cancel(
1338 /* mark this thread as no longer being in a transaction */ 1411 /* mark this thread as no longer being in a transaction */
1339 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1412 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1340 1413
1341 xfs_trans_free_items(tp, flags); 1414 xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
1342 xfs_trans_free_busy(tp);
1343 xfs_trans_free(tp); 1415 xfs_trans_free(tp);
1344} 1416}
1345 1417
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c62beee0921e..8c69e7824f68 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -106,7 +106,8 @@ typedef struct xfs_trans_header {
106#define XFS_TRANS_GROWFSRT_FREE 39 106#define XFS_TRANS_GROWFSRT_FREE 39
107#define XFS_TRANS_SWAPEXT 40 107#define XFS_TRANS_SWAPEXT 40
108#define XFS_TRANS_SB_COUNT 41 108#define XFS_TRANS_SB_COUNT 41
109#define XFS_TRANS_TYPE_MAX 41 109#define XFS_TRANS_CHECKPOINT 42
110#define XFS_TRANS_TYPE_MAX 42
110/* new transaction types need to be reflected in xfs_logprint(8) */ 111/* new transaction types need to be reflected in xfs_logprint(8) */
111 112
112#define XFS_TRANS_TYPES \ 113#define XFS_TRANS_TYPES \
@@ -148,6 +149,7 @@ typedef struct xfs_trans_header {
148 { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ 149 { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \
149 { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ 150 { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \
150 { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ 151 { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \
152 { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
151 { XFS_TRANS_DUMMY1, "DUMMY1" }, \ 153 { XFS_TRANS_DUMMY1, "DUMMY1" }, \
152 { XFS_TRANS_DUMMY2, "DUMMY2" }, \ 154 { XFS_TRANS_DUMMY2, "DUMMY2" }, \
153 { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } 155 { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" }
@@ -813,6 +815,7 @@ struct xfs_log_item_desc;
813struct xfs_mount; 815struct xfs_mount;
814struct xfs_trans; 816struct xfs_trans;
815struct xfs_dquot_acct; 817struct xfs_dquot_acct;
818struct xfs_busy_extent;
816 819
817typedef struct xfs_log_item { 820typedef struct xfs_log_item {
818 struct list_head li_ail; /* AIL pointers */ 821 struct list_head li_ail; /* AIL pointers */
@@ -828,6 +831,11 @@ typedef struct xfs_log_item {
828 /* buffer item iodone */ 831 /* buffer item iodone */
829 /* callback func */ 832 /* callback func */
830 struct xfs_item_ops *li_ops; /* function list */ 833 struct xfs_item_ops *li_ops; /* function list */
834
835 /* delayed logging */
836 struct list_head li_cil; /* CIL pointers */
837 struct xfs_log_vec *li_lv; /* active log vector */
838 xfs_lsn_t li_seq; /* CIL commit seq */
831} xfs_log_item_t; 839} xfs_log_item_t;
832 840
833#define XFS_LI_IN_AIL 0x1 841#define XFS_LI_IN_AIL 0x1
@@ -872,34 +880,6 @@ typedef struct xfs_item_ops {
872#define XFS_ITEM_PUSHBUF 3 880#define XFS_ITEM_PUSHBUF 3
873 881
874/* 882/*
875 * This structure is used to maintain a list of block ranges that have been
876 * freed in the transaction. The ranges are listed in the perag[] busy list
877 * between when they're freed and the transaction is committed to disk.
878 */
879
880typedef struct xfs_log_busy_slot {
881 xfs_agnumber_t lbc_ag;
882 ushort lbc_idx; /* index in perag.busy[] */
883} xfs_log_busy_slot_t;
884
885#define XFS_LBC_NUM_SLOTS 31
886typedef struct xfs_log_busy_chunk {
887 struct xfs_log_busy_chunk *lbc_next;
888 uint lbc_free; /* free slots bitmask */
889 ushort lbc_unused; /* first unused */
890 xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
891} xfs_log_busy_chunk_t;
892
893#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
894#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
895
896#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
897#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
898#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
899#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
900#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
901
902/*
903 * This is the type of function which can be given to xfs_trans_callback() 883 * This is the type of function which can be given to xfs_trans_callback()
904 * to be called upon the transaction's commit to disk. 884 * to be called upon the transaction's commit to disk.
905 */ 885 */
@@ -950,8 +930,7 @@ typedef struct xfs_trans {
950 unsigned int t_items_free; /* log item descs free */ 930 unsigned int t_items_free; /* log item descs free */
951 xfs_log_item_chunk_t t_items; /* first log item desc chunk */ 931 xfs_log_item_chunk_t t_items; /* first log item desc chunk */
952 xfs_trans_header_t t_header; /* header for in-log trans */ 932 xfs_trans_header_t t_header; /* header for in-log trans */
953 unsigned int t_busy_free; /* busy descs free */ 933 struct list_head t_busy; /* list of busy extents */
954 xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */
955 unsigned long t_pflags; /* saved process flags state */ 934 unsigned long t_pflags; /* saved process flags state */
956} xfs_trans_t; 935} xfs_trans_t;
957 936
@@ -1025,9 +1004,6 @@ int _xfs_trans_commit(xfs_trans_t *,
1025void xfs_trans_cancel(xfs_trans_t *, int); 1004void xfs_trans_cancel(xfs_trans_t *, int);
1026int xfs_trans_ail_init(struct xfs_mount *); 1005int xfs_trans_ail_init(struct xfs_mount *);
1027void xfs_trans_ail_destroy(struct xfs_mount *); 1006void xfs_trans_ail_destroy(struct xfs_mount *);
1028xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
1029 xfs_agnumber_t ag,
1030 xfs_extlen_t idx);
1031 1007
1032extern kmem_zone_t *xfs_trans_zone; 1008extern kmem_zone_t *xfs_trans_zone;
1033 1009
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 9cd809025f3a..63d81a22f4fd 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -114,7 +114,7 @@ _xfs_trans_bjoin(
114 xfs_buf_item_init(bp, tp->t_mountp); 114 xfs_buf_item_init(bp, tp->t_mountp);
115 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 115 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
116 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 116 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
117 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 117 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
118 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); 118 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
119 if (reset_recur) 119 if (reset_recur)
120 bip->bli_recur = 0; 120 bip->bli_recur = 0;
@@ -511,7 +511,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
511 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 511 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
512 ASSERT(bip->bli_item.li_type == XFS_LI_BUF); 512 ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
513 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 513 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
514 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 514 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
515 ASSERT(atomic_read(&bip->bli_refcount) > 0); 515 ASSERT(atomic_read(&bip->bli_refcount) > 0);
516 516
517 /* 517 /*
@@ -619,7 +619,7 @@ xfs_trans_bhold(xfs_trans_t *tp,
619 619
620 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 620 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
621 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 621 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
622 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 622 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
623 ASSERT(atomic_read(&bip->bli_refcount) > 0); 623 ASSERT(atomic_read(&bip->bli_refcount) > 0);
624 bip->bli_flags |= XFS_BLI_HOLD; 624 bip->bli_flags |= XFS_BLI_HOLD;
625 trace_xfs_trans_bhold(bip); 625 trace_xfs_trans_bhold(bip);
@@ -641,7 +641,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
641 641
642 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 642 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
643 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 643 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
644 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 644 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
645 ASSERT(atomic_read(&bip->bli_refcount) > 0); 645 ASSERT(atomic_read(&bip->bli_refcount) > 0);
646 ASSERT(bip->bli_flags & XFS_BLI_HOLD); 646 ASSERT(bip->bli_flags & XFS_BLI_HOLD);
647 bip->bli_flags &= ~XFS_BLI_HOLD; 647 bip->bli_flags &= ~XFS_BLI_HOLD;
@@ -704,7 +704,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
704 bip->bli_flags &= ~XFS_BLI_STALE; 704 bip->bli_flags &= ~XFS_BLI_STALE;
705 ASSERT(XFS_BUF_ISSTALE(bp)); 705 ASSERT(XFS_BUF_ISSTALE(bp));
706 XFS_BUF_UNSTALE(bp); 706 XFS_BUF_UNSTALE(bp);
707 bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL; 707 bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL;
708 } 708 }
709 709
710 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); 710 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
@@ -762,8 +762,8 @@ xfs_trans_binval(
762 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 762 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
763 ASSERT(XFS_BUF_ISSTALE(bp)); 763 ASSERT(XFS_BUF_ISSTALE(bp));
764 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); 764 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
765 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF)); 765 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
766 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 766 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
767 ASSERT(lidp->lid_flags & XFS_LID_DIRTY); 767 ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
768 ASSERT(tp->t_flags & XFS_TRANS_DIRTY); 768 ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
769 return; 769 return;
@@ -774,7 +774,7 @@ xfs_trans_binval(
774 * in the buf log item. The STALE flag will be used in 774 * in the buf log item. The STALE flag will be used in
775 * xfs_buf_item_unpin() to determine if it should clean up 775 * xfs_buf_item_unpin() to determine if it should clean up
776 * when the last reference to the buf item is given up. 776 * when the last reference to the buf item is given up.
777 * We set the XFS_BLI_CANCEL flag in the buf log format structure 777 * We set the XFS_BLF_CANCEL flag in the buf log format structure
778 * and log the buf item. This will be used at recovery time 778 * and log the buf item. This will be used at recovery time
779 * to determine that copies of the buffer in the log before 779 * to determine that copies of the buffer in the log before
780 * this should not be replayed. 780 * this should not be replayed.
@@ -792,9 +792,9 @@ xfs_trans_binval(
792 XFS_BUF_UNDELAYWRITE(bp); 792 XFS_BUF_UNDELAYWRITE(bp);
793 XFS_BUF_STALE(bp); 793 XFS_BUF_STALE(bp);
794 bip->bli_flags |= XFS_BLI_STALE; 794 bip->bli_flags |= XFS_BLI_STALE;
795 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY); 795 bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
796 bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF; 796 bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
797 bip->bli_format.blf_flags |= XFS_BLI_CANCEL; 797 bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
798 memset((char *)(bip->bli_format.blf_data_map), 0, 798 memset((char *)(bip->bli_format.blf_data_map), 0,
799 (bip->bli_format.blf_map_size * sizeof(uint))); 799 (bip->bli_format.blf_map_size * sizeof(uint)));
800 lidp->lid_flags |= XFS_LID_DIRTY; 800 lidp->lid_flags |= XFS_LID_DIRTY;
@@ -802,16 +802,16 @@ xfs_trans_binval(
802} 802}
803 803
804/* 804/*
805 * This call is used to indicate that the buffer contains on-disk 805 * This call is used to indicate that the buffer contains on-disk inodes which
806 * inodes which must be handled specially during recovery. They 806 * must be handled specially during recovery. They require special handling
807 * require special handling because only the di_next_unlinked from 807 * because only the di_next_unlinked from the inodes in the buffer should be
808 * the inodes in the buffer should be recovered. The rest of the 808 * recovered. The rest of the data in the buffer is logged via the inodes
809 * data in the buffer is logged via the inodes themselves. 809 * themselves.
810 * 810 *
811 * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log 811 * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be
812 * format structure so that we'll know what to do at recovery time. 812 * transferred to the buffer's log format structure so that we'll know what to
813 * do at recovery time.
813 */ 814 */
814/* ARGSUSED */
815void 815void
816xfs_trans_inode_buf( 816xfs_trans_inode_buf(
817 xfs_trans_t *tp, 817 xfs_trans_t *tp,
@@ -826,7 +826,7 @@ xfs_trans_inode_buf(
826 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 826 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
827 ASSERT(atomic_read(&bip->bli_refcount) > 0); 827 ASSERT(atomic_read(&bip->bli_refcount) > 0);
828 828
829 bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF; 829 bip->bli_flags |= XFS_BLI_INODE_BUF;
830} 830}
831 831
832/* 832/*
@@ -908,9 +908,9 @@ xfs_trans_dquot_buf(
908 ASSERT(XFS_BUF_ISBUSY(bp)); 908 ASSERT(XFS_BUF_ISBUSY(bp));
909 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); 909 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
910 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 910 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
911 ASSERT(type == XFS_BLI_UDQUOT_BUF || 911 ASSERT(type == XFS_BLF_UDQUOT_BUF ||
912 type == XFS_BLI_PDQUOT_BUF || 912 type == XFS_BLF_PDQUOT_BUF ||
913 type == XFS_BLI_GDQUOT_BUF); 913 type == XFS_BLF_GDQUOT_BUF);
914 914
915 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 915 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
916 ASSERT(atomic_read(&bip->bli_refcount) > 0); 916 ASSERT(atomic_read(&bip->bli_refcount) > 0);
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index eb3fc57f9eef..f11d37d06dcc 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -299,6 +299,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
299void 299void
300xfs_trans_free_items( 300xfs_trans_free_items(
301 xfs_trans_t *tp, 301 xfs_trans_t *tp,
302 xfs_lsn_t commit_lsn,
302 int flags) 303 int flags)
303{ 304{
304 xfs_log_item_chunk_t *licp; 305 xfs_log_item_chunk_t *licp;
@@ -311,7 +312,7 @@ xfs_trans_free_items(
311 * Special case the embedded chunk so we don't free it below. 312 * Special case the embedded chunk so we don't free it below.
312 */ 313 */
313 if (!xfs_lic_are_all_free(licp)) { 314 if (!xfs_lic_are_all_free(licp)) {
314 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); 315 (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
315 xfs_lic_all_free(licp); 316 xfs_lic_all_free(licp);
316 licp->lic_unused = 0; 317 licp->lic_unused = 0;
317 } 318 }
@@ -322,7 +323,7 @@ xfs_trans_free_items(
322 */ 323 */
323 while (licp != NULL) { 324 while (licp != NULL) {
324 ASSERT(!xfs_lic_are_all_free(licp)); 325 ASSERT(!xfs_lic_are_all_free(licp));
325 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); 326 (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
326 next_licp = licp->lic_next; 327 next_licp = licp->lic_next;
327 kmem_free(licp); 328 kmem_free(licp);
328 licp = next_licp; 329 licp = next_licp;
@@ -438,112 +439,3 @@ xfs_trans_unlock_chunk(
438 439
439 return freed; 440 return freed;
440} 441}
441
442
443/*
444 * This is called to add the given busy item to the transaction's
445 * list of busy items. It must find a free busy item descriptor
446 * or allocate a new one and add the item to that descriptor.
447 * The function returns a pointer to busy descriptor used to point
448 * to the new busy entry. The log busy entry will now point to its new
449 * descriptor with its ???? field.
450 */
451xfs_log_busy_slot_t *
452xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
453{
454 xfs_log_busy_chunk_t *lbcp;
455 xfs_log_busy_slot_t *lbsp;
456 int i=0;
457
458 /*
459 * If there are no free descriptors, allocate a new chunk
460 * of them and put it at the front of the chunk list.
461 */
462 if (tp->t_busy_free == 0) {
463 lbcp = (xfs_log_busy_chunk_t*)
464 kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP);
465 ASSERT(lbcp != NULL);
466 /*
467 * Initialize the chunk, and then
468 * claim the first slot in the newly allocated chunk.
469 */
470 XFS_LBC_INIT(lbcp);
471 XFS_LBC_CLAIM(lbcp, 0);
472 lbcp->lbc_unused = 1;
473 lbsp = XFS_LBC_SLOT(lbcp, 0);
474
475 /*
476 * Link in the new chunk and update the free count.
477 */
478 lbcp->lbc_next = tp->t_busy.lbc_next;
479 tp->t_busy.lbc_next = lbcp;
480 tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1;
481
482 /*
483 * Initialize the descriptor and the generic portion
484 * of the log item.
485 *
486 * Point the new slot at this item and return it.
487 * Also point the log item at its currently active
488 * descriptor and set the item's mount pointer.
489 */
490 lbsp->lbc_ag = ag;
491 lbsp->lbc_idx = idx;
492 return lbsp;
493 }
494
495 /*
496 * Find the free descriptor. It is somewhere in the chunklist
497 * of descriptors.
498 */
499 lbcp = &tp->t_busy;
500 while (lbcp != NULL) {
501 if (XFS_LBC_VACANCY(lbcp)) {
502 if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) {
503 i = lbcp->lbc_unused;
504 break;
505 } else {
506 /* out-of-order vacancy */
507 cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp);
508 ASSERT(0);
509 }
510 }
511 lbcp = lbcp->lbc_next;
512 }
513 ASSERT(lbcp != NULL);
514 /*
515 * If we find a free descriptor, claim it,
516 * initialize it, and return it.
517 */
518 XFS_LBC_CLAIM(lbcp, i);
519 if (lbcp->lbc_unused <= i) {
520 lbcp->lbc_unused = i + 1;
521 }
522 lbsp = XFS_LBC_SLOT(lbcp, i);
523 tp->t_busy_free--;
524 lbsp->lbc_ag = ag;
525 lbsp->lbc_idx = idx;
526 return lbsp;
527}
528
529
530/*
531 * xfs_trans_free_busy
532 * Free all of the busy lists from a transaction
533 */
534void
535xfs_trans_free_busy(xfs_trans_t *tp)
536{
537 xfs_log_busy_chunk_t *lbcp;
538 xfs_log_busy_chunk_t *lbcq;
539
540 lbcp = tp->t_busy.lbc_next;
541 while (lbcp != NULL) {
542 lbcq = lbcp->lbc_next;
543 kmem_free(lbcp);
544 lbcp = lbcq;
545 }
546
547 XFS_LBC_INIT(&tp->t_busy);
548 tp->t_busy.lbc_unused = 0;
549}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 73e2ad397432..c6e4f2c8de6e 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -35,13 +35,14 @@ struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *,
35struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *); 35struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *);
36struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *, 36struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *,
37 struct xfs_log_item_desc *); 37 struct xfs_log_item_desc *);
38void xfs_trans_free_items(struct xfs_trans *, int); 38
39void xfs_trans_unlock_items(struct xfs_trans *, 39void xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn);
40 xfs_lsn_t); 40void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
41void xfs_trans_free_busy(xfs_trans_t *tp); 41 int flags);
42xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, 42
43 xfs_agnumber_t ag, 43void xfs_trans_item_committed(struct xfs_log_item *lip,
44 xfs_extlen_t idx); 44 xfs_lsn_t commit_lsn, int aborted);
45void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
45 46
46/* 47/*
47 * AIL traversal cursor. 48 * AIL traversal cursor.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index b09904555d07..320775295e32 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -75,6 +75,8 @@ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
75 75
76typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */ 76typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */
77 77
78typedef __uint32_t xlog_tid_t; /* transaction ID type */
79
78/* 80/*
79 * These types are 64 bits on disk but are either 32 or 64 bits in memory. 81 * These types are 64 bits on disk but are either 32 or 64 bits in memory.
80 * Disk based types: 82 * Disk based types: